this repo has no description
at trunk 2610 lines 92 kB view raw
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) 2// unicodeobject.c implementation 3#include <cerrno> 4#include <cstdarg> 5#include <cstring> 6#include <cwchar> 7 8#include "cpython-data.h" 9#include "cpython-func.h" 10 11#include "api-handle.h" 12#include "bytearray-builtins.h" 13#include "bytes-builtins.h" 14#include "handles.h" 15#include "modules.h" 16#include "objects.h" 17#include "runtime.h" 18#include "str-builtins.h" 19#include "unicode.h" 20#include "utils.h" 21 22const char* Py_FileSystemDefaultEncoding = "utf-8"; 23int Py_HasFileSystemDefaultEncoding = 1; 24const char* Py_FileSystemDefaultEncodeErrors = "surrogatepass"; 25 26namespace py { 27 28typedef byte Py_UCS1; 29typedef uint16_t Py_UCS2; 30 31static const int kMaxLongLongChars = 19; // len(str(2**63-1)) 32static const int kOverallocateFactor = 4; 33 34PY_EXPORT PyTypeObject* PyUnicodeIter_Type_Ptr() { 35 Runtime* runtime = Thread::current()->runtime(); 36 return reinterpret_cast<PyTypeObject*>(ApiHandle::borrowedReference( 37 runtime, runtime->typeAt(LayoutId::kStrIterator))); 38} 39 40static RawObject symbolFromError(Thread* thread, const char* error) { 41 Runtime* runtime = thread->runtime(); 42 Symbols* symbols = runtime->symbols(); 43 if (error == nullptr || std::strcmp(error, "strict") == 0) { 44 return symbols->at(ID(strict)); 45 } 46 if (std::strcmp(error, "ignore") == 0) { 47 return symbols->at(ID(ignore)); 48 } 49 if (std::strcmp(error, "replace") == 0) { 50 return symbols->at(ID(replace)); 51 } 52 return Runtime::internStrFromCStr(thread, error); 53} 54 55PY_EXPORT void PyUnicode_WRITE_Func(enum PyUnicode_Kind kind, void* data, 56 Py_ssize_t index, Py_UCS4 value) { 57 if (kind == PyUnicode_1BYTE_KIND) { 58 static_cast<Py_UCS1*>(data)[index] = static_cast<Py_UCS1>(value); 59 } else if (kind == PyUnicode_2BYTE_KIND) { 60 static_cast<Py_UCS2*>(data)[index] = static_cast<Py_UCS2>(value); 61 } else { 62 DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND"); 63 static_cast<Py_UCS4*>(data)[index] = static_cast<Py_UCS4>(value); 64 } 65} 66 67PY_EXPORT void _PyUnicodeWriter_Dealloc(_PyUnicodeWriter* writer) { 68 PyMem_Free(writer->data); 69} 70 71PY_EXPORT PyObject* _PyUnicodeWriter_Finish(_PyUnicodeWriter* writer) { 72 Thread* thread = Thread::current(); 73 HandleScope scope(thread); 74 Runtime* runtime = thread->runtime(); 75 Str str(&scope, runtime->newStrFromUTF32(View<int32_t>( 76 static_cast<int32_t*>(writer->data), writer->pos))); 77 PyMem_Free(writer->data); 78 return ApiHandle::newReference(runtime, *str); 79} 80 81PY_EXPORT void _PyUnicodeWriter_Init(_PyUnicodeWriter* writer) { 82 std::memset(writer, 0, sizeof(*writer)); 83 writer->kind = PyUnicode_4BYTE_KIND; 84} 85 86static int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter* writer, 87 Py_ssize_t length, 88 Py_UCS4 /* maxchar */) { 89 writer->maxchar = kMaxUnicode; 90 if (length > kMaxWord - writer->pos) { 91 Thread::current()->raiseMemoryError(); 92 return -1; 93 } 94 Py_ssize_t newlen = writer->pos + length; 95 if (writer->data == nullptr) { 96 if (writer->overallocate && 97 newlen <= (kMaxWord - newlen / kOverallocateFactor)) { 98 // overallocate to limit the number of realloc() 99 newlen += newlen / kOverallocateFactor; 100 } 101 writer->data = PyMem_Malloc(newlen * sizeof(int32_t)); 102 if (writer->data == nullptr) return -1; 103 } else if (newlen > writer->size) { 104 if (writer->overallocate && 105 newlen <= (kMaxWord - newlen / kOverallocateFactor)) { 106 // overallocate to limit the number of realloc() 107 newlen += newlen / kOverallocateFactor; 108 } 109 writer->data = PyMem_Realloc(writer->data, newlen * sizeof(int32_t)); 110 if (writer->data == nullptr) return -1; 111 } 112 writer->size = newlen; 113 return 0; 114} 115 116PY_EXPORT int _PyUnicodeWriter_Prepare(_PyUnicodeWriter* writer, 117 Py_ssize_t length, Py_UCS4 maxchar) { 118 if (length <= writer->size - writer->pos || length == 0) return 0; 119 return _PyUnicodeWriter_PrepareInternal(writer, length, maxchar); 120} 121 122PY_EXPORT int _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter* writer, 123 const char* ascii, 124 Py_ssize_t len) { 125 if (len == -1) len = std::strlen(ascii); 126 if (writer->data == nullptr && !writer->overallocate) { 127 writer->data = PyMem_Malloc(len * sizeof(int32_t)); 128 writer->size = len; 129 } 130 131 if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1; 132 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data); 133 for (Py_ssize_t i = 0; i < len; ++i) { 134 CHECK(ascii[i] >= 0, "_PyUnicodeWriter_WriteASCIIString only takes ASCII"); 135 data[writer->pos++] = static_cast<uint8_t>(ascii[i]); 136 } 137 return 0; 138} 139 140PY_EXPORT int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter* writer, 141 Py_UCS4 ch) { 142 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) return -1; 143 PyUnicode_WRITE(PyUnicode_4BYTE_KIND, writer->data, writer->pos, ch); 144 writer->pos++; 145 return 0; 146} 147 148PY_EXPORT int _PyUnicodeWriter_WriteChar(_PyUnicodeWriter* writer, Py_UCS4 ch) { 149 return _PyUnicodeWriter_WriteCharInline(writer, ch); 150} 151 152PY_EXPORT int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter* writer, 153 const char* str, 154 Py_ssize_t len) { 155 if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1; 156 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data); 157 for (Py_ssize_t i = 0; i < len; ++i) { 158 data[writer->pos++] = static_cast<uint8_t>(str[i]); 159 } 160 return 0; 161} 162 163PY_EXPORT int _PyUnicodeWriter_WriteStr(_PyUnicodeWriter* writer, 164 PyObject* str) { 165 Thread* thread = Thread::current(); 166 HandleScope scope(thread); 167 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 168 Str src(&scope, strUnderlying(*obj)); 169 Py_ssize_t codepoints = src.codePointLength(); 170 if (_PyUnicodeWriter_Prepare(writer, codepoints, kMaxUnicode) == -1) { 171 return -1; 172 } 173 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data); 174 for (word i = 0, len = src.length(), cp_len; i < len; i += cp_len) { 175 int32_t cp = src.codePointAt(i, &cp_len); 176 data[writer->pos++] = cp; 177 } 178 return 0; 179} 180 181PY_EXPORT int _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter* writer, 182 PyObject* str, Py_ssize_t start, 183 Py_ssize_t end) { 184 if (end == 0) return 0; 185 Py_ssize_t len = end - start; 186 if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) < 0) return -1; 187 188 Thread* thread = Thread::current(); 189 HandleScope scope(thread); 190 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 191 Str src(&scope, strUnderlying(*obj)); 192 word start_index = thread->strOffset(src, start); 193 DCHECK_BOUND(start_index, src.length()); 194 word end_index = thread->strOffset(src, end); 195 DCHECK_BOUND(end_index, src.length()); 196 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data); 197 for (word i = start_index, cp_len; i < end_index; i += cp_len) { 198 int32_t cp = src.codePointAt(i, &cp_len); 199 data[writer->pos++] = cp; 200 } 201 return 0; 202} 203 204// Facebook: D13491655 205// Most of the following helper functions, along with PyUnicode_FromFormat and 206// PyUnicode_FromFormatV are directly imported from CPython. The following 207// modifications have been made: 208// 209// - Since our internal strings are always UTF-8, we don't need maxchar or any 210// of the helper functions required to calculate it 211// 212// - Since our strings are immutable, we can't use PyUnicode_Fill. However, 213// since the helper functions always use it to append to strings, we can get 214// away with just writing characters in a loop. 215// 216// - Since our internal strings are always UTF-8, there is no need to check 217// a character's 'Kind' before writing it to a string 218static int writeStr(_PyUnicodeWriter* writer, PyObject* str, Py_ssize_t width, 219 Py_ssize_t precision) { 220 if (PyUnicode_READY(str) == -1) return -1; 221 222 Py_ssize_t length = PyUnicode_GET_LENGTH(str); 223 if ((precision == -1 || precision >= length) && width <= length) { 224 return _PyUnicodeWriter_WriteStr(writer, str); 225 } 226 227 if (precision != -1) length = Py_MIN(precision, length); 228 229 Py_ssize_t arglen = Py_MAX(length, width); 230 // Facebook: Our internal strings are always UTF-8, don't need maxchar 231 // (D13491655) 232 if (_PyUnicodeWriter_Prepare(writer, arglen, 0) == -1) return -1; 233 234 if (width > length) { 235 Py_ssize_t fill = width - length; 236 // Facebook: Our internal strings are immutable, can't use PyUnicode_Fill 237 // (D13491655) 238 for (Py_ssize_t i = 0; i < fill; ++i) { 239 if (_PyUnicodeWriter_WriteCharInline(writer, ' ') == -1) return -1; 240 } 241 } 242 // Facebook: Since we only have one internal representation, we don't have 243 // to worry about changing a string's 'Kind' (D13491655) 244 return _PyUnicodeWriter_WriteSubstring(writer, str, 0, length); 245} 246 247static int writeCStr(_PyUnicodeWriter* writer, const char* str, 248 Py_ssize_t width, Py_ssize_t precision) { 249 Py_ssize_t length = std::strlen(str); 250 if (precision != -1) length = Py_MIN(length, precision); 251 PyObject* unicode = 252 PyUnicode_DecodeUTF8Stateful(str, length, "replace", nullptr); 253 if (unicode == nullptr) return -1; 254 255 int res = writeStr(writer, unicode, width, -1); 256 Py_DECREF(unicode); 257 return res; 258} 259 260static const char* writeArg(_PyUnicodeWriter* writer, const char* f, 261 va_list* vargs) { 262 const char* p = f; 263 f++; 264 int zeropad = 0; 265 if (*f == '0') { 266 zeropad = 1; 267 f++; 268 } 269 270 // parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 271 Py_ssize_t width = -1; 272 if (Py_ISDIGIT(static_cast<unsigned>(*f))) { 273 width = *f - '0'; 274 f++; 275 while (Py_ISDIGIT(static_cast<unsigned>(*f))) { 276 if (width > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) { 277 Thread::current()->raiseWithFmt(LayoutId::kValueError, "width too big"); 278 return nullptr; 279 } 280 width = (width * 10) + (*f - '0'); 281 f++; 282 } 283 } 284 Py_ssize_t precision = -1; 285 if (*f == '.') { 286 f++; 287 if (Py_ISDIGIT(static_cast<unsigned>(*f))) { 288 precision = (*f - '0'); 289 f++; 290 while (Py_ISDIGIT(static_cast<unsigned>(*f))) { 291 if (precision > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) { 292 Thread::current()->raiseWithFmt(LayoutId::kValueError, 293 "precision too big"); 294 return nullptr; 295 } 296 precision = (precision * 10) + (*f - '0'); 297 f++; 298 } 299 } 300 if (*f == '%') { 301 // "%.3%s" => f points to "3" 302 f--; 303 } 304 } 305 if (*f == '\0') { 306 // bogus format "%.123" => go backward, f points to "3" 307 f--; 308 } 309 310 // Handle %ld, %lu, %lld and %llu. 311 int longflag = 0; 312 int longlongflag = 0; 313 int size_tflag = 0; 314 if (*f == 'l') { 315 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 316 longflag = 1; 317 ++f; 318 } else if (f[1] == 'l' && (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 319 longlongflag = 1; 320 f += 2; 321 } 322 } 323 // handle the size_t flag. 324 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 325 size_tflag = 1; 326 ++f; 327 } 328 329 if (f[1] == '\0') writer->overallocate = 0; 330 331 switch (*f) { 332 case 'c': { 333 int ordinal = va_arg(*vargs, int); 334 if (ordinal < 0 || ordinal > kMaxUnicode) { 335 Thread::current()->raiseWithFmt( 336 LayoutId::kOverflowError, 337 "character argument not in range(0x110000)"); 338 return nullptr; 339 } 340 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) return nullptr; 341 break; 342 } 343 344 case 'i': 345 case 'd': 346 case 'u': 347 case 'x': { 348 // used by sprintf 349 char buffer[kMaxLongLongChars]; 350 Py_ssize_t len; 351 352 if (*f == 'u') { 353 if (longflag) { 354 len = std::sprintf(buffer, "%lu", va_arg(*vargs, unsigned long)); 355 } else if (longlongflag) { 356 len = 357 std::sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long)); 358 } else if (size_tflag) { 359 len = std::sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", 360 va_arg(*vargs, size_t)); 361 } else { 362 len = std::sprintf(buffer, "%u", va_arg(*vargs, unsigned int)); 363 } 364 } else if (*f == 'x') { 365 len = std::sprintf(buffer, "%x", va_arg(*vargs, int)); 366 } else { 367 if (longflag) { 368 len = std::sprintf(buffer, "%li", va_arg(*vargs, long)); 369 } else if (longlongflag) { 370 len = std::sprintf(buffer, "%lli", va_arg(*vargs, long long)); 371 } else if (size_tflag) { 372 len = std::sprintf(buffer, "%" PY_FORMAT_SIZE_T "i", 373 va_arg(*vargs, Py_ssize_t)); 374 } else { 375 len = std::sprintf(buffer, "%i", va_arg(*vargs, int)); 376 } 377 } 378 DCHECK(len >= 0, "len must be >= 0"); 379 380 if (precision < len) precision = len; 381 382 Py_ssize_t arglen = Py_MAX(precision, width); 383 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) return nullptr; 384 385 if (width > precision) { 386 Py_ssize_t fill = width - precision; 387 Py_UCS4 fillchar = zeropad ? '0' : ' '; 388 // Facebook: Our internal strings are immutable, can't use 389 // PyUnicode_Fill (D13491655) 390 for (Py_ssize_t i = 0; i < fill; ++i) { 391 if (_PyUnicodeWriter_WriteCharInline(writer, fillchar) == -1) { 392 return nullptr; 393 } 394 } 395 } 396 if (precision > len) { 397 Py_ssize_t fill = precision - len; 398 // Facebook: Our internal strings are immutable, can't use 399 // PyUnicode_Fill (D13491655) 400 for (Py_ssize_t i = 0; i < fill; ++i) { 401 if (_PyUnicodeWriter_WriteCharInline(writer, '0') == -1) { 402 return nullptr; 403 } 404 } 405 } 406 407 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) { 408 return nullptr; 409 } 410 break; 411 } 412 413 case 'p': { 414 char number[kMaxLongLongChars]; 415 416 Py_ssize_t len = std::sprintf(number, "%p", va_arg(*vargs, void*)); 417 DCHECK(len >= 0, "len must be >= 0"); 418 419 // %p is ill-defined: ensure leading 0x. 420 if (number[1] == 'X') { 421 number[1] = 'x'; 422 } else if (number[1] != 'x') { 423 std::memmove(number + 2, number, std::strlen(number) + 1); 424 number[0] = '0'; 425 number[1] = 'x'; 426 len += 2; 427 } 428 429 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) { 430 return nullptr; 431 } 432 break; 433 } 434 435 case 's': { 436 // UTF-8 437 const char* s = va_arg(*vargs, const char*); 438 if (writeCStr(writer, s, width, precision) < 0) { 439 return nullptr; 440 } 441 break; 442 } 443 444 case 'U': { 445 PyObject* obj = va_arg(*vargs, PyObject*); 446 // This used to call _PyUnicode_CHECK, which is deprecated, and which we 447 // have not imported. 448 DCHECK(obj, "obj must not be null"); 449 450 if (writeStr(writer, obj, width, precision) == -1) { 451 return nullptr; 452 } 453 break; 454 } 455 456 case 'V': { 457 PyObject* obj = va_arg(*vargs, PyObject*); 458 const char* str = va_arg(*vargs, const char*); 459 if (obj) { 460 // This used to DCHECK _PyUnicode_CHECK, which is deprecated, and which 461 // we have not imported. 462 if (writeStr(writer, obj, width, precision) == -1) { 463 return nullptr; 464 } 465 } else { 466 DCHECK(str != nullptr, "str must not be null"); 467 if (writeCStr(writer, str, width, precision) < 0) { 468 return nullptr; 469 } 470 } 471 break; 472 } 473 474 case 'S': { 475 PyObject* obj = va_arg(*vargs, PyObject*); 476 DCHECK(obj, "obj must not be null"); 477 PyObject* str = PyObject_Str(obj); 478 if (!str) return nullptr; 479 if (writeStr(writer, str, width, precision) == -1) { 480 Py_DECREF(str); 481 return nullptr; 482 } 483 Py_DECREF(str); 484 break; 485 } 486 487 case 'R': { 488 PyObject* obj = va_arg(*vargs, PyObject*); 489 DCHECK(obj, "obj must not be null"); 490 PyObject* repr = PyObject_Repr(obj); 491 if (!repr) return nullptr; 492 if (writeStr(writer, repr, width, precision) == -1) { 493 Py_DECREF(repr); 494 return nullptr; 495 } 496 Py_DECREF(repr); 497 break; 498 } 499 500 case 'A': { 501 PyObject* obj = va_arg(*vargs, PyObject*); 502 DCHECK(obj, "obj must not be null"); 503 PyObject* ascii = PyObject_ASCII(obj); 504 if (!ascii) return nullptr; 505 if (writeStr(writer, ascii, width, precision) == -1) { 506 Py_DECREF(ascii); 507 return nullptr; 508 } 509 Py_DECREF(ascii); 510 break; 511 } 512 513 case '%': 514 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) return nullptr; 515 break; 516 517 default: { 518 // if we stumble upon an unknown formatting code, copy the rest 519 // of the format string to the output string. (we cannot just 520 // skip the code, since there's no way to know what's in the 521 // argument list) 522 Py_ssize_t len = std::strlen(p); 523 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) { 524 return nullptr; 525 } 526 f = p + len; 527 return f; 528 } 529 } 530 531 f++; 532 return f; 533} 534 535PY_EXPORT int _PyUnicode_EqualToASCIIString(PyObject* unicode, 536 const char* c_str) { 537 DCHECK(unicode, "nullptr argument"); 538 DCHECK(c_str, "nullptr argument"); 539 RawObject obj = ApiHandle::asObject(ApiHandle::fromPyObject(unicode)); 540 DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj), 541 "non-str argument"); 542 return strUnderlying(obj).equalsCStr(c_str); 543} 544 545PY_EXPORT int _PyUnicode_EQ(PyObject* aa, PyObject* bb) { 546 Thread* thread = Thread::current(); 547 HandleScope scope(thread); 548 Object obj_aa(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(aa))); 549 Object obj_bb(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(bb))); 550 Str lhs(&scope, strUnderlying(*obj_aa)); 551 Str rhs(&scope, strUnderlying(*obj_bb)); 552 return lhs.equals(*rhs); 553} 554 555PY_EXPORT size_t Py_UNICODE_strlen(const Py_UNICODE* u) { 556 DCHECK(u != nullptr, "u should not be null"); 557 return std::wcslen(u); 558} 559 560PY_EXPORT int _PyUnicode_Ready(PyObject* /* unicode */) { return 0; } 561 562PY_EXPORT int PyUnicode_CheckExact_Func(PyObject* obj) { 563 return ApiHandle::asObject(ApiHandle::fromPyObject(obj)).isStr(); 564} 565 566PY_EXPORT int PyUnicode_Check_Func(PyObject* obj) { 567 return Thread::current()->runtime()->isInstanceOfStr( 568 ApiHandle::asObject(ApiHandle::fromPyObject(obj))); 569} 570 571PY_EXPORT PyObject* PyUnicode_FromString(const char* c_string) { 572 Runtime* runtime = Thread::current()->runtime(); 573 return ApiHandle::newReference(runtime, runtime->newStrFromCStr(c_string)); 574} 575 576// Look for a surrogate codepoint in str[start:]. Note that start is a byte 577// offset. Return the first index found in that range, or -1 if not found. 578static word strFindSurrogateCodepoint(const Str& str, word start) { 579 word length = str.length(); 580 word byte_index = start; 581 while (byte_index < length) { 582 word num_bytes; 583 int32_t codepoint = str.codePointAt(byte_index, &num_bytes); 584 if (Unicode::isSurrogate(codepoint)) { 585 return byte_index; 586 } 587 byte_index += num_bytes; 588 } 589 return -1; 590} 591 592PY_EXPORT const char* PyUnicode_AsUTF8AndSize(PyObject* pyunicode, 593 Py_ssize_t* size) { 594 Thread* thread = Thread::current(); 595 if (pyunicode == nullptr) { 596 thread->raiseBadArgument(); 597 return nullptr; 598 } 599 600 HandleScope scope(thread); 601 ApiHandle* handle = ApiHandle::fromPyObject(pyunicode); 602 Object obj(&scope, ApiHandle::asObject(handle)); 603 Runtime* runtime = thread->runtime(); 604 if (!runtime->isInstanceOfStr(*obj)) { 605 thread->raiseBadInternalCall(); 606 return nullptr; 607 } 608 609 Str str(&scope, strUnderlying(*obj)); 610 word length = str.length(); 611 if (size != nullptr) *size = length; 612 if (void* cache = ApiHandle::cache(runtime, handle)) { 613 return static_cast<char*>(cache); 614 } 615 616 word surr_index = strFindSurrogateCodepoint(str, 0); 617 if (surr_index != -1) { 618 Object encoding(&scope, SmallStr::fromCStr("utf-8")); 619 Object start(&scope, SmallInt::fromWord(surr_index)); 620 Object end(&scope, SmallInt::fromWord(surr_index + 1)); 621 Object reason(&scope, runtime->newStrFromCStr("surrogates not allowed")); 622 Object exc(&scope, 623 thread->invokeFunction5(ID(builtins), ID(UnicodeEncodeError), 624 encoding, str, start, end, reason)); 625 Object err(&scope, 626 thread->invokeFunction1(ID(_codecs), ID(strict_errors), exc)); 627 DCHECK(err.isErrorException(), 628 "_codecs.strict_errors should raise an exception"); 629 return nullptr; 630 } 631 632 byte* result = static_cast<byte*>(std::malloc(length + 1)); 633 str.copyTo(result, length); 634 result[length] = '\0'; 635 ApiHandle::setCache(runtime, handle, result); 636 ApiHandle::setBorrowedNoImmediate(handle); 637 return reinterpret_cast<char*>(result); 638} 639 640PY_EXPORT const char* PyUnicode_AsUTF8(PyObject* unicode) { 641 return PyUnicode_AsUTF8AndSize(unicode, nullptr); 642} 643 644PY_EXPORT PyObject* PyUnicode_FromStringAndSize(const char* u, 645 Py_ssize_t size) { 646 Thread* thread = Thread::current(); 647 648 if (size < 0) { 649 thread->raiseWithFmt(LayoutId::kSystemError, 650 "Negative size passed to PyUnicode_FromStringAndSize"); 651 return nullptr; 652 } 653 if (u == nullptr && size != 0) { 654 // TODO(T36562134): Implement _PyUnicode_New 655 UNIMPLEMENTED("_PyUnicode_New"); 656 } 657 const byte* data = reinterpret_cast<const byte*>(u); 658 Runtime* runtime = thread->runtime(); 659 return ApiHandle::newReference( 660 runtime, runtime->newStrWithAll(View<byte>(data, size))); 661} 662 663PY_EXPORT PyObject* PyUnicode_EncodeFSDefault(PyObject* unicode) { 664 // TODO(T40363016): Allow arbitrary encodings instead of defaulting to utf-8 665 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors); 666} 667 668PY_EXPORT PyObject* PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) { 669 Thread* thread = Thread::current(); 670 // Since CPython optimizes for empty string, we must do so as well to make 671 // sure we don't fail if maxchar is invalid 672 if (size == 0) { 673 return ApiHandle::newReference(thread->runtime(), Str::empty()); 674 } 675 if (maxchar > kMaxUnicode) { 676 thread->raiseWithFmt(LayoutId::kSystemError, 677 "invalid maximum character passed to PyUnicode_New"); 678 return nullptr; 679 } 680 if (size < 0) { 681 thread->raiseWithFmt(LayoutId::kSystemError, 682 "Negative size passed to PyUnicode_New"); 683 return nullptr; 684 } 685 // TODO(T41498010): Add modifiable string state 686 UNIMPLEMENTED("Cannot create mutable strings yet"); 687} 688 689PY_EXPORT void PyUnicode_Append(PyObject** p_left, PyObject* right) { 690 if (p_left == nullptr) { 691 if (!PyErr_Occurred()) { 692 PyErr_BadInternalCall(); 693 } 694 return; 695 } 696 697 PyObject* left = *p_left; 698 if (left == nullptr || right == nullptr || !PyUnicode_Check(left) || 699 !PyUnicode_Check(right)) { 700 if (!PyErr_Occurred()) { 701 PyErr_BadInternalCall(); 702 } 703 Py_CLEAR(*p_left); 704 return; 705 } 706 *p_left = PyUnicode_Concat(left, right); 707 Py_DECREF(left); 708} 709 710PY_EXPORT void PyUnicode_AppendAndDel(PyObject** p_left, PyObject* right) { 711 PyUnicode_Append(p_left, right); 712 Py_XDECREF(right); 713} 714 715PY_EXPORT PyObject* _PyUnicode_AsASCIIString(PyObject* unicode, 716 const char* errors) { 717 DCHECK(unicode != nullptr, "unicode cannot be null"); 718 Thread* thread = Thread::current(); 719 HandleScope scope(thread); 720 Runtime* runtime = thread->runtime(); 721 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode))); 722 if (!runtime->isInstanceOfStr(*str)) { 723 thread->raiseBadArgument(); 724 return nullptr; 725 } 726 Object errors_obj(&scope, symbolFromError(thread, errors)); 727 Object tuple_obj(&scope, thread->invokeFunction2( 728 ID(_codecs), ID(ascii_encode), str, errors_obj)); 729 if (tuple_obj.isError()) { 730 return nullptr; 731 } 732 Tuple tuple(&scope, *tuple_obj); 733 return ApiHandle::newReference(runtime, tuple.at(0)); 734} 735 736PY_EXPORT PyObject* PyUnicode_AsASCIIString(PyObject* unicode) { 737 return _PyUnicode_AsASCIIString(unicode, "strict"); 738} 739 740PY_EXPORT PyObject* PyUnicode_AsCharmapString(PyObject* /* e */, 741 PyObject* /* g */) { 742 UNIMPLEMENTED("PyUnicode_AsCharmapString"); 743} 744 745PY_EXPORT PyObject* PyUnicode_AsDecodedObject(PyObject* /* e */, 746 const char* /* g */, 747 const char* /* s */) { 748 UNIMPLEMENTED("PyUnicode_AsDecodedObject"); 749} 750 751PY_EXPORT PyObject* PyUnicode_AsDecodedUnicode(PyObject* /* e */, 752 const char* /* g */, 753 const char* /* s */) { 754 UNIMPLEMENTED("PyUnicode_AsDecodedUnicode"); 755} 756 757PY_EXPORT PyObject* PyUnicode_AsEncodedObject(PyObject* /* e */, 758 const char* /* g */, 759 const char* /* s */) { 760 UNIMPLEMENTED("PyUnicode_AsEncodedObject"); 761} 762 763PY_EXPORT PyObject* PyUnicode_AsEncodedString(PyObject* unicode, 764 const char* encoding, 765 const char* errors) { 766 DCHECK(unicode != nullptr, "unicode cannot be null"); 767 if (encoding == nullptr) { 768 return _PyUnicode_AsUTF8String(unicode, errors); 769 } 770 Thread* thread = Thread::current(); 771 HandleScope scope(thread); 772 Runtime* runtime = thread->runtime(); 773 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode))); 774 if (!runtime->isInstanceOfStr(*str)) { 775 thread->raiseBadArgument(); 776 return nullptr; 777 } 778 Object encoding_obj(&scope, runtime->newStrFromCStr(encoding)); 779 Object errors_obj(&scope, errors == nullptr 780 ? Unbound::object() 781 : symbolFromError(thread, errors)); 782 Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(encode), str, 783 encoding_obj, errors_obj)); 784 if (result.isError()) { 785 return nullptr; 786 } 787 if (runtime->isInstanceOfBytes(*result)) { 788 return ApiHandle::newReference(runtime, *result); 789 } 790 if (runtime->isInstanceOfBytearray(*result)) { 791 // Equivalent to calling PyErr_WarnFormat 792 if (!ensureBuiltinModuleById(thread, ID(warnings)).isErrorException()) { 793 Object category(&scope, runtime->typeAt(LayoutId::kRuntimeWarning)); 794 Object message(&scope, 795 runtime->newStrFromFmt( 796 "encoder %s returned bytearray instead of bytes; " 797 "use codecs.encode() to encode to arbitrary types", 798 encoding)); 799 Object stack_level(&scope, runtime->newInt(1)); 800 Object source(&scope, NoneType::object()); 801 Object err(&scope, 802 thread->invokeFunction4(ID(warnings), ID(warn), message, 803 category, stack_level, source)); 804 if (err.isErrorException()) { 805 thread->clearPendingException(); 806 } 807 } 808 Bytearray result_bytearray(&scope, *result); 809 return ApiHandle::newReference(runtime, 810 bytearrayAsBytes(thread, result_bytearray)); 811 } 812 thread->raiseWithFmt(LayoutId::kTypeError, 813 "'%s' encoder returned '%T' instead of 'bytes'; " 814 "use codecs.encode() to encode to arbitrary types", 815 encoding, *result); 816 return nullptr; 817} 818 819PY_EXPORT PyObject* PyUnicode_AsEncodedUnicode(PyObject* /* e */, 820 const char* /* g */, 821 const char* /* s */) { 822 UNIMPLEMENTED("PyUnicode_AsEncodedUnicode"); 823} 824 825PY_EXPORT PyObject* _PyUnicode_AsLatin1String(PyObject* unicode, 826 const char* errors) { 827 DCHECK(unicode != nullptr, "unicode cannot be null"); 828 Thread* thread = Thread::current(); 829 HandleScope scope(thread); 830 Runtime* runtime = thread->runtime(); 831 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode))); 832 if (!runtime->isInstanceOfStr(*str)) { 833 thread->raiseBadArgument(); 834 return nullptr; 835 } 836 Object errors_obj(&scope, symbolFromError(thread, errors)); 837 Object tuple_obj(&scope, 838 thread->invokeFunction2(ID(_codecs), ID(latin_1_encode), str, 839 errors_obj)); 840 if (tuple_obj.isError()) { 841 return nullptr; 842 } 843 Tuple tuple(&scope, *tuple_obj); 844 return ApiHandle::newReference(runtime, tuple.at(0)); 845} 846 847PY_EXPORT PyObject* PyUnicode_AsLatin1String(PyObject* unicode) { 848 return _PyUnicode_AsLatin1String(unicode, "strict"); 849} 850 851PY_EXPORT PyObject* PyUnicode_AsMBCSString(PyObject* /* e */) { 852 UNIMPLEMENTED("PyUnicode_AsMBCSString"); 853} 854 855PY_EXPORT PyObject* PyUnicode_AsRawUnicodeEscapeString(PyObject* /* e */) { 856 UNIMPLEMENTED("PyUnicode_AsRawUnicodeEscapeString"); 857} 858 859PY_EXPORT Py_UCS4* PyUnicode_AsUCS4(PyObject* u, Py_UCS4* buffer, 860 Py_ssize_t buflen, int copy_null) { 861 if (buffer == nullptr || buflen < 0) { 862 PyErr_BadInternalCall(); 863 return nullptr; 864 } 865 866 Thread* thread = Thread::current(); 867 HandleScope scope(thread); 868 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(u))); 869 if (!thread->runtime()->isInstanceOfStr(*obj)) { 870 thread->raiseBadArgument(); 871 } 872 873 Str str(&scope, strUnderlying(*obj)); 874 word num_codepoints = str.codePointLength(); 875 word target_buflen = copy_null ? num_codepoints + 1 : num_codepoints; 876 if (buflen < target_buflen) { 877 thread->raiseWithFmt(LayoutId::kSystemError, 878 "string is longer than the buffer"); 879 if (copy_null != 0 && 0 < buflen) { 880 buffer[0] = 0; 881 } 882 return nullptr; 883 } 884 885 for (word i = 0, offset = 0; i < num_codepoints; i++) { 886 word num_bytes; 887 buffer[i] = str.codePointAt(offset, &num_bytes); 888 offset += num_bytes; 889 } 890 if (copy_null != 0) buffer[num_codepoints] = 0; 891 892 return buffer; 893} 894 895PY_EXPORT Py_UCS4* PyUnicode_AsUCS4Copy(PyObject* str) { 896 Py_ssize_t len = PyUnicode_GET_LENGTH(str) + 1; 897 Py_UCS4* result = static_cast<Py_UCS4*>(PyMem_Malloc(len * sizeof(Py_UCS4))); 898 if (result == nullptr) { 899 PyErr_NoMemory(); 900 return nullptr; 901 } 902 return PyUnicode_AsUCS4(str, result, len, 1); 903} 904 905PY_EXPORT PyObject* PyUnicode_AsUTF16String(PyObject* unicode) { 906 return _PyUnicode_EncodeUTF16(unicode, nullptr, 0); 907} 908 909PY_EXPORT PyObject* PyUnicode_AsUTF32String(PyObject* unicode) { 910 return _PyUnicode_EncodeUTF32(unicode, nullptr, 0); 911} 912 913PY_EXPORT PyObject* PyUnicode_AsUTF8String(PyObject* unicode) { 914 return _PyUnicode_AsUTF8String(unicode, "strict"); 915} 916 917PY_EXPORT PyObject* PyUnicode_AsUnicodeEscapeString(PyObject* /* e */) { 918 UNIMPLEMENTED("PyUnicode_AsUnicodeEscapeString"); 919} 920 921PY_EXPORT Py_ssize_t PyUnicode_AsWideChar(PyObject* str, wchar_t* result, 922 Py_ssize_t size) { 923 Thread* thread = Thread::current(); 924 if (str == nullptr) { 925 thread->raiseBadInternalCall(); 926 return -1; 927 } 928 HandleScope scope(thread); 929 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 930 Runtime* runtime = thread->runtime(); 931 if (!runtime->isInstanceOfStr(*str_obj)) { 932 thread->raiseWithFmt( 933 LayoutId::kTypeError, 934 "PyUnicode_AsWideChar requires 'str' object but received a '%T'", 935 &str_obj); 936 return -1; 937 } 938 Str str_str(&scope, strUnderlying(*str_obj)); 939 Py_ssize_t num_code_points = str_str.codePointLength(); 940 if (size > num_code_points) { 941 size = num_code_points + 1; 942 } else { 943 num_code_points = size; 944 } 945 946 { 947 word byte_count = str_str.length(); 948 for (word byte_index = 0, wchar_index = 0, num_bytes = 0; 949 byte_index < byte_count && wchar_index < size; 950 byte_index += num_bytes, wchar_index += 1) { 951 int32_t cp = str_str.codePointAt(byte_index, &num_bytes); 952 static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t"); 953 if (result != nullptr) { 954 result[wchar_index] = static_cast<wchar_t>(cp); 955 } 956 } 957 if (num_code_points < size) { 958 result[num_code_points] = '\0'; 959 } 960 } 961 962 return num_code_points; 963} 964 965PY_EXPORT wchar_t* PyUnicode_AsWideCharString(PyObject* str, 966 Py_ssize_t* result_len) { 967 Thread* thread = Thread::current(); 968 if (str == nullptr) { 969 thread->raiseBadInternalCall(); 970 return nullptr; 971 } 972 HandleScope scope(thread); 973 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 974 Runtime* runtime = thread->runtime(); 975 if (!runtime->isInstanceOfStr(*str_obj)) { 976 thread->raiseWithFmt( 977 LayoutId::kTypeError, 978 "PyUnicode_AsWideChar requires 'str' object but received a '%T'", 979 &str_obj); 980 return nullptr; 981 } 982 Str str_str(&scope, strUnderlying(*str_obj)); 983 word length = str_str.codePointLength(); 984 wchar_t* result = 985 static_cast<wchar_t*>(PyMem_Malloc((length + 1) * sizeof(wchar_t))); 986 if (result == nullptr) { 987 thread->raiseMemoryError(); 988 return nullptr; 989 } 990 991 { 992 word byte_count = str_str.length(); 993 for (word byte_index = 0, wchar_index = 0, num_bytes = 0; 994 byte_index < byte_count && wchar_index < length + 1; 995 byte_index += num_bytes, wchar_index += 1) { 996 int32_t cp = str_str.codePointAt(byte_index, &num_bytes); 997 if (cp == '\0') { 998 PyMem_Free(result); 999 thread->raiseWithFmt(LayoutId::kValueError, "embedded null character"); 1000 return nullptr; 1001 } 1002 static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t"); 1003 result[wchar_index] = static_cast<wchar_t>(cp); 1004 } 1005 result[length] = '\0'; 1006 } 1007 1008 if (result_len != nullptr) { 1009 *result_len = length; 1010 } 1011 return result; 1012} 1013 1014PY_EXPORT PyObject* PyUnicode_BuildEncodingMap(PyObject* /* g */) { 1015 UNIMPLEMENTED("PyUnicode_BuildEncodingMap"); 1016} 1017 1018PY_EXPORT int PyUnicode_Compare(PyObject* left, PyObject* right) { 1019 Thread* thread = Thread::current(); 1020 if (left == nullptr || right == nullptr) { 1021 thread->raiseBadInternalCall(); 1022 return -1; 1023 } 1024 1025 Runtime* runtime = thread->runtime(); 1026 HandleScope scope(thread); 1027 Object left_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(left))); 1028 Object right_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(right))); 1029 if (runtime->isInstanceOfStr(*left_obj) && 1030 runtime->isInstanceOfStr(*right_obj)) { 1031 Str left_str(&scope, strUnderlying(*left_obj)); 1032 Str right_str(&scope, strUnderlying(*right_obj)); 1033 word result = left_str.compare(*right_str); 1034 return result > 0 ? 1 : (result < 0 ? -1 : 0); 1035 } 1036 thread->raiseWithFmt(LayoutId::kTypeError, "Can't compare %T and %T", 1037 &left_obj, &right_obj); 1038 return -1; 1039} 1040 1041PY_EXPORT int PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) { 1042 Thread* thread = Thread::current(); 1043 HandleScope scope(thread); 1044 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(uni))); 1045 Str str_obj(&scope, strUnderlying(*obj)); 1046 // TODO(atalaba): Allow for proper comparison against Latin-1 strings. For 1047 // example, in CPython: "\xC3\xA9" (UTF-8) == "\xE9" (Latin-1), and 1048 // "\xE9 longer" > "\xC3\xA9". 1049 return str_obj.compareCStr(str); 1050} 1051 1052PY_EXPORT PyObject* PyUnicode_Concat(PyObject* left, PyObject* right) { 1053 Thread* thread = Thread::current(); 1054 HandleScope scope(thread); 1055 Runtime* runtime = thread->runtime(); 1056 1057 Object left_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(left))); 1058 Object right_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(right))); 1059 if (!runtime->isInstanceOfStr(*left_obj) || 1060 !runtime->isInstanceOfStr(*right_obj)) { 1061 thread->raiseWithFmt(LayoutId::kTypeError, 1062 "can only concatenate str to str"); 1063 return nullptr; 1064 } 1065 Str left_str(&scope, strUnderlying(*left_obj)); 1066 Str right_str(&scope, strUnderlying(*right_obj)); 1067 word dummy; 1068 if (__builtin_add_overflow(left_str.length(), right_str.length(), &dummy)) { 1069 thread->raiseWithFmt(LayoutId::kOverflowError, 1070 "strings are too large to concat"); 1071 return nullptr; 1072 } 1073 return ApiHandle::newReference( 1074 runtime, runtime->strConcat(thread, left_str, right_str)); 1075} 1076 1077PY_EXPORT int PyUnicode_Contains(PyObject* str, PyObject* substr) { 1078 DCHECK(str != nullptr, "str should not be null"); 1079 DCHECK(substr != nullptr, "substr should not be null"); 1080 Thread* thread = Thread::current(); 1081 HandleScope scope(thread); 1082 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1083 Object substr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr))); 1084 Object result(&scope, 1085 thread->invokeMethodStatic2(LayoutId::kStr, ID(__contains__), 1086 str_obj, substr_obj)); 1087 if (result.isError()) { 1088 if (result.isErrorNotFound()) { 1089 thread->raiseWithFmt(LayoutId::kTypeError, 1090 "could not call str.__contains__"); 1091 } 1092 return -1; 1093 } 1094 DCHECK(result.isBool(), "result of __contains__ should be bool"); 1095 return Bool::cast(*result).value(); 1096} 1097 1098PY_EXPORT Py_ssize_t PyUnicode_CopyCharacters(PyObject*, Py_ssize_t, PyObject*, 1099 Py_ssize_t, Py_ssize_t) { 1100 UNIMPLEMENTED("PyUnicode_CopyCharacters"); 1101} 1102 1103PY_EXPORT Py_ssize_t PyUnicode_Count(PyObject* /* r */, PyObject* /* r */, 1104 Py_ssize_t /* t */, Py_ssize_t /* d */) { 1105 UNIMPLEMENTED("PyUnicode_Count"); 1106} 1107 1108PY_EXPORT PyObject* PyUnicode_Decode(const char* c_str, Py_ssize_t size, 1109 const char* encoding, const char* errors) { 1110 DCHECK(c_str != nullptr, "c_str cannot be null"); 1111 if (encoding == nullptr) { 1112 return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr); 1113 } 1114 1115 Thread* thread = Thread::current(); 1116 Runtime* runtime = thread->runtime(); 1117 HandleScope scope(thread); 1118 Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>( 1119 reinterpret_cast<const byte*>(c_str), size))); 1120 Object errors_obj(&scope, symbolFromError(thread, errors)); 1121 Object encoding_obj(&scope, runtime->newStrFromCStr(encoding)); 1122 Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(decode), bytes, 1123 encoding_obj, errors_obj)); 1124 if (result.isError()) { 1125 return nullptr; 1126 } 1127 return ApiHandle::newReference(runtime, *result); 1128} 1129 1130PY_EXPORT PyObject* PyUnicode_DecodeASCII(const char* c_str, Py_ssize_t size, 1131 const char* errors) { 1132 Thread* thread = Thread::current(); 1133 Runtime* runtime = thread->runtime(); 1134 HandleScope scope(thread); 1135 Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>( 1136 reinterpret_cast<const byte*>(c_str), size))); 1137 Str errors_obj(&scope, symbolFromError(thread, errors)); 1138 Object result_obj( 1139 &scope, thread->invokeFunction2(ID(_codecs), ID(ascii_decode), bytes, 1140 errors_obj)); 1141 if (result_obj.isError()) { 1142 if (result_obj.isErrorNotFound()) { 1143 thread->raiseWithFmt(LayoutId::kSystemError, 1144 "could not call _codecs.ascii_decode"); 1145 } 1146 return nullptr; 1147 } 1148 Tuple result(&scope, *result_obj); 1149 return ApiHandle::newReference(runtime, result.at(0)); 1150} 1151 1152PY_EXPORT PyObject* PyUnicode_DecodeCharmap(const char* /* s */, 1153 Py_ssize_t /* e */, 1154 PyObject* /* g */, 1155 const char* /* s */) { 1156 UNIMPLEMENTED("PyUnicode_DecodeCharmap"); 1157} 1158 1159PY_EXPORT PyObject* PyUnicode_DecodeCodePageStateful(int /* e */, 1160 const char* /* s */, 1161 Py_ssize_t /* e */, 1162 const char* /* s */, 1163 Py_ssize_t* /* d */) { 1164 UNIMPLEMENTED("PyUnicode_DecodeCodePageStateful"); 1165} 1166 1167PY_EXPORT PyObject* PyUnicode_DecodeFSDefault(const char* c_str) { 1168 Runtime* runtime = Thread::current()->runtime(); 1169 return ApiHandle::newReference(runtime, runtime->newStrFromCStr(c_str)); 1170} 1171 1172PY_EXPORT PyObject* PyUnicode_DecodeFSDefaultAndSize(const char* c_str, 1173 Py_ssize_t size) { 1174 Runtime* runtime = Thread::current()->runtime(); 1175 View<byte> str(reinterpret_cast<const byte*>(c_str), size); 1176 return ApiHandle::newReference(runtime, runtime->newStrWithAll(str)); 1177} 1178 1179PY_EXPORT PyObject* PyUnicode_DecodeLatin1(const char* c_str, Py_ssize_t size, 1180 const char* /* errors */) { 1181 Thread* thread = Thread::current(); 1182 Runtime* runtime = thread->runtime(); 1183 HandleScope scope(thread); 1184 Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>( 1185 reinterpret_cast<const byte*>(c_str), size))); 1186 Object result_obj( 1187 &scope, thread->invokeFunction1(ID(_codecs), ID(latin_1_decode), bytes)); 1188 if (result_obj.isError()) { 1189 if (result_obj.isErrorNotFound()) { 1190 thread->raiseWithFmt(LayoutId::kSystemError, 1191 "could not call _codecs.latin_1_decode"); 1192 } 1193 return nullptr; 1194 } 1195 Tuple result(&scope, *result_obj); 1196 return ApiHandle::newReference(runtime, result.at(0)); 1197} 1198 1199PY_EXPORT PyObject* PyUnicode_DecodeLocale(const char* str, 1200 const char* errors) { 1201 return PyUnicode_DecodeLocaleAndSize(str, std::strlen(str), errors); 1202} 1203 1204PY_EXPORT PyObject* PyUnicode_DecodeLocaleAndSize(const char* str, 1205 Py_ssize_t len, 1206 const char* errors) { 1207 _Py_error_handler surrogateescape; 1208 if (errors == nullptr || std::strcmp(errors, "strict") == 0) { 1209 surrogateescape = _Py_ERROR_STRICT; 1210 } else if (std::strcmp(errors, "surrogateescape") == 0) { 1211 surrogateescape = _Py_ERROR_SURROGATEESCAPE; 1212 } else { 1213 Thread::current()->raiseWithFmt( 1214 LayoutId::kValueError, 1215 "only 'strict' and 'surrogateescape' error handlers " 1216 "are supported, not '%s'", 1217 errors); 1218 return nullptr; 1219 } 1220 1221 if (str[len] != '\0' || static_cast<size_t>(len) != std::strlen(str)) { 1222 Thread::current()->raiseWithFmt(LayoutId::kValueError, 1223 "embedded null byte"); 1224 return nullptr; 1225 } 1226 1227 wchar_t* wstr; 1228 size_t wlen; 1229 const char* reason; 1230 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, 1, surrogateescape); 1231 if (res != 0) { 1232 if (res == -2) { 1233 PyObject* exc = 1234 PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", "locale", 1235 str, len, wlen, wlen + 1, reason); 1236 if (exc != nullptr) { 1237 PyCodec_StrictErrors(exc); 1238 Py_DECREF(exc); 1239 } 1240 } else { 1241 PyErr_NoMemory(); 1242 } 1243 return nullptr; 1244 } 1245 1246 PyObject* unicode = PyUnicode_FromWideChar(wstr, wlen); 1247 PyMem_RawFree(wstr); 1248 return unicode; 1249} 1250 1251PY_EXPORT PyObject* PyUnicode_DecodeMBCS(const char* /* s */, 1252 Py_ssize_t /* e */, 1253 const char* /* s */) { 1254 UNIMPLEMENTED("PyUnicode_DecodeMBCS"); 1255} 1256 1257PY_EXPORT PyObject* PyUnicode_DecodeMBCSStateful(const char* /* s */, 1258 Py_ssize_t /* e */, 1259 const char* /* s */, 1260 Py_ssize_t* /* d */) { 1261 UNIMPLEMENTED("PyUnicode_DecodeMBCSStateful"); 1262} 1263 1264PY_EXPORT PyObject* PyUnicode_DecodeRawUnicodeEscape(const char* /* s */, 1265 Py_ssize_t /* e */, 1266 const char* /* s */) { 1267 UNIMPLEMENTED("PyUnicode_DecodeRawUnicodeEscape"); 1268} 1269 1270PY_EXPORT PyObject* PyUnicode_DecodeUTF16(const char* /* s */, 1271 Py_ssize_t /* e */, 1272 const char* /* s */, int* /* r */) { 1273 UNIMPLEMENTED("PyUnicode_DecodeUTF16"); 1274} 1275 1276PY_EXPORT PyObject* PyUnicode_DecodeUTF16Stateful(const char* /* s */, 1277 Py_ssize_t /* e */, 1278 const char* /* s */, 1279 int* /* r */, 1280 Py_ssize_t* /* d */) { 1281 UNIMPLEMENTED("PyUnicode_DecodeUTF16Stateful"); 1282} 1283 1284PY_EXPORT PyObject* PyUnicode_DecodeUTF32(const char* /* s */, 1285 Py_ssize_t /* e */, 1286 const char* /* s */, int* /* r */) { 1287 UNIMPLEMENTED("PyUnicode_DecodeUTF32"); 1288} 1289 1290PY_EXPORT PyObject* PyUnicode_DecodeUTF32Stateful(const char* /* s */, 1291 Py_ssize_t /* e */, 1292 const char* /* s */, 1293 int* /* r */, 1294 Py_ssize_t* /* d */) { 1295 UNIMPLEMENTED("PyUnicode_DecodeUTF32Stateful"); 1296} 1297 1298PY_EXPORT PyObject* PyUnicode_DecodeUTF7(const char* /* s */, 1299 Py_ssize_t /* e */, 1300 const char* /* s */) { 1301 UNIMPLEMENTED("PyUnicode_DecodeUTF7"); 1302} 1303 1304PY_EXPORT PyObject* PyUnicode_DecodeUTF7Stateful(const char* /* s */, 1305 Py_ssize_t /* e */, 1306 const char* /* s */, 1307 Py_ssize_t* /* d */) { 1308 UNIMPLEMENTED("PyUnicode_DecodeUTF7Stateful"); 1309} 1310 1311PY_EXPORT PyObject* PyUnicode_DecodeUTF8(const char* c_str, Py_ssize_t size, 1312 const char* errors) { 1313 return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr); 1314} 1315 1316PY_EXPORT PyObject* PyUnicode_DecodeUTF8Stateful(const char* c_str, 1317 Py_ssize_t size, 1318 const char* errors, 1319 Py_ssize_t* consumed) { 1320 DCHECK(c_str != nullptr, "c_str cannot be null"); 1321 1322 Thread* thread = Thread::current(); 1323 HandleScope scope(thread); 1324 Runtime* runtime = thread->runtime(); 1325 word i = 0; 1326 const byte* byte_str = reinterpret_cast<const byte*>(c_str); 1327 for (; i < size; ++i) { 1328 if (byte_str[i] > kMaxASCII) break; 1329 } 1330 if (i == size) { 1331 if (consumed != nullptr) { 1332 *consumed = size; 1333 } 1334 return ApiHandle::newReference(runtime, 1335 runtime->newStrWithAll({byte_str, size})); 1336 } 1337 Object bytes(&scope, runtime->newBytesWithAll(View<byte>({byte_str, size}))); 1338 Object errors_obj(&scope, symbolFromError(thread, errors)); 1339 Object is_final(&scope, Bool::fromBool(consumed == nullptr)); 1340 Object result_obj( 1341 &scope, thread->invokeFunction3(ID(_codecs), ID(utf_8_decode), bytes, 1342 errors_obj, is_final)); 1343 if (result_obj.isError()) { 1344 if (result_obj.isErrorNotFound()) { 1345 thread->raiseWithFmt(LayoutId::kSystemError, 1346 "could not call _codecs._utf_8_decode_stateful"); 1347 } 1348 return nullptr; 1349 } 1350 Tuple result(&scope, *result_obj); 1351 if (consumed != nullptr) { 1352 *consumed = Int::cast(result.at(1)).asWord(); 1353 } 1354 return ApiHandle::newReference(runtime, result.at(0)); 1355} 1356 1357PY_EXPORT PyObject* PyUnicode_DecodeUnicodeEscape(const char* c_str, 1358 Py_ssize_t size, 1359 const char* errors) { 1360 DCHECK(c_str != nullptr, "c_str cannot be null"); 1361 const char* first_invalid_escape; 1362 PyObject* result = _PyUnicode_DecodeUnicodeEscape(c_str, size, errors, 1363 &first_invalid_escape); 1364 if (result == nullptr) { 1365 return nullptr; 1366 } 1367 if (first_invalid_escape != nullptr) { 1368 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 1369 "invalid escape sequence '\\%c'", 1370 static_cast<byte>(*first_invalid_escape)) < 0) { 1371 Py_DECREF(result); 1372 return nullptr; 1373 } 1374 } 1375 return result; 1376} 1377 1378PY_EXPORT PyObject* _PyUnicode_DecodeUnicodeEscape( 1379 const char* c_str, Py_ssize_t size, const char* errors, 1380 const char** first_invalid_escape) { 1381 DCHECK(c_str != nullptr, "c_str cannot be null"); 1382 DCHECK(first_invalid_escape != nullptr, 1383 "first_invalid_escape cannot be null"); 1384 1385 // So we can remember if we've seen an invalid escape char or not 1386 *first_invalid_escape = nullptr; 1387 1388 Thread* thread = Thread::current(); 1389 HandleScope scope(thread); 1390 Runtime* runtime = thread->runtime(); 1391 Object bytes(&scope, runtime->newBytesWithAll(View<byte>( 1392 reinterpret_cast<const byte*>(c_str), size))); 1393 Object errors_obj(&scope, symbolFromError(thread, errors)); 1394 Object result_obj( 1395 &scope, 1396 thread->invokeFunction2(ID(_codecs), ID(_unicode_escape_decode_stateful), 1397 bytes, errors_obj)); 1398 if (result_obj.isError()) { 1399 if (result_obj.isErrorNotFound()) { 1400 thread->raiseWithFmt(LayoutId::kSystemError, 1401 "could not call _codecs.unicode_escape_decode"); 1402 } 1403 return nullptr; 1404 } 1405 Tuple result(&scope, *result_obj); 1406 Int first_invalid_index(&scope, result.at(2)); 1407 word invalid_index = first_invalid_index.asWord(); 1408 if (invalid_index > -1) { 1409 *first_invalid_escape = c_str + invalid_index; 1410 } 1411 return ApiHandle::newReference(runtime, result.at(0)); 1412} 1413 1414PY_EXPORT PyObject* PyUnicode_EncodeCodePage(int /* e */, PyObject* /* e */, 1415 const char* /* s */) { 1416 UNIMPLEMENTED("PyUnicode_EncodeCodePage"); 1417} 1418 1419PY_EXPORT PyObject* PyUnicode_EncodeLocale(PyObject* unicode, 1420 const char* errors) { 1421 _Py_error_handler surrogateescape; 1422 if (errors == nullptr || std::strcmp(errors, "strict") == 0) { 1423 surrogateescape = _Py_ERROR_STRICT; 1424 } else if (std::strcmp(errors, "surrogateescape") == 0) { 1425 surrogateescape = _Py_ERROR_SURROGATEESCAPE; 1426 } else { 1427 Thread::current()->raiseWithFmt( 1428 LayoutId::kValueError, 1429 "only 'strict' and 'surrogateescape' error handlers " 1430 "are supported, not '%s'", 1431 errors); 1432 return nullptr; 1433 } 1434 Py_ssize_t wlen; 1435 wchar_t* wstr = PyUnicode_AsWideCharString(unicode, &wlen); 1436 if (wstr == nullptr) { 1437 return nullptr; 1438 } 1439 1440 if (static_cast<size_t>(wlen) != std::wcslen(wstr)) { 1441 Thread::current()->raiseWithFmt(LayoutId::kValueError, 1442 "embedded null character"); 1443 PyMem_Free(wstr); 1444 return nullptr; 1445 } 1446 1447 char* str; 1448 size_t error_pos; 1449 const char* reason; 1450 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, 1451 /*current_locale=*/1, surrogateescape); 1452 PyMem_Free(wstr); 1453 1454 if (res != 0) { 1455 if (res == -2) { 1456 PyObject* exc = 1457 PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", "locale", 1458 unicode, error_pos, error_pos + 1, reason); 1459 if (exc != nullptr) { 1460 PyCodec_StrictErrors(exc); 1461 Py_DECREF(exc); 1462 } 1463 } else { 1464 PyErr_NoMemory(); 1465 } 1466 return nullptr; 1467 } 1468 1469 PyObject* bytes = PyBytes_FromString(str); 1470 PyMem_RawFree(str); 1471 return bytes; 1472} 1473 1474PY_EXPORT PyObject* _PyUnicode_EncodeUTF16(PyObject* unicode, 1475 const char* errors, int byteorder) { 1476 DCHECK(unicode != nullptr, "unicode cannot be null"); 1477 Thread* thread = Thread::current(); 1478 HandleScope scope(thread); 1479 Runtime* runtime = thread->runtime(); 1480 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode))); 1481 if (!runtime->isInstanceOfStr(*str)) { 1482 thread->raiseBadArgument(); 1483 return nullptr; 1484 } 1485 Object errors_obj(&scope, symbolFromError(thread, errors)); 1486 Object byteorder_obj(&scope, runtime->newInt(byteorder)); 1487 Object tuple_obj(&scope, 1488 thread->invokeFunction3(ID(_codecs), ID(utf_16_encode), str, 1489 errors_obj, byteorder_obj)); 1490 if (tuple_obj.isError()) { 1491 return nullptr; 1492 } 1493 Tuple tuple(&scope, *tuple_obj); 1494 return ApiHandle::newReference(runtime, tuple.at(0)); 1495} 1496 1497PY_EXPORT PyObject* PyUnicode_EncodeUTF16(const Py_UNICODE* unicode, 1498 Py_ssize_t size, const char* errors, 1499 int byteorder) { 1500 PyObject* str = PyUnicode_FromUnicode(unicode, size); 1501 if (str == nullptr) return nullptr; 1502 PyObject* result = _PyUnicode_EncodeUTF16(str, errors, byteorder); 1503 Py_DECREF(str); 1504 return result; 1505} 1506 1507PY_EXPORT PyObject* _PyUnicode_EncodeUTF32(PyObject* unicode, 1508 const char* errors, int byteorder) { 1509 DCHECK(unicode != nullptr, "unicode cannot be null"); 1510 Thread* thread = Thread::current(); 1511 HandleScope scope(thread); 1512 Runtime* runtime = thread->runtime(); 1513 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode))); 1514 if (!runtime->isInstanceOfStr(*str)) { 1515 thread->raiseBadArgument(); 1516 return nullptr; 1517 } 1518 Object errors_obj(&scope, symbolFromError(thread, errors)); 1519 Object byteorder_obj(&scope, runtime->newInt(byteorder)); 1520 Object tuple_obj(&scope, 1521 thread->invokeFunction3(ID(_codecs), ID(utf_32_encode), str, 1522 errors_obj, byteorder_obj)); 1523 if (tuple_obj.isError()) { 1524 return nullptr; 1525 } 1526 Tuple tuple(&scope, *tuple_obj); 1527 return ApiHandle::newReference(runtime, tuple.at(0)); 1528} 1529 1530PY_EXPORT PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE* unicode, 1531 Py_ssize_t size, const char* errors, 1532 int byteorder) { 1533 PyObject* str = PyUnicode_FromUnicode(unicode, size); 1534 if (str == nullptr) return nullptr; 1535 PyObject* result = _PyUnicode_EncodeUTF32(str, errors, byteorder); 1536 Py_DECREF(str); 1537 return result; 1538} 1539 1540PY_EXPORT int PyUnicode_FSConverter(PyObject* arg, void* addr) { 1541 if (arg == nullptr) { 1542 Py_DECREF(*reinterpret_cast<PyObject**>(addr)); 1543 *reinterpret_cast<PyObject**>(addr) = nullptr; 1544 return 1; 1545 } 1546 Thread* thread = Thread::current(); 1547 HandleScope scope(thread); 1548 Object arg_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(arg))); 1549 Object path(&scope, NoneType::object()); 1550 Runtime* runtime = thread->runtime(); 1551 if (runtime->isInstanceOfStr(*arg_obj) || 1552 runtime->isInstanceOfBytes(*arg_obj)) { 1553 path = *arg_obj; 1554 } else { 1555 path = thread->invokeFunction1(ID(_io), ID(_fspath), arg_obj); 1556 if (path.isErrorException()) { 1557 return 0; 1558 } 1559 } 1560 Object output(&scope, NoneType::object()); 1561 if (runtime->isInstanceOfBytes(*path)) { 1562 output = *path; 1563 } else { 1564 CHECK(std::strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0, ""); 1565 CHECK(std::strcmp(Py_FileSystemDefaultEncodeErrors, "surrogatepass") == 0, 1566 ""); 1567 // PyOS_FSPath/_io._fspath guarantee their returned value is bytes or str. 1568 // This is an inlined PyUnicode_FSDecoder, which does a UTF-8 decode with 1569 // surrogatepass. Since our strings are UTF-8 with UTF-16 surrogates 1570 // (WTF-8), we can just copy the bytes out. 1571 Str path_str(&scope, strUnderlying(*path)); 1572 word path_len = path_str.length(); 1573 MutableBytes bytes(&scope, runtime->newMutableBytesUninitialized(path_len)); 1574 bytes.replaceFromWithStr(0, *path_str, path_len); 1575 output = bytes.becomeImmutable(); 1576 } 1577 Bytes underlying(&scope, bytesUnderlying(*output)); 1578 if (underlying.findByte('\0', /*start=*/0, /*length=*/underlying.length()) != 1579 -1) { 1580 thread->raiseWithFmt(LayoutId::kValueError, "embedded null byte"); 1581 return 0; 1582 } 1583 *reinterpret_cast<PyObject**>(addr) = 1584 ApiHandle::newReference(runtime, *output); 1585 return Py_CLEANUP_SUPPORTED; 1586} 1587 1588PY_EXPORT int PyUnicode_FSDecoder(PyObject* arg, void* addr) { 1589 if (arg == nullptr) { 1590 Py_DECREF(*(PyObject**)addr); 1591 *reinterpret_cast<PyObject**>(addr) = nullptr; 1592 return 1; 1593 } 1594 1595 bool is_buffer = PyObject_CheckBuffer(arg); 1596 PyObject* path; 1597 if (!is_buffer) { 1598 path = PyOS_FSPath(arg); 1599 if (path == nullptr) return 0; 1600 } else { 1601 path = arg; 1602 Py_INCREF(arg); 1603 } 1604 1605 PyObject* output; 1606 if (PyUnicode_Check(path)) { 1607 output = path; 1608 } else if (PyBytes_Check(path) || is_buffer) { 1609 if (!PyBytes_Check(path) && 1610 PyErr_WarnFormat( 1611 PyExc_DeprecationWarning, 1, 1612 "path should be string, bytes, or os.PathLike, not %.200s", 1613 PyObject_TypeName(arg))) { 1614 Py_DECREF(path); 1615 return 0; 1616 } 1617 PyObject* path_bytes = PyBytes_FromObject(path); 1618 Py_DECREF(path); 1619 if (!path_bytes) return 0; 1620 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes), 1621 PyBytes_GET_SIZE(path_bytes)); 1622 Py_DECREF(path_bytes); 1623 if (!output) return 0; 1624 } else { 1625 Thread::current()->raiseWithFmt( 1626 LayoutId::kTypeError, 1627 "path should be string, bytes, or os.PathLike, not %s", 1628 PyObject_TypeName(arg)); 1629 Py_DECREF(path); 1630 return 0; 1631 } 1632 1633 Thread* thread = Thread::current(); 1634 HandleScope scope(thread); 1635 Str output_str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(output))); 1636 if (strFindAsciiChar(output_str, '\0') >= 0) { 1637 thread->raiseWithFmt(LayoutId::kValueError, "embedded null character"); 1638 Py_DECREF(output); 1639 return 0; 1640 } 1641 *reinterpret_cast<PyObject**>(addr) = output; 1642 return Py_CLEANUP_SUPPORTED; 1643} 1644 1645PY_EXPORT Py_ssize_t PyUnicode_Find(PyObject* str, PyObject* substr, 1646 Py_ssize_t start, Py_ssize_t end, 1647 int direction) { 1648 DCHECK(str != nullptr, "str must be non-null"); 1649 DCHECK(substr != nullptr, "substr must be non-null"); 1650 DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1"); 1651 Thread* thread = Thread::current(); 1652 HandleScope scope(thread); 1653 Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1654 Object needle_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr))); 1655 Runtime* runtime = thread->runtime(); 1656 if (!runtime->isInstanceOfStr(*haystack_obj)) { 1657 thread->raiseWithFmt(LayoutId::kTypeError, 1658 "PyUnicode_Find requires a 'str' instance"); 1659 return -2; 1660 } 1661 Str haystack(&scope, strUnderlying(*haystack_obj)); 1662 if (!runtime->isInstanceOfStr(*needle_obj)) { 1663 thread->raiseWithFmt(LayoutId::kTypeError, 1664 "PyUnicode_Find requires a 'str' instance"); 1665 return -2; 1666 } 1667 Str needle(&scope, strUnderlying(*needle_obj)); 1668 if (direction == 1) return strFindWithRange(haystack, needle, start, end); 1669 return strRFind(haystack, needle, start, end); 1670} 1671 1672PY_EXPORT Py_ssize_t PyUnicode_FindChar(PyObject* str, Py_UCS4 ch, 1673 Py_ssize_t start, Py_ssize_t end, 1674 int direction) { 1675 DCHECK(str != nullptr, "str must not be null"); 1676 DCHECK(direction == 1 || direction == -1, "direction must be -1 or 1"); 1677 Thread* thread = Thread::current(); 1678 HandleScope scope(thread); 1679 Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1680 Runtime* runtime = thread->runtime(); 1681 DCHECK(runtime->isInstanceOfStr(*haystack_obj), 1682 "PyUnicode_FindChar requires a 'str' instance"); 1683 Str haystack(&scope, strUnderlying(*haystack_obj)); 1684 Str needle(&scope, SmallStr::fromCodePoint(ch)); 1685 if (direction == 1) return strFindWithRange(haystack, needle, start, end); 1686 return strRFind(haystack, needle, start, end); 1687} 1688 1689PY_EXPORT PyObject* PyUnicode_Format(PyObject* format, PyObject* args) { 1690 if (format == nullptr || args == nullptr) { 1691 PyErr_BadInternalCall(); 1692 return nullptr; 1693 } 1694 if (!PyUnicode_Check(format)) { 1695 Thread::current()->raiseWithFmt(LayoutId::kTypeError, "must be str, not %s", 1696 _PyType_Name(Py_TYPE(format))); 1697 return nullptr; 1698 } 1699 return PyNumber_Remainder(format, args); 1700} 1701 1702PY_EXPORT PyObject* PyUnicode_FromEncodedObject(PyObject* /* j */, 1703 const char* /* g */, 1704 const char* /* s */) { 1705 UNIMPLEMENTED("PyUnicode_FromEncodedObject"); 1706} 1707 1708PY_EXPORT PyObject* PyUnicode_FromFormat(const char* format, ...) { 1709 va_list vargs; 1710 1711 va_start(vargs, format); 1712 PyObject* ret = PyUnicode_FromFormatV(format, vargs); 1713 va_end(vargs); 1714 return ret; 1715} 1716 1717PY_EXPORT PyObject* PyUnicode_FromFormatV(const char* format, va_list vargs) { 1718 va_list vargs2; 1719 _PyUnicodeWriter writer; 1720 1721 _PyUnicodeWriter_Init(&writer); 1722 writer.min_length = std::strlen(format) + 100; 1723 writer.overallocate = 1; 1724 1725 // This copy seems unnecessary but it may have been needed by CPython for 1726 // historical reasons. 1727 va_copy(vargs2, vargs); 1728 1729 for (const char* f = format; *f;) { 1730 if (*f == '%') { 1731 f = writeArg(&writer, f, &vargs2); 1732 if (f == nullptr) goto fail; 1733 } else { 1734 const char* p = f; 1735 do { 1736 if (static_cast<unsigned char>(*p) > 127) { 1737 PyErr_Format( 1738 PyExc_ValueError, 1739 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 1740 "string, got a non-ASCII byte: 0x%02x", 1741 static_cast<unsigned char>(*p)); 1742 goto fail; 1743 } 1744 p++; 1745 } while (*p != '\0' && *p != '%'); 1746 Py_ssize_t len = p - f; 1747 1748 if (*p == '\0') writer.overallocate = 0; 1749 1750 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) goto fail; 1751 1752 f = p; 1753 } 1754 } 1755 va_end(vargs2); 1756 return _PyUnicodeWriter_Finish(&writer); 1757 1758fail: 1759 va_end(vargs2); 1760 _PyUnicodeWriter_Dealloc(&writer); 1761 return nullptr; 1762} 1763 1764PY_EXPORT PyObject* PyUnicode_FromObject(PyObject* /* j */) { 1765 UNIMPLEMENTED("PyUnicode_FromObject"); 1766} 1767 1768PY_EXPORT PyObject* PyUnicode_FromOrdinal(int ordinal) { 1769 Thread* thread = Thread::current(); 1770 if (ordinal < 0 || ordinal > kMaxUnicode) { 1771 thread->raiseWithFmt(LayoutId::kValueError, 1772 "chr() arg not in range(0x110000)"); 1773 return nullptr; 1774 } 1775 return ApiHandle::newReference(thread->runtime(), 1776 SmallStr::fromCodePoint(ordinal)); 1777} 1778 1779PY_EXPORT PyObject* PyUnicode_FromWideChar(const wchar_t* buffer, 1780 Py_ssize_t size) { 1781 Thread* thread = Thread::current(); 1782 if (buffer == nullptr && size != 0) { 1783 thread->raiseBadInternalCall(); 1784 return nullptr; 1785 } 1786 1787 RawObject result = size == -1 1788 ? newStrFromWideChar(thread, buffer) 1789 : newStrFromWideCharWithLength(thread, buffer, size); 1790 return result.isErrorException() 1791 ? nullptr 1792 : ApiHandle::newReference(thread->runtime(), result); 1793} 1794 1795PY_EXPORT Py_ssize_t PyUnicode_GET_LENGTH_Func(PyObject* pyobj) { 1796 RawObject obj = ApiHandle::asObjectNoImmediate(ApiHandle::fromPyObject(pyobj)); 1797 DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj), 1798 "non-str argument to PyUnicode_GET_LENGTH"); 1799 return strUnderlying(obj).codePointLength(); 1800} 1801 1802PY_EXPORT const char* PyUnicode_GetDefaultEncoding() { 1803 return Py_FileSystemDefaultEncoding; 1804} 1805 1806PY_EXPORT Py_ssize_t PyUnicode_GetLength(PyObject* pyobj) { 1807 Thread* thread = Thread::current(); 1808 RawObject obj = ApiHandle::asObject(ApiHandle::fromPyObject(pyobj)); 1809 if (!thread->runtime()->isInstanceOfStr(obj)) { 1810 thread->raiseBadArgument(); 1811 return -1; 1812 } 1813 return strUnderlying(obj).codePointLength(); 1814} 1815 1816PY_EXPORT Py_ssize_t PyUnicode_GetSize(PyObject* pyobj) { 1817 // This function returns the number of UTF-16 or UTF-32 code units, depending 1818 // on the size of wchar_t on the operating system. On the machines that we 1819 // currently use for testing, this is the same as the number of Unicode code 1820 // points. This must be modified when we support operating systems with 1821 // different wchar_t (e.g. Windows). 1822 return PyUnicode_GetLength(pyobj); 1823} 1824 1825PY_EXPORT PyObject* PyUnicode_InternFromString(const char* c_str) { 1826 DCHECK(c_str != nullptr, "c_str must not be nullptr"); 1827 Thread* thread = Thread::current(); 1828 return ApiHandle::newReference(thread->runtime(), 1829 Runtime::internStrFromCStr(thread, c_str)); 1830} 1831 1832PY_EXPORT void PyUnicode_InternImmortal(PyObject** /* p */) { 1833 UNIMPLEMENTED("PyUnicode_InternImmortal"); 1834} 1835 1836PY_EXPORT void PyUnicode_InternInPlace(PyObject** obj_ptr) { 1837 PyObject* pobj = *obj_ptr; 1838 DCHECK(pobj != nullptr, "pobj should not be null"); 1839 if (pobj == nullptr) { 1840 return; 1841 } 1842 Thread* thread = Thread::current(); 1843 HandleScope scope(thread); 1844 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(pobj))); 1845 if (!obj.isLargeStr()) { 1846 return; 1847 } 1848 Object result(&scope, Runtime::internStr(thread, obj)); 1849 if (result != obj) { 1850 Py_DECREF(pobj); 1851 *obj_ptr = ApiHandle::newReference(thread->runtime(), *result); 1852 } 1853} 1854 1855PY_EXPORT int PyUnicode_IsIdentifier(PyObject* str) { 1856 DCHECK(str != nullptr, "str must not be null"); 1857 Thread* thread = Thread::current(); 1858 HandleScope scope(thread); 1859 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1860 if (str_obj == Str::empty()) { 1861 return false; 1862 } 1863 Object result(&scope, thread->invokeMethodStatic1(LayoutId::kStr, 1864 ID(isidentifier), str_obj)); 1865 DCHECK(!result.isErrorNotFound(), "could not call str.isidentifier"); 1866 CHECK(!result.isError(), "this function should not error"); 1867 return Bool::cast(*result).value(); 1868} 1869 1870PY_EXPORT PyObject* PyUnicode_Join(PyObject* sep, PyObject* seq) { 1871 DCHECK(sep != nullptr, "sep should not be null"); 1872 DCHECK(seq != nullptr, "seq should not be null"); 1873 Thread* thread = Thread::current(); 1874 HandleScope scope(thread); 1875 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep))); 1876 // An optimization to rule out non-str values here to use the further 1877 // optimization of `strJoinWithTupleOrList`. 1878 Runtime* runtime = thread->runtime(); 1879 if (!runtime->isInstanceOfStr(*sep_obj)) { 1880 thread->raiseWithFmt(LayoutId::kTypeError, 1881 "separator: expected str instance," 1882 "'%T' found", 1883 &sep_obj); 1884 return nullptr; 1885 } 1886 Str sep_str(&scope, strUnderlying(*sep_obj)); 1887 Object seq_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(seq))); 1888 // An ad-hoc optimization for the case `seq_obj` is a `tuple` or `list`, 1889 // that can be removed without changing the correctness of PyUnicode_Join. 1890 Object result(&scope, strJoinWithTupleOrList(thread, sep_str, seq_obj)); 1891 if (result.isUnbound()) { 1892 result = 1893 thread->invokeMethodStatic2(LayoutId::kStr, ID(join), sep_str, seq_obj); 1894 } 1895 if (result.isError()) { 1896 if (result.isErrorNotFound()) { 1897 thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.join"); 1898 } 1899 return nullptr; 1900 } 1901 return ApiHandle::newReference(runtime, *result); 1902} 1903 1904PY_EXPORT PyObject* PyUnicode_Partition(PyObject* str, PyObject* sep) { 1905 DCHECK(str != nullptr, "str should not be null"); 1906 DCHECK(sep != nullptr, "sep should not be null"); 1907 Thread* thread = Thread::current(); 1908 HandleScope scope(thread); 1909 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1910 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep))); 1911 Object result(&scope, thread->invokeMethodStatic2( 1912 LayoutId::kStr, ID(partition), str_obj, sep_obj)); 1913 if (result.isError()) { 1914 if (result.isErrorNotFound()) { 1915 thread->raiseWithFmt(LayoutId::kTypeError, 1916 "could not call str.partition"); 1917 } 1918 return nullptr; 1919 } 1920 return ApiHandle::newReference(thread->runtime(), *result); 1921} 1922 1923PY_EXPORT PyObject* PyUnicode_RPartition(PyObject* str, PyObject* sep) { 1924 DCHECK(str != nullptr, "str should not be null"); 1925 DCHECK(sep != nullptr, "sep should not be null"); 1926 Thread* thread = Thread::current(); 1927 HandleScope scope(thread); 1928 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1929 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep))); 1930 Object result(&scope, thread->invokeMethodStatic2( 1931 LayoutId::kStr, ID(rpartition), str_obj, sep_obj)); 1932 if (result.isError()) { 1933 if (result.isErrorNotFound()) { 1934 thread->raiseWithFmt(LayoutId::kTypeError, 1935 "could not call str.rpartition"); 1936 } 1937 return nullptr; 1938 } 1939 return ApiHandle::newReference(thread->runtime(), *result); 1940} 1941 1942PY_EXPORT PyObject* PyUnicode_RSplit(PyObject* str, PyObject* sep, 1943 Py_ssize_t maxsplit) { 1944 DCHECK(str != nullptr, "str must not be null"); 1945 DCHECK(sep != nullptr, "sep must not be null"); 1946 Thread* thread = Thread::current(); 1947 HandleScope scope(thread); 1948 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1949 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep))); 1950 Runtime* runtime = thread->runtime(); 1951 Object maxsplit_obj(&scope, runtime->newInt(maxsplit)); 1952 Object result(&scope, 1953 thread->invokeMethodStatic3(LayoutId::kStr, ID(rsplit), str_obj, 1954 sep_obj, maxsplit_obj)); 1955 if (result.isError()) { 1956 if (result.isErrorNotFound()) { 1957 thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.rsplit"); 1958 } 1959 return nullptr; 1960 } 1961 return ApiHandle::newReference(runtime, *result); 1962} 1963 1964PY_EXPORT Py_UCS4 PyUnicode_ReadChar(PyObject* obj, Py_ssize_t index) { 1965 DCHECK(obj != nullptr, "obj must not be null"); 1966 Thread* thread = Thread::current(); 1967 HandleScope scope(thread); 1968 Runtime* runtime = thread->runtime(); 1969 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj))); 1970 if (!runtime->isInstanceOfStr(*str_obj)) { 1971 thread->raiseBadArgument(); 1972 return -1; 1973 } 1974 Str str(&scope, strUnderlying(*str_obj)); 1975 word byte_offset; 1976 if (index < 0 || 1977 (byte_offset = thread->strOffset(str, index)) >= str.length()) { 1978 thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range"); 1979 return -1; 1980 } 1981 word num_bytes; 1982 return str.codePointAt(byte_offset, &num_bytes); 1983} 1984 1985PY_EXPORT PyObject* PyUnicode_Replace(PyObject* str, PyObject* substr, 1986 PyObject* replstr, Py_ssize_t maxcount) { 1987 DCHECK(str != nullptr, "str must not be null"); 1988 DCHECK(substr != nullptr, "substr must not be null"); 1989 DCHECK(replstr != nullptr, "replstr must not be null"); 1990 Thread* thread = Thread::current(); 1991 HandleScope scope(thread); 1992 Runtime* runtime = thread->runtime(); 1993 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 1994 if (!runtime->isInstanceOfStr(*str_obj)) { 1995 thread->raiseWithFmt(LayoutId::kTypeError, "str must be str"); 1996 return nullptr; 1997 } 1998 1999 Object substr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr))); 2000 if (!runtime->isInstanceOfStr(*substr_obj)) { 2001 thread->raiseWithFmt(LayoutId::kTypeError, "substr must be str"); 2002 return nullptr; 2003 } 2004 2005 Object replstr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(replstr))); 2006 if (!runtime->isInstanceOfStr(*replstr_obj)) { 2007 thread->raiseWithFmt(LayoutId::kTypeError, "replstr must be str"); 2008 return nullptr; 2009 } 2010 2011 Str str_str(&scope, strUnderlying(*str_obj)); 2012 Str substr_str(&scope, strUnderlying(*substr_obj)); 2013 Str replstr_str(&scope, strUnderlying(*replstr_obj)); 2014 return ApiHandle::newReference( 2015 runtime, 2016 runtime->strReplace(thread, str_str, substr_str, replstr_str, maxcount)); 2017} 2018 2019PY_EXPORT int PyUnicode_Resize(PyObject** /* p_unicode */, Py_ssize_t /* h */) { 2020 UNIMPLEMENTED("PyUnicode_Resize"); 2021} 2022 2023PY_EXPORT PyObject* PyUnicode_RichCompare(PyObject* /* t */, PyObject* /* t */, 2024 int /* p */) { 2025 UNIMPLEMENTED("PyUnicode_RichCompare"); 2026} 2027 2028PY_EXPORT PyObject* PyUnicode_Split(PyObject* str, PyObject* sep, 2029 Py_ssize_t maxsplit) { 2030 DCHECK(str != nullptr, "str must not be null"); 2031 DCHECK(sep != nullptr, "sep must not be null"); 2032 Thread* thread = Thread::current(); 2033 HandleScope scope(thread); 2034 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 2035 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep))); 2036 Runtime* runtime = thread->runtime(); 2037 Object maxsplit_obj(&scope, runtime->newInt(maxsplit)); 2038 Object result(&scope, 2039 thread->invokeMethodStatic3(LayoutId::kStr, ID(split), str_obj, 2040 sep_obj, maxsplit_obj)); 2041 if (result.isError()) { 2042 if (result.isErrorNotFound()) { 2043 thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.split"); 2044 } 2045 return nullptr; 2046 } 2047 return ApiHandle::newReference(runtime, *result); 2048} 2049 2050PY_EXPORT PyObject* PyUnicode_Splitlines(PyObject* str, int keepends) { 2051 Thread* thread = Thread::current(); 2052 HandleScope scope(thread); 2053 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 2054 Runtime* runtime = thread->runtime(); 2055 if (!runtime->isInstanceOfStr(*str_obj)) { 2056 thread->raiseWithFmt(LayoutId::kTypeError, "must be str, not '%T'", 2057 &str_obj); 2058 return nullptr; 2059 } 2060 Str str_str(&scope, strUnderlying(*str_obj)); 2061 return ApiHandle::newReference(runtime, 2062 strSplitlines(thread, str_str, keepends)); 2063} 2064 2065PY_EXPORT PyObject* PyUnicode_Substring(PyObject* pyobj, Py_ssize_t start, 2066 Py_ssize_t end) { 2067 DCHECK(pyobj != nullptr, "null argument to PyUnicode_Substring"); 2068 Thread* thread = Thread::current(); 2069 if (start < 0 || end < 0) { 2070 thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range"); 2071 return nullptr; 2072 } 2073 Runtime* runtime = thread->runtime(); 2074 if (end <= start) { 2075 return ApiHandle::newReference(runtime, Str::empty()); 2076 } 2077 HandleScope scope(thread); 2078 ApiHandle* handle = ApiHandle::fromPyObject(pyobj); 2079 Object obj(&scope, ApiHandle::asObject(handle)); 2080 DCHECK(runtime->isInstanceOfStr(*obj), 2081 "PyUnicode_Substring requires a 'str' instance"); 2082 Str self(&scope, strUnderlying(*obj)); 2083 word len = self.length(); 2084 word start_index = thread->strOffset(self, start); 2085 if (start_index == len) { 2086 return ApiHandle::newReference(runtime, Str::empty()); 2087 } 2088 word end_index = thread->strOffset(self, end); 2089 if (end_index == len) { 2090 if (start_index == 0) { 2091 ApiHandle::incref(handle); 2092 return pyobj; 2093 } 2094 } 2095 return ApiHandle::newReference( 2096 runtime, strSubstr(thread, self, start_index, end_index - start_index)); 2097} 2098 2099PY_EXPORT Py_ssize_t PyUnicode_Tailmatch(PyObject* str, PyObject* substr, 2100 Py_ssize_t start, Py_ssize_t end, 2101 int direction) { 2102 DCHECK(str != nullptr, "str must be non-null"); 2103 DCHECK(substr != nullptr, "substr must be non-null"); 2104 DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1"); 2105 Thread* thread = Thread::current(); 2106 HandleScope scope(thread); 2107 Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str))); 2108 Object needle_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr))); 2109 Runtime* runtime = thread->runtime(); 2110 if (!runtime->isInstanceOfStr(*haystack_obj) || 2111 !runtime->isInstanceOfStr(*needle_obj)) { 2112 thread->raiseBadArgument(); 2113 return -1; 2114 } 2115 Str haystack(&scope, strUnderlying(*haystack_obj)); 2116 Str needle(&scope, strUnderlying(*needle_obj)); 2117 word haystack_len = haystack.codePointLength(); 2118 Slice::adjustSearchIndices(&start, &end, haystack_len); 2119 word needle_len = needle.codePointLength(); 2120 if (start + needle_len > end) { 2121 return 0; 2122 } 2123 word start_offset; 2124 if (direction == 1) { 2125 start_offset = haystack.offsetByCodePoints(0, end - needle_len); 2126 } else { 2127 start_offset = haystack.offsetByCodePoints(0, start); 2128 } 2129 word needle_chars = needle.length(); 2130 for (word i = start_offset, j = 0; j < needle_chars; i++, j++) { 2131 if (haystack.byteAt(i) != needle.byteAt(j)) { 2132 return 0; 2133 } 2134 } 2135 return 1; 2136} 2137 2138PY_EXPORT PyObject* PyUnicode_Translate(PyObject* /* r */, PyObject* /* g */, 2139 const char* /* s */) { 2140 UNIMPLEMENTED("PyUnicode_Translate"); 2141} 2142 2143PY_EXPORT PyTypeObject* PyUnicode_Type_Ptr() { 2144 Runtime* runtime = Thread::current()->runtime(); 2145 return reinterpret_cast<PyTypeObject*>( 2146 ApiHandle::borrowedReference(runtime, runtime->typeAt(LayoutId::kStr))); 2147} 2148 2149PY_EXPORT int PyUnicode_WriteChar(PyObject* /* e */, Py_ssize_t /* x */, 2150 Py_UCS4 /* h */) { 2151 UNIMPLEMENTED("PyUnicode_WriteChar"); 2152} 2153 2154PY_EXPORT Py_UNICODE* PyUnicode_AsUnicode(PyObject* /* e */) { 2155 UNIMPLEMENTED("PyUnicode_AsUnicode"); 2156} 2157 2158PY_EXPORT Py_UNICODE* PyUnicode_AsUnicodeAndSize(PyObject* /* unicode */, 2159 Py_ssize_t* /* size */) { 2160 UNIMPLEMENTED("PyUnicode_AsUnicodeAndSize"); 2161} 2162 2163template <typename T> 2164static PyObject* decodeUnicodeToString(Thread* thread, const void* src, 2165 word size) { 2166 Runtime* runtime = thread->runtime(); 2167 DCHECK(src != nullptr, "Must pass in a non-null buffer"); 2168 const T* cp = static_cast<const T*>(src); 2169 if (size == 1) { 2170 return ApiHandle::newReference(runtime, SmallStr::fromCodePoint(cp[0])); 2171 } 2172 HandleScope scope(thread); 2173 // TODO(T41785453): Remove the StrArray intermediary 2174 StrArray array(&scope, runtime->newStrArray()); 2175 runtime->strArrayEnsureCapacity(thread, array, size); 2176 for (word i = 0; i < size; ++i) { 2177 runtime->strArrayAddCodePoint(thread, array, cp[i]); 2178 } 2179 return ApiHandle::newReference(runtime, runtime->strFromStrArray(array)); 2180} 2181 2182PY_EXPORT PyObject* PyUnicode_FromKindAndData(int kind, const void* buffer, 2183 Py_ssize_t size) { 2184 Thread* thread = Thread::current(); 2185 if (size < 0) { 2186 thread->raiseWithFmt(LayoutId::kValueError, "size must be positive"); 2187 return nullptr; 2188 } 2189 if (size == 0) { 2190 return ApiHandle::newReference(thread->runtime(), Str::empty()); 2191 } 2192 switch (kind) { 2193 case PyUnicode_1BYTE_KIND: 2194 return decodeUnicodeToString<Py_UCS1>(thread, buffer, size); 2195 case PyUnicode_2BYTE_KIND: 2196 return decodeUnicodeToString<Py_UCS2>(thread, buffer, size); 2197 case PyUnicode_4BYTE_KIND: 2198 return decodeUnicodeToString<Py_UCS4>(thread, buffer, size); 2199 } 2200 thread->raiseWithFmt(LayoutId::kSystemError, "invalid kind"); 2201 return nullptr; 2202} 2203 2204PY_EXPORT PyObject* PyUnicode_FromUnicode(const Py_UNICODE* code_units, 2205 Py_ssize_t size) { 2206 if (code_units == nullptr) { 2207 // TODO(T36562134): Implement _PyUnicode_New 2208 UNIMPLEMENTED("_PyUnicode_New"); 2209 } 2210 2211 Thread* thread = Thread::current(); 2212 RawObject result = newStrFromWideCharWithLength(thread, code_units, size); 2213 return result.isErrorException() 2214 ? nullptr 2215 : ApiHandle::newReference(thread->runtime(), result); 2216} 2217 2218PY_EXPORT int PyUnicode_KIND_Func(PyObject* obj) { 2219 // TODO(T47682853): Introduce new PyUnicode_VARBYTE_KIND 2220 CHECK(PyUnicode_IS_ASCII_Func(obj), "only ASCII allowed"); 2221 return PyUnicode_1BYTE_KIND; 2222} 2223 2224// NOTE: This will return a cached and managed C-string buffer that is a copy 2225// of the Str internal buffer. It is NOT a direct pointer into the string 2226// object, so writing into this buffer will do nothing. This is different 2227// behavior from CPython, where changing the data in the buffer changes the 2228// string object. 2229PY_EXPORT void* PyUnicode_DATA_Func(PyObject* str) { 2230 Thread* thread = Thread::current(); 2231 Runtime* runtime = thread->runtime(); 2232 ApiHandle* handle = ApiHandle::fromPyObject(str); 2233 if (void* cache = ApiHandle::cache(runtime, handle)) { 2234 return static_cast<char*>(cache); 2235 } 2236 HandleScope scope(thread); 2237 Object obj(&scope, ApiHandle::asObject(handle)); 2238 DCHECK(runtime->isInstanceOfStr(*obj), "str should be a str instance"); 2239 Str str_obj(&scope, strUnderlying(*obj)); 2240 word length = str_obj.length(); 2241 byte* result = static_cast<byte*>(std::malloc(length + 1)); 2242 str_obj.copyTo(result, length); 2243 result[length] = '\0'; 2244 ApiHandle::setCache(runtime, handle, result); 2245 ApiHandle::setBorrowedNoImmediate(handle); 2246 return reinterpret_cast<char*>(result); 2247} 2248 2249PY_EXPORT Py_UCS4 PyUnicode_READ_Func(int kind, void* data, Py_ssize_t index) { 2250 if (kind == PyUnicode_1BYTE_KIND) return static_cast<Py_UCS1*>(data)[index]; 2251 if (kind == PyUnicode_2BYTE_KIND) return static_cast<Py_UCS2*>(data)[index]; 2252 DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND"); 2253 return static_cast<Py_UCS4*>(data)[index]; 2254} 2255 2256PY_EXPORT Py_UCS4 PyUnicode_READ_CHAR_Func(PyObject* obj, Py_ssize_t index) { 2257 Thread* thread = Thread::current(); 2258 HandleScope scope(thread); 2259 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj))); 2260 DCHECK(thread->runtime()->isInstanceOfStr(*str_obj), 2261 "PyUnicode_READ_CHAR must receive a unicode object"); 2262 Str str(&scope, strUnderlying(*str_obj)); 2263 word byte_offset = thread->strOffset(str, index); 2264 if (byte_offset == str.length()) return Py_UCS4{0}; 2265 word num_bytes; 2266 return static_cast<Py_UCS4>(str.codePointAt(byte_offset, &num_bytes)); 2267} 2268 2269PY_EXPORT int PyUnicode_IS_ASCII_Func(PyObject* obj) { 2270 Thread* thread = Thread::current(); 2271 HandleScope scope(thread); 2272 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj))); 2273 DCHECK(thread->runtime()->isInstanceOfStr(*str), 2274 "strIsASCII must receive a unicode object"); 2275 return strUnderlying(*str).isASCII() ? 1 : 0; 2276} 2277 2278PY_EXPORT int Py_UNICODE_ISALPHA_Func(Py_UCS4 code_point) { 2279 if (code_point > kMaxUnicode) { 2280 return 0; 2281 } 2282 return Unicode::isAlpha(static_cast<int32_t>(code_point)) ? 1 : 0; 2283} 2284 2285PY_EXPORT int Py_UNICODE_ISDECIMAL_Func(Py_UCS4 code_point) { 2286 if (code_point > kMaxUnicode) { 2287 return 0; 2288 } 2289 return Unicode::isDecimal(static_cast<int32_t>(code_point)) ? 1 : 0; 2290} 2291 2292PY_EXPORT int Py_UNICODE_ISDIGIT_Func(Py_UCS4 code_point) { 2293 if (code_point > kMaxUnicode) { 2294 return 0; 2295 } 2296 return Unicode::isDigit(static_cast<int32_t>(code_point)) ? 1 : 0; 2297} 2298 2299PY_EXPORT int Py_UNICODE_ISLINEBREAK_Func(Py_UCS4 code_point) { 2300 if (code_point > kMaxUnicode) { 2301 return 0; 2302 } 2303 return Unicode::isLinebreak(static_cast<int32_t>(code_point)) ? 1 : 0; 2304} 2305 2306PY_EXPORT int Py_UNICODE_ISLOWER_Func(Py_UCS4 code_point) { 2307 if (code_point > kMaxUnicode) { 2308 return 0; 2309 } 2310 return Unicode::isLower(static_cast<int32_t>(code_point)) ? 1 : 0; 2311} 2312 2313PY_EXPORT int Py_UNICODE_ISNUMERIC_Func(Py_UCS4 code_point) { 2314 if (code_point > kMaxUnicode) { 2315 return 0; 2316 } 2317 return Unicode::isNumeric(static_cast<int32_t>(code_point)) ? 1 : 0; 2318} 2319 2320PY_EXPORT int Py_UNICODE_ISPRINTABLE_Func(Py_UCS4 code_point) { 2321 if (code_point > kMaxUnicode) { 2322 return 0; 2323 } 2324 return Unicode::isPrintable(static_cast<int32_t>(code_point)) ? 1 : 0; 2325} 2326 2327PY_EXPORT int Py_UNICODE_ISSPACE_Func(Py_UCS4 code_point) { 2328 if (code_point > kMaxUnicode) { 2329 return 0; 2330 } 2331 return Unicode::isSpace(static_cast<int32_t>(code_point)) ? 1 : 0; 2332} 2333 2334PY_EXPORT int Py_UNICODE_ISTITLE_Func(Py_UCS4 code_point) { 2335 if (code_point > kMaxUnicode) { 2336 return 0; 2337 } 2338 return Unicode::isTitle(static_cast<int32_t>(code_point)) ? 1 : 0; 2339} 2340 2341PY_EXPORT int Py_UNICODE_ISUPPER_Func(Py_UCS4 code_point) { 2342 if (code_point > kMaxUnicode) { 2343 return 0; 2344 } 2345 return Unicode::isUpper(static_cast<int32_t>(code_point)) ? 1 : 0; 2346} 2347 2348PY_EXPORT int Py_UNICODE_TODECIMAL_Func(Py_UCS4 code_point) { 2349 if (code_point > kMaxUnicode) { 2350 return -1; 2351 } 2352 return Unicode::toDecimal(static_cast<int32_t>(code_point)); 2353} 2354 2355PY_EXPORT int Py_UNICODE_TODIGIT_Func(Py_UCS4 code_point) { 2356 if (code_point > kMaxUnicode) { 2357 return -1; 2358 } 2359 return Unicode::toDigit(static_cast<int32_t>(code_point)); 2360} 2361 2362PY_EXPORT Py_UCS4 Py_UNICODE_TOLOWER_Func(Py_UCS4 code_point) { 2363 if (code_point > kMaxUnicode) { 2364 return code_point; 2365 } 2366 FullCasing lower = Unicode::toLower(static_cast<int32_t>(code_point)); 2367 return lower.code_points[0]; 2368} 2369 2370PY_EXPORT double Py_UNICODE_TONUMERIC_Func(Py_UCS4 code_point) { 2371 if (code_point > kMaxUnicode) { 2372 return -1.0; 2373 } 2374 return Unicode::toNumeric(static_cast<int32_t>(code_point)); 2375} 2376 2377PY_EXPORT Py_UCS4 Py_UNICODE_TOTITLE_Func(Py_UCS4 code_point) { 2378 if (code_point > kMaxUnicode) { 2379 return code_point; 2380 } 2381 FullCasing title = Unicode::toTitle(static_cast<int32_t>(code_point)); 2382 return title.code_points[0]; 2383} 2384 2385PY_EXPORT Py_UCS4 Py_UNICODE_TOUPPER_Func(Py_UCS4 code_point) { 2386 if (code_point > kMaxUnicode) { 2387 return code_point; 2388 } 2389 FullCasing upper = Unicode::toUpper(static_cast<int32_t>(code_point)); 2390 return upper.code_points[0]; 2391} 2392 2393PY_EXPORT int _Py_normalize_encoding(const char* encoding, char* lower, 2394 size_t lower_len) { 2395 char* buffer = lower; 2396 const char* lower_end = &lower[lower_len - 1]; 2397 bool has_punct = false; 2398 for (char ch = *encoding; ch != '\0'; ch = *++encoding) { 2399 if (Py_ISALNUM(ch) || ch == '.') { 2400 if (has_punct && buffer != lower) { 2401 if (buffer == lower_end) { 2402 return 0; 2403 } 2404 *buffer++ = '_'; 2405 } 2406 has_punct = false; 2407 2408 if (buffer == lower_end) { 2409 return 0; 2410 } 2411 *buffer++ = Py_TOLOWER(ch); 2412 } else { 2413 has_punct = true; 2414 } 2415 } 2416 *buffer = '\0'; 2417 return 1; 2418} 2419 2420PY_EXPORT PyObject* _PyUnicode_AsUTF8String(PyObject* unicode, 2421 const char* errors) { 2422 DCHECK(unicode != nullptr, "unicode cannot be null"); 2423 Thread* thread = Thread::current(); 2424 HandleScope scope(thread); 2425 Runtime* runtime = thread->runtime(); 2426 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode))); 2427 if (!runtime->isInstanceOfStr(*obj)) { 2428 thread->raiseBadArgument(); 2429 return nullptr; 2430 } 2431 Str str(&scope, strUnderlying(*obj)); 2432 if (!strHasSurrogate(str)) { 2433 word length = str.length(); 2434 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); 2435 result.replaceFromWithStr(0, *str, length); 2436 return ApiHandle::newReference(runtime, result.becomeImmutable()); 2437 } 2438 Object errors_obj(&scope, symbolFromError(thread, errors)); 2439 Object tuple_obj(&scope, thread->invokeFunction2( 2440 ID(_codecs), ID(utf_8_encode), str, errors_obj)); 2441 if (tuple_obj.isError()) { 2442 return nullptr; 2443 } 2444 Tuple tuple(&scope, *tuple_obj); 2445 return ApiHandle::newReference(runtime, tuple.at(0)); 2446} 2447 2448PY_EXPORT wchar_t* _Py_DecodeUTF8_surrogateescape(const char* c_str, 2449 Py_ssize_t size, 2450 size_t* wlen) { 2451 DCHECK(c_str != nullptr, "c_str cannot be null"); 2452 wchar_t* wc_str = 2453 static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(wchar_t))); 2454 for (Py_ssize_t i = 0; i < size; i++) { 2455 char ch = c_str[i]; 2456 // TODO(T57811636): Support UTF-8 arguments on macOS. 2457 // We don't have UTF-8 decoding machinery that is decoupled from the 2458 // runtime 2459 if (ch & 0x80) { 2460 UNIMPLEMENTED("UTF-8 argument support unimplemented"); 2461 } 2462 wc_str[i] = static_cast<wchar_t>(ch); 2463 } 2464 wc_str[size] = '\0'; 2465 if (wlen != nullptr) { 2466 *wlen = size; 2467 } 2468 return wc_str; 2469} 2470 2471PY_EXPORT int _Py_DecodeUTF8Ex(const char* c_str, Py_ssize_t size, 2472 wchar_t** result, size_t* wlen, 2473 const char** /* reason */, 2474 _Py_error_handler /* surrogateescape */) { 2475 wchar_t* wc_str = 2476 static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(*wc_str))); 2477 if (wc_str == nullptr) { 2478 return -1; 2479 } 2480 for (Py_ssize_t i = 0; i < size; i++) { 2481 byte ch = c_str[i]; 2482 // TODO(T57811636): Support UTF-8 decoding decoupled from the runtime. 2483 // We don't have UTF-8 decoding machinery that is decoupled from the 2484 // runtime 2485 if (ch > kMaxASCII) { 2486 UNIMPLEMENTED("UTF-8 argument support unimplemented"); 2487 } 2488 wc_str[i] = ch; 2489 } 2490 wc_str[size] = '\0'; 2491 *result = wc_str; 2492 if (wlen) { 2493 *wlen = size; 2494 } 2495 return 0; 2496} 2497 2498// UTF-8 encoder using the surrogateescape error handler . 2499// 2500// On success, return 0 and write the newly allocated character string (use 2501// PyMem_Free() to free the memory) into *str. 2502// 2503// On encoding failure, return -2 and write the position of the invalid 2504// surrogate character into *error_pos (if error_pos is set) and the decoding 2505// error message into *reason (if reason is set). 2506// 2507// On memory allocation failure, return -1. 2508PY_EXPORT int _Py_EncodeUTF8Ex(const wchar_t* text, char** str, 2509 size_t* error_pos, const char** reason, 2510 int raw_malloc, _Py_error_handler errors) { 2511 const Py_ssize_t max_char_size = 4; 2512 Py_ssize_t len = std::wcslen(text); 2513 DCHECK(len >= 0, "len must be non-negative"); 2514 2515 bool surrogateescape = false; 2516 bool surrogatepass = false; 2517 switch (errors) { 2518 case _Py_ERROR_STRICT: 2519 break; 2520 case _Py_ERROR_SURROGATEESCAPE: 2521 surrogateescape = true; 2522 break; 2523 case _Py_ERROR_SURROGATEPASS: 2524 surrogatepass = true; 2525 break; 2526 default: 2527 return -3; 2528 } 2529 2530 if (len > PY_SSIZE_T_MAX / max_char_size - 1) { 2531 return -1; 2532 } 2533 char* bytes; 2534 if (raw_malloc) { 2535 bytes = reinterpret_cast<char*>(PyMem_RawMalloc((len + 1) * max_char_size)); 2536 } else { 2537 bytes = reinterpret_cast<char*>(PyMem_Malloc((len + 1) * max_char_size)); 2538 } 2539 if (bytes == nullptr) { 2540 return -1; 2541 } 2542 2543 char* p = bytes; 2544 for (Py_ssize_t i = 0; i < len; i++) { 2545 Py_UCS4 ch = text[i]; 2546 2547 if (ch < 0x80) { 2548 // Encode ASCII 2549 *p++ = (char)ch; 2550 2551 } else if (ch < 0x0800) { 2552 // Encode Latin-1 2553 *p++ = (char)(0xc0 | (ch >> 6)); 2554 *p++ = (char)(0x80 | (ch & 0x3f)); 2555 } else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) { 2556 // surrogateescape error handler 2557 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { 2558 if (error_pos != nullptr) { 2559 *error_pos = (size_t)i; 2560 } 2561 if (reason != nullptr) { 2562 *reason = "encoding error"; 2563 } 2564 if (raw_malloc) { 2565 PyMem_RawFree(bytes); 2566 } else { 2567 PyMem_Free(bytes); 2568 } 2569 return -2; 2570 } 2571 *p++ = (char)(ch & 0xff); 2572 } else if (ch < 0x10000) { 2573 *p++ = (char)(0xe0 | (ch >> 12)); 2574 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2575 *p++ = (char)(0x80 | (ch & 0x3f)); 2576 } else { 2577 // ch >= 0x10000 2578 DCHECK(ch <= kMaxUnicode, "ch must be a valid unicode code point"); 2579 // Encode UCS4 Unicode ordinals 2580 *p++ = (char)(0xf0 | (ch >> 18)); 2581 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2582 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2583 *p++ = (char)(0x80 | (ch & 0x3f)); 2584 } 2585 } 2586 *p++ = '\0'; 2587 2588 size_t final_size = (p - bytes); 2589 char* bytes2; 2590 if (raw_malloc) { 2591 bytes2 = reinterpret_cast<char*>(PyMem_RawRealloc(bytes, final_size)); 2592 } else { 2593 bytes2 = reinterpret_cast<char*>(PyMem_Realloc(bytes, final_size)); 2594 } 2595 if (bytes2 == nullptr) { 2596 if (error_pos != nullptr) { 2597 *error_pos = (size_t)-1; 2598 } 2599 if (raw_malloc) { 2600 PyMem_RawFree(bytes); 2601 } else { 2602 PyMem_Free(bytes); 2603 } 2604 return -1; 2605 } 2606 *str = bytes2; 2607 return 0; 2608} 2609 2610} // namespace py