this repo has no description
at trunk 1876 lines 61 kB view raw
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) 2#include "str-builtins.h" 3 4#include <cctype> 5#include <cwchar> 6 7#include "builtins.h" 8#include "formatter-utils.h" 9#include "formatter.h" 10#include "frame.h" 11#include "globals.h" 12#include "int-builtins.h" 13#include "objects.h" 14#include "runtime.h" 15#include "set-builtins.h" 16#include "slice-builtins.h" 17#include "thread.h" 18#include "tuple-builtins.h" 19#include "type-builtins.h" 20#include "unicode.h" 21#include "utils.h" 22 23namespace py { 24 25word adjustedStrIndex(const Str& str, word index) { 26 word len = str.length(); 27 if (index >= 0) { 28 return str.offsetByCodePoints(0, index); 29 } 30 if (-len < index) { 31 return Utils::maximum(0l, str.offsetByCodePoints(len, index)); 32 } 33 return 0; 34} 35 36RawObject dataArraySubstr(Thread* thread, const DataArray& data, word start, 37 word length) { 38 word data_len = data.length(); 39 DCHECK_BOUND(start, data_len); 40 DCHECK_BOUND(start, data_len - length); 41 // SmallStr result 42 if (length <= RawSmallStr::kMaxLength) { 43 byte buffer[RawSmallStr::kMaxLength]; 44 data.copyToStartAt(buffer, length, start); 45 return SmallStr::fromBytes(View<byte>(buffer, length)); 46 } 47 // LargeStr result 48 HandleScope scope(thread); 49 MutableBytes result(&scope, 50 thread->runtime()->newMutableBytesUninitialized(length)); 51 result.replaceFromWithStartAt(0, *data, length, start); 52 return result.becomeStr(); 53} 54 55RawObject newStrFromWideChar(Thread* thread, const wchar_t* wc_str) { 56 return newStrFromWideCharWithLength(thread, wc_str, std::wcslen(wc_str)); 57} 58 59RawObject newStrFromWideCharWithLength(Thread* thread, const wchar_t* wc_str, 60 word length) { 61 static_assert(sizeof(*wc_str) * kBitsPerByte == 32, 62 "only 32bit wchar_t supported."); 63 64 for (word i = 0; i < length; ++i) { 65 if (wc_str[i] < 0 || wc_str[i] > kMaxUnicode) { 66 return thread->raiseWithFmt(LayoutId::kValueError, 67 "character is not in range"); 68 } 69 } 70 71 return thread->runtime()->newStrFromUTF32( 72 View<int32_t>(reinterpret_cast<const int32_t*>(wc_str), length)); 73} 74 75void strCopyToWCStr(wchar_t* buf, size_t buf_length, const Str& str) { 76 uword wchar_index = 0; 77 for (word byte_index = 0, num_bytes = 0, byte_count = str.length(); 78 byte_index < byte_count && wchar_index < buf_length; 79 byte_index += num_bytes, wchar_index += 1) { 80 int32_t cp = Str::cast(*str).codePointAt(byte_index, &num_bytes); 81 static_assert(sizeof(*buf) == sizeof(cp), "Requires 32bit wchar_t"); 82 buf[wchar_index] = static_cast<wchar_t>(cp); 83 } 84 buf[wchar_index] = '\0'; 85} 86 87static word strCountCharFromTo(const Str& haystack, byte needle, word start, 88 word end) { 89 word result = 0; 90 if (haystack.isSmallStr()) { 91 RawSmallStr small = SmallStr::cast(*haystack); 92 for (word i = start; i < end; i++) { 93 if (small.byteAt(i) == needle) result++; 94 } 95 return result; 96 } 97 byte* ptr = reinterpret_cast<byte*>(LargeStr::cast(*haystack).address()); 98 for (word i = start; i < end; i++) { 99 if (ptr[i] == needle) result++; 100 } 101 return result; 102} 103 104RawObject strCount(const Str& haystack, const Str& needle, word start, 105 word end) { 106 if (end < 0 || start < 0) { 107 // N.B.: If end is negative we may be able to cheaply walk backward. We 108 // should avoid calling adjustSearchIndices here since the underlying 109 // container is not O(1) and replace it with something that preserves some 110 // of the signals that would be useful to lower the cost of the O(n) 111 // traversal. 112 // TODO(T41400083): Use a different search algorithm 113 Slice::adjustSearchIndices(&start, &end, haystack.codePointLength()); 114 } 115 116 word start_index = start == 0 ? 0 : haystack.offsetByCodePoints(0, start); 117 if (start_index == haystack.length() && needle.length() > 0) { 118 // Haystack is too small; fast early return 119 return SmallInt::fromWord(0); 120 } 121 122 word end_index = end == kMaxWord 123 ? haystack.length() 124 : haystack.offsetByCodePoints(start_index, end - start); 125 if ((end_index - start_index) < needle.length() || start_index > end_index) { 126 // Haystack is too small; fast early return 127 return SmallInt::fromWord(0); 128 } 129 130 if (needle.length() == 1) { 131 return SmallInt::fromWord(strCountCharFromTo( 132 haystack, SmallStr::cast(*needle).byteAt(0), start_index, end_index)); 133 } 134 135 // TODO(T41400083): Use a different search algorithm 136 return SmallInt::fromWord(strCountSubStrFromTo(haystack, needle, start_index, 137 end_index, haystack.length())); 138} 139 140word strCountSubStrFromTo(const Str& haystack, const Str& needle, word start, 141 word end, word max_count) { 142 DCHECK(max_count >= 0, "max_count must be non-negative"); 143 word needle_len = needle.length(); 144 word num_match = 0; 145 // Loop is in byte space, not code point space 146 for (word i = start; i <= end - needle_len && num_match < max_count;) { 147 if (strHasPrefix(haystack, needle, i)) { 148 i += needle_len; 149 num_match++; 150 continue; 151 } 152 i++; 153 } 154 return num_match; 155} 156 157word strCountSubStr(const Str& haystack, const Str& needle, word max_count) { 158 return strCountSubStrFromTo(haystack, needle, 0, haystack.length(), 159 max_count); 160} 161 162RawObject strEncodeASCII(Thread* thread, const Str& str) { 163 HandleScope scope(thread); 164 if (!str.isASCII()) { 165 return Unbound::object(); 166 } 167 if (str.isSmallStr()) { 168 return SmallStr::cast(*str).becomeBytes(); 169 } 170 word str_len = str.length(); 171 MutableBytes bytes(&scope, 172 thread->runtime()->newMutableBytesUninitialized(str_len)); 173 bytes.replaceFromWithStr(0, *str, str_len); 174 return bytes.becomeImmutable(); 175} 176 177RawObject strEscapeNonASCII(Thread* thread, const Str& str) { 178 if (str.isASCII()) return *str; 179 // Slow implementation using `codecs`. 180 HandleScope scope(thread); 181 Runtime* runtime = thread->runtime(); 182 Symbols* symbols = runtime->symbols(); 183 Object ascii(&scope, symbols->at(ID(ascii))); 184 Object backslashreplace(&scope, symbols->at(ID(backslashreplace))); 185 Bytes result_bytes( 186 &scope, thread->invokeMethod3(str, ID(encode), ascii, backslashreplace)); 187 word length = result_bytes.length(); 188 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); 189 result.replaceFromWithBytes(0, *result_bytes, length); 190 return result.becomeStr(); 191} 192 193RawObject strJoinWithTupleOrList(Thread* thread, const Str& sep, 194 const Object& iterable) { 195 Runtime* runtime = thread->runtime(); 196 HandleScope scope(thread); 197 Tuple tuple(&scope, runtime->emptyTuple()); 198 word length = 0; 199 if (iterable.isTuple()) { 200 tuple = *iterable; 201 length = tuple.length(); 202 } else if (iterable.isList()) { 203 tuple = List::cast(*iterable).items(); 204 length = List::cast(*iterable).numItems(); 205 } else { 206 // Slow path: collect items into list in Python and call again 207 return Unbound::object(); 208 } 209 Object elt(&scope, NoneType::object()); 210 for (word i = 0; i < length; i++) { 211 elt = tuple.at(i); 212 if (!runtime->isInstanceOfStr(*elt)) { 213 return thread->raiseWithFmt( 214 LayoutId::kTypeError, 215 "sequence item %w: expected str instance, %T found", i, &elt); 216 } 217 } 218 return runtime->strJoin(thread, sep, tuple, length); 219} 220 221word strSpan(const Str& src, const Str& str) { 222 word length = src.length(); 223 word str_length = str.length(); 224 word first = 0; 225 for (; first < length; first++) { 226 bool has_match = false; 227 byte ch = src.byteAt(first); 228 for (word j = 0; j < str_length; j++) { 229 if (ch == str.byteAt(j)) { 230 has_match = true; 231 break; 232 } 233 } 234 if (!has_match) { 235 break; 236 } 237 } 238 return first; 239} 240 241RawObject strSubstr(Thread* thread, const Str& str, word start, word length) { 242 word str_len = str.length(); 243 DCHECK_BOUND(start, str_len); 244 DCHECK_BOUND(length, str_len - start); 245 if (length == str_len) { 246 return *str; 247 } 248 // SmallStr result 249 if (length <= RawSmallStr::kMaxLength) { 250 byte buffer[RawSmallStr::kMaxLength]; 251 str.copyToStartAt(buffer, length, start); 252 return SmallStr::fromBytes(View<byte>(buffer, length)); 253 } 254 // LargeStr result 255 HandleScope scope(thread); 256 MutableBytes result(&scope, 257 thread->runtime()->newMutableBytesUninitialized(length)); 258 result.replaceFromWithStartAt(0, LargeStr::cast(*str), length, start); 259 return result.becomeStr(); 260} 261 262word strRSpan(const Str& src, const Str& str, word rend) { 263 DCHECK(rend >= 0, "string index underflow"); 264 word length = src.length(); 265 word str_length = str.length(); 266 word result = 0; 267 for (word i = length - 1; i >= rend; i--, result++) { 268 byte ch = src.byteAt(i); 269 bool has_match = false; 270 for (word j = 0; j < str_length; j++) { 271 if (ch == str.byteAt(j)) { 272 has_match = true; 273 break; 274 } 275 } 276 if (!has_match) { 277 break; 278 } 279 } 280 return result; 281} 282 283static bool isLineBreak(int32_t c) { 284 switch (c) { 285 // Common cases 286 case '\n': 287 case '\r': 288 // Less common cases 289 case '\f': 290 case '\v': 291 case 0x1c: 292 case 0x1d: 293 case 0x1e: 294 case 0x85: 295 case 0x2028: 296 case 0x2029: 297 return true; 298 default: 299 return false; 300 } 301} 302 303RawObject strSplit(Thread* thread, const Str& str, const Str& sep, 304 word maxsplit) { 305 Runtime* runtime = thread->runtime(); 306 HandleScope scope(thread); 307 word num_splits = strCountSubStr(str, sep, maxsplit); 308 word result_len = num_splits + 1; 309 MutableTuple result_items(&scope, runtime->newMutableTuple(result_len)); 310 word last_idx = 0; 311 word sep_len = sep.length(); 312 for (word i = 0, result_idx = 0; result_idx < num_splits;) { 313 if (strHasPrefix(str, sep, i)) { 314 result_items.atPut(result_idx++, 315 strSubstr(thread, str, last_idx, i - last_idx)); 316 i += sep_len; 317 last_idx = i; 318 } else { 319 i = str.offsetByCodePoints(i, 1); 320 } 321 } 322 result_items.atPut(num_splits, 323 strSubstr(thread, str, last_idx, str.length() - last_idx)); 324 List result(&scope, runtime->newList()); 325 result.setItems(*result_items); 326 result.setNumItems(result_len); 327 return *result; 328} 329 330RawObject strSplitlines(Thread* thread, const Str& str, bool keepends) { 331 HandleScope scope(thread); 332 Runtime* runtime = thread->runtime(); 333 List result(&scope, runtime->newList()); 334 // Looping over code points, not bytes, but i is a byte offset 335 for (word i = 0, j = 0; i < str.length(); j = i) { 336 // Skip newline chars 337 word num_bytes; 338 while (i < str.length() && !isLineBreak(str.codePointAt(i, &num_bytes))) { 339 i += num_bytes; 340 } 341 342 word eol_pos = i; 343 if (i < str.length()) { 344 int32_t cp = str.codePointAt(i, &num_bytes); 345 word next = i + num_bytes; 346 word next_num_bytes; 347 // Check for \r\n specifically 348 if (cp == '\r' && next < str.length() && 349 str.codePointAt(next, &next_num_bytes) == '\n') { 350 i += next_num_bytes; 351 } 352 i += num_bytes; 353 if (keepends) { 354 eol_pos = i; 355 } 356 } 357 358 // If there are no newlines, the str returned should be identity-equal 359 if (j == 0 && eol_pos == str.length() && str.isStr()) { 360 runtime->listAdd(thread, result, str); 361 return *result; 362 } 363 364 Str substr(&scope, strSubstr(thread, str, j, eol_pos - j)); 365 runtime->listAdd(thread, result, substr); 366 } 367 368 return *result; 369} 370 371RawObject strStripSpace(Thread* thread, const Str& src) { 372 word length = src.length(); 373 if (length == 0) { 374 return *src; 375 } 376 if (length == 1 && ASCII::isSpace(src.byteAt(0))) { 377 return Str::empty(); 378 } 379 word first = 0; 380 while (first < length) { 381 word num_bytes; 382 int32_t ch = src.codePointAt(first, &num_bytes); 383 if (!Unicode::isSpace(ch)) { 384 break; 385 } 386 first += num_bytes; 387 } 388 word last = length; 389 while (last > first) { 390 last = src.offsetByCodePoints(last, -1); 391 word num_bytes; 392 int32_t ch = src.codePointAt(last, &num_bytes); 393 if (!Unicode::isSpace(ch)) { 394 last += num_bytes; 395 break; 396 } 397 } 398 return strSubstr(thread, src, first, last - first); 399} 400 401RawObject strStripSpaceLeft(Thread* thread, const Str& src) { 402 word length = src.length(); 403 if (length == 0) { 404 return *src; 405 } 406 if (length == 1 && ASCII::isSpace(src.byteAt(0))) { 407 return Str::empty(); 408 } 409 word first = 0; 410 while (first < length) { 411 word num_bytes; 412 int32_t ch = src.codePointAt(first, &num_bytes); 413 if (!Unicode::isSpace(ch)) { 414 break; 415 } 416 first += num_bytes; 417 } 418 return strSubstr(thread, src, first, length - first); 419} 420 421RawObject strStripSpaceRight(Thread* thread, const Str& src) { 422 word length = src.length(); 423 if (length == 0) { 424 return *src; 425 } 426 if (length == 1 && ASCII::isSpace(src.byteAt(0))) { 427 return Str::empty(); 428 } 429 word last = length; 430 while (last > 0) { 431 last = src.offsetByCodePoints(last, -1); 432 word num_bytes; 433 int32_t ch = src.codePointAt(last, &num_bytes); 434 if (!Unicode::isSpace(ch)) { 435 last += num_bytes; 436 break; 437 } 438 } 439 return strSubstr(thread, src, 0, last); 440} 441 442RawObject strStrip(Thread* thread, const Str& src, const Str& str) { 443 word length = src.length(); 444 if (length == 0 || str.length() == 0) { 445 return *src; 446 } 447 word first = strSpan(src, str); 448 word last = strRSpan(src, str, first); 449 return strSubstr(thread, src, first, length - first - last); 450} 451 452RawObject strStripLeft(Thread* thread, const Str& src, const Str& str) { 453 word length = src.length(); 454 if (length == 0 || str.length() == 0) { 455 return *src; 456 } 457 word first = strSpan(src, str); 458 return strSubstr(thread, src, first, length - first); 459} 460 461RawObject strStripRight(Thread* thread, const Str& src, const Str& str) { 462 word length = src.length(); 463 if (length == 0 || str.length() == 0) { 464 return *src; 465 } 466 word first = 0; 467 word last = strRSpan(src, str, first); 468 return strSubstr(thread, src, 0, length - last); 469} 470 471RawObject strTranslateASCII(Thread* thread, const Str& src, const Str& table) { 472 if (table.length() > kMaxASCII || !table.isASCII()) { 473 return Unbound::object(); 474 } 475 word src_len = src.length(); 476 word table_len = table.length(); 477 HandleScope scope(thread); 478 MutableBytes result(&scope, 479 thread->runtime()->newMutableBytesUninitialized(src_len)); 480 // Since all non-ASCII bytes in UTF-8 have a 1 in front, we can iterate by 481 // bytes instead of codepoints 482 for (word i = 0; i < src.length(); i++) { 483 byte to_translate = src.byteAt(i); 484 if (to_translate > table_len) { 485 result.byteAtPut(i, to_translate); 486 } else { 487 result.byteAtPut(i, table.byteAt(to_translate)); 488 } 489 } 490 return result.becomeStr(); 491} 492 493RawObject strIteratorNext(Thread* thread, const StrIterator& iter) { 494 HandleScope scope(thread); 495 word byte_offset = iter.index(); 496 Str underlying(&scope, iter.iterable()); 497 if (byte_offset >= underlying.length()) { 498 return Error::noMoreItems(); 499 } 500 word num_bytes = 0; 501 word code_point = underlying.codePointAt(byte_offset, &num_bytes); 502 iter.setIndex(byte_offset + num_bytes); 503 return RawSmallStr::fromCodePoint(code_point); 504} 505 506static const BuiltinAttribute kUserStrBaseAttributes[] = { 507 {ID(_UserStr__value), RawUserStrBase::kValueOffset, 508 AttributeFlags::kHidden}, 509}; 510 511static const BuiltinAttribute kStrIteratorAttributes[] = { 512 {ID(_str_iterator__iterable), RawStrIterator::kIterableOffset, 513 AttributeFlags::kHidden}, 514 {ID(_str_iterator__index), RawStrIterator::kIndexOffset, 515 AttributeFlags::kHidden}, 516}; 517 518void initializeStrTypes(Thread* thread) { 519 HandleScope scope(thread); 520 Runtime* runtime = thread->runtime(); 521 522 Type str(&scope, addBuiltinType(thread, ID(str), LayoutId::kStr, 523 /*superclass_id=*/LayoutId::kObject, 524 kUserStrBaseAttributes, UserStrBase::kSize, 525 /*basetype=*/true)); 526 527 { 528 Type type(&scope, 529 addImmediateBuiltinType(thread, ID(largestr), LayoutId::kLargeStr, 530 /*builtin_base=*/LayoutId::kStr, 531 /*superclass_id=*/LayoutId::kObject, 532 /*basetype=*/false)); 533 Layout::cast(type.instanceLayout()).setDescribedType(*str); 534 runtime->setLargeStrType(type); 535 } 536 537 { 538 Type type(&scope, 539 addImmediateBuiltinType(thread, ID(smallstr), LayoutId::kSmallStr, 540 /*builtin_base=*/LayoutId::kStr, 541 /*superclass_id=*/LayoutId::kObject, 542 /*basetype=*/false)); 543 Layout::cast(type.instanceLayout()).setDescribedType(*str); 544 runtime->setSmallStrType(type); 545 } 546 547 addBuiltinType(thread, ID(str_iterator), LayoutId::kStrIterator, 548 /*superclass_id=*/LayoutId::kObject, kStrIteratorAttributes, 549 StrIterator::kSize, /*basetype=*/false); 550} 551 552RawObject METH(str, __add__)(Thread* thread, Arguments args) { 553 Runtime* runtime = thread->runtime(); 554 HandleScope scope(thread); 555 Object self_obj(&scope, args.get(0)); 556 Object other_obj(&scope, args.get(1)); 557 if (!runtime->isInstanceOfStr(*self_obj)) { 558 return thread->raiseRequiresType(self_obj, ID(str)); 559 } 560 if (!runtime->isInstanceOfStr(*other_obj)) { 561 return NotImplementedType::object(); 562 } 563 Str self(&scope, strUnderlying(*self_obj)); 564 Str other(&scope, strUnderlying(*other_obj)); 565 return runtime->strConcat(thread, self, other); 566} 567 568RawObject METH(str, __bool__)(Thread* thread, Arguments args) { 569 HandleScope scope(thread); 570 Object self_obj(&scope, args.get(0)); 571 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 572 return thread->raiseRequiresType(self_obj, ID(str)); 573 } 574 Str self(&scope, strUnderlying(*self_obj)); 575 return Bool::fromBool(*self != Str::empty()); 576} 577 578RawObject METH(str, __contains__)(Thread* thread, Arguments args) { 579 RawObject self = args.get(0); 580 RawObject other = args.get(1); 581 Runtime* runtime = thread->runtime(); 582 bool is_self_invalid = true; 583 if (runtime->isInstanceOfStr(self)) { 584 is_self_invalid = false; 585 if (runtime->isInstanceOfStr(other)) { 586 return Bool::fromBool(strUnderlying(self).includes(strUnderlying(other))); 587 } 588 } 589 HandleScope scope(thread); 590 Object invalid(&scope, is_self_invalid ? self : other); 591 return thread->raiseRequiresType(invalid, ID(str)); 592} 593 594RawObject METH(str, __eq__)(Thread* thread, Arguments args) { 595 HandleScope scope(thread); 596 Object self_obj(&scope, args.get(0)); 597 Object other_obj(&scope, args.get(1)); 598 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 599 return thread->raiseRequiresType(self_obj, ID(str)); 600 } 601 if (!thread->runtime()->isInstanceOfStr(*other_obj)) { 602 return NotImplementedType::object(); 603 } 604 Str self(&scope, strUnderlying(*self_obj)); 605 Str other(&scope, strUnderlying(*other_obj)); 606 return Bool::fromBool(self.equals(*other)); 607} 608 609RawObject METH(str, __format__)(Thread* thread, Arguments args) { 610 HandleScope scope(thread); 611 Object self_obj(&scope, args.get(0)); 612 Runtime* runtime = thread->runtime(); 613 if (!runtime->isInstanceOfStr(*self_obj)) { 614 return thread->raiseRequiresType(self_obj, ID(str)); 615 } 616 Str self(&scope, strUnderlying(*self_obj)); 617 618 Object spec_obj(&scope, args.get(1)); 619 if (!runtime->isInstanceOfStr(*spec_obj)) { 620 return thread->raiseWithFmt(LayoutId::kTypeError, 621 "__format__() argument 1 must be str, not %T", 622 &spec_obj); 623 } 624 Str spec(&scope, strUnderlying(*spec_obj)); 625 626 if (spec.length() == 0) { 627 return *self; 628 } 629 630 FormatSpec format; 631 Object possible_error(&scope, 632 parseFormatSpec(thread, spec, 633 /*default_type=*/'s', 634 /*default_align=*/'<', &format)); 635 if (!possible_error.isNoneType()) { 636 DCHECK(possible_error.isErrorException(), "expected exception"); 637 return *possible_error; 638 } 639 if (format.type != 's') { 640 return raiseUnknownFormatError(thread, format.type, self_obj); 641 } 642 if (format.positive_sign != '\0') { 643 return thread->raiseWithFmt(LayoutId::kValueError, 644 "Sign not allowed in string format specifier"); 645 } 646 if (format.alternate) { 647 return thread->raiseWithFmt( 648 LayoutId::kValueError, 649 "Alternate form (#) not allowed in string format specifier"); 650 } 651 if (format.alignment == '=') { 652 return thread->raiseWithFmt( 653 LayoutId::kValueError, 654 "'=' alignment not allowed in string format specifier"); 655 } 656 657 return formatStr(thread, self, &format); 658} 659 660RawObject METH(str, __ge__)(Thread* thread, Arguments args) { 661 HandleScope scope(thread); 662 Object self_obj(&scope, args.get(0)); 663 Object other_obj(&scope, args.get(1)); 664 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 665 return thread->raiseRequiresType(self_obj, ID(str)); 666 } 667 if (!thread->runtime()->isInstanceOfStr(*other_obj)) { 668 return NotImplementedType::object(); 669 } 670 Str self(&scope, strUnderlying(*self_obj)); 671 Str other(&scope, strUnderlying(*other_obj)); 672 return Bool::fromBool(self.compare(*other) >= 0); 673} 674 675bool METH(str, __getitem___intrinsic)(Thread* thread) { 676 RawObject arg0 = thread->stackPeek(1); 677 if (!arg0.isStr()) { 678 return false; 679 } 680 RawObject arg1 = thread->stackPeek(0); 681 word idx; 682 if (arg1.isSmallInt()) { 683 idx = RawSmallInt::cast(arg1).value(); 684 } else if (arg1.isBool()) { 685 idx = RawBool::cast(arg1).value(); 686 } else { 687 word start, stop; 688 if (!tryUnpackSlice(arg1, &start, &stop)) { 689 return false; 690 } 691 { 692 // Manually adjust slice bounds to avoid an extra call to codePointLength 693 HandleScope scope(thread); 694 Str self(&scope, arg0); 695 word start_index = adjustedStrIndex(self, start); 696 word stop_index = adjustedStrIndex(self, stop); 697 word length = stop_index - start_index; 698 699 thread->stackDrop(2); 700 thread->stackSetTop(length <= 0 701 ? Str::empty() 702 : strSubstr(thread, self, start_index, length)); 703 } 704 return true; 705 } 706 RawStr self = Str::cast(arg0); 707 word len = self.length(); 708 if (0 <= idx && idx < len) { 709 word offset = self.offsetByCodePoints(0, idx); 710 if (offset < len) { 711 word ignored; 712 thread->stackDrop(2); 713 thread->stackSetTop( 714 RawSmallStr::fromCodePoint(self.codePointAt(offset, &ignored))); 715 return true; 716 } 717 } 718 if (0 > idx) { 719 word offset = self.offsetByCodePoints(len, idx); 720 if (offset < len && offset != -1) { 721 word ignored; 722 thread->stackDrop(2); 723 thread->stackSetTop( 724 RawSmallStr::fromCodePoint(self.codePointAt(offset, &ignored))); 725 return true; 726 } 727 } 728 return false; 729} 730 731RawObject METH(str, __gt__)(Thread* thread, Arguments args) { 732 HandleScope scope(thread); 733 Object self_obj(&scope, args.get(0)); 734 Object other_obj(&scope, args.get(1)); 735 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 736 return thread->raiseRequiresType(self_obj, ID(str)); 737 } 738 if (!thread->runtime()->isInstanceOfStr(*other_obj)) { 739 return NotImplementedType::object(); 740 } 741 Str self(&scope, strUnderlying(*self_obj)); 742 Str other(&scope, strUnderlying(*other_obj)); 743 return Bool::fromBool(self.compare(*other) > 0); 744} 745 746RawObject METH(str, __hash__)(Thread* thread, Arguments args) { 747 HandleScope scope(thread); 748 Object self(&scope, args.get(0)); 749 Runtime* runtime = thread->runtime(); 750 if (!runtime->isInstanceOfStr(*self)) { 751 return thread->raiseRequiresType(self, ID(str)); 752 } 753 Str self_str(&scope, strUnderlying(*self)); 754 return SmallInt::fromWord(strHash(thread, *self_str)); 755} 756 757void strInternInTuple(Thread* thread, const Object& items) { 758 HandleScope scope(thread); 759 Runtime* runtime = thread->runtime(); 760 DCHECK(runtime->isInstanceOfTuple(*items), "items must be a tuple instance"); 761 Tuple tuple(&scope, tupleUnderlying(*items)); 762 Str str(&scope, Str::empty()); 763 Object result(&scope, NoneType::object()); 764 for (word i = 0; i < tuple.length(); i++) { 765 str = tuple.at(i); 766 result = Runtime::internStr(thread, str); 767 if (result.isError()) continue; 768 if (result != str) { 769 tuple.atPut(i, *result); 770 } 771 } 772} 773 774static bool allNameChars(const Str& str) { 775 for (word i = 0; i < str.length(); i++) { 776 byte c = str.byteAt(i); 777 if (!ASCII::isAlnum(c) && c != '_') { 778 return false; 779 } 780 } 781 return true; 782} 783 784static bool strInternConstant(Thread* thread, Object* object) { 785 if (object->isStr()) { 786 HandleScope scope(thread); 787 Str str(&scope, **object); 788 if (allNameChars(str)) { 789 str = Runtime::internStr(thread, str); 790 if (str != *object) { 791 *object = *str; 792 return true; 793 } 794 } 795 return false; 796 } 797 if (object->isTuple()) { 798 HandleScope scope(thread); 799 Tuple tuple(&scope, **object); 800 Object item(&scope, NoneType::object()); 801 bool modified = false; 802 for (word i = 0, length = tuple.length(); i < length; i++) { 803 item = tuple.at(i); 804 if (strInternConstant(thread, &item)) { 805 modified = true; 806 tuple.atPut(i, *item); 807 } 808 } 809 if (!modified) return false; 810 *object = *tuple; 811 return true; 812 } 813 if (object->isFrozenSet()) { 814 HandleScope scope(thread); 815 FrozenSet set(&scope, **object); 816 FrozenSet new_set(&scope, thread->runtime()->newFrozenSet()); 817 Object value(&scope, NoneType::object()); 818 Object hash(&scope, NoneType::object()); 819 bool modified = false; 820 for (word idx = 0; setNextItem(set, &idx, &value);) { 821 if (strInternConstant(thread, &value)) { 822 modified = true; 823 } 824 hash = Interpreter::hash(thread, value); 825 if (hash.isErrorException()) return false; 826 setAdd(thread, new_set, value, SmallInt::cast(*hash).value()); 827 } 828 if (!modified) return false; 829 *object = *new_set; 830 return true; 831 } 832 return false; 833} 834 835bool strInternConstants(Thread* thread, const Object& items) { 836 HandleScope scope(thread); 837 DCHECK(thread->runtime()->isInstanceOfTuple(*items), 838 "items must be a tuple instance"); 839 Object tuple(&scope, tupleUnderlying(*items)); 840 return strInternConstant(thread, &tuple); 841} 842 843word strFind(const Str& haystack, const Str& needle) { 844 word haystack_len = haystack.length(); 845 word needle_len = needle.length(); 846 if (needle_len > haystack_len) { 847 return -1; 848 } 849 if (needle_len == 0) { 850 return 0; 851 } 852 if (needle_len == 1 && haystack.isASCII()) { 853 return strFindAsciiChar(haystack, needle.byteAt(0)); 854 } 855 // Loop is in byte space, not code point space 856 word result = 0; 857 // TODO(T41400083): Use a different search algorithm 858 for (word i = 0; i <= haystack_len - needle_len; result++) { 859 if (strHasPrefix(haystack, needle, i)) { 860 return result; 861 } 862 i = haystack.offsetByCodePoints(i, 1); 863 } 864 return -1; 865} 866 867word strFindWithRange(const Str& haystack, const Str& needle, word start, 868 word end) { 869 if (end < 0 || start < 0) { 870 Slice::adjustSearchIndices(&start, &end, haystack.codePointLength()); 871 } 872 873 word start_index = haystack.offsetByCodePoints(0, start); 874 if (start_index == haystack.length() && needle.length() > 0) { 875 // Haystack is too small; fast early return 876 return -1; 877 } 878 word end_index = haystack.offsetByCodePoints(start_index, end - start); 879 880 if ((end_index - start_index) < needle.length() || start_index > end_index) { 881 // Haystack is too small; fast early return 882 return -1; 883 } 884 885 // Loop is in byte space, not code point space 886 word result = start; 887 // TODO(T41400083): Use a different search algorithm 888 for (word i = start_index; i <= end_index - needle.length(); result++) { 889 bool has_match = strHasPrefix(haystack, needle, i); 890 word next = haystack.offsetByCodePoints(i, 1); 891 if (i == next) { 892 // We've reached a fixpoint; offsetByCodePoints will not advance past the 893 // length of the string. 894 if (start_index >= i) { 895 // The start is greater than the length of the string. 896 return -1; 897 } 898 // If the start is within bounds, just return the last found index. 899 break; 900 } 901 if (has_match) { 902 return result; 903 } 904 i = next; 905 } 906 return -1; 907} 908 909word strFindAsciiChar(const Str& haystack, byte needle) { 910 DCHECK(needle <= kMaxASCII, "must only be called for ASCII `needle`"); 911 for (word i = 0, length = haystack.length(); i < length; i++) { 912 if (haystack.byteAt(i) == needle) { 913 return i; 914 } 915 } 916 return -1; 917} 918 919word strFindFirstNonWhitespace(const Str& str) { 920 word i = 0; 921 for (word codepoint_len, length = str.length(); i < length; 922 i += codepoint_len) { 923 if (!Unicode::isSpace(str.codePointAt(i, &codepoint_len))) return i; 924 } 925 return i; 926} 927 928bool strHasPrefix(const Str& str, const Str& prefix, word start) { 929 word str_len = str.length(); 930 word prefix_len = prefix.length(); 931 if (str_len - start < prefix_len) { 932 return false; 933 } 934 for (word i = 0; i < prefix_len; i++) { 935 if (str.byteAt(start + i) != prefix.byteAt(i)) { 936 return false; 937 } 938 } 939 return true; 940} 941 942bool strHasSurrogate(const Str& str) { 943 return str.includesByte(UTF8::kSurrogateLeadByte); 944} 945 946word strRFind(const Str& haystack, const Str& needle, word start, word end) { 947 // Haystack slice is empty; fast early return 948 if (start > end) return -1; 949 // Needle is empty 950 if (needle == Str::empty()) return end; 951 word start_index = haystack.offsetByCodePoints(0, start); 952 if (start_index == haystack.length()) { 953 // Haystack is too small; fast early return 954 return -1; 955 } 956 word end_index = haystack.offsetByCodePoints(start_index, end - start); 957 if ((end_index - start_index) < needle.length() || start_index > end_index) { 958 // Haystack is too small; fast early return 959 return -1; 960 } 961 // Loop is in byte space, not code point space 962 // Invariant: cp_offset and byte_offset describe the same offset into the 963 // string, but one is in code point space and the other is in byte space 964 // TODO(T41400083): Use a different search algorithm 965 for (word cp_offset = end - 1, 966 byte_offset = haystack.offsetByCodePoints(end_index, -1); 967 byte_offset >= 0; cp_offset--, 968 byte_offset = haystack.offsetByCodePoints(byte_offset, -1)) { 969 if (strHasPrefix(haystack, needle, byte_offset)) { 970 return cp_offset; 971 } 972 } 973 return -1; 974} 975 976word strRFindAsciiChar(const Str& haystack, byte needle) { 977 DCHECK(needle <= kMaxASCII, "must only be called for ASCII `needle`"); 978 for (word i = haystack.length() - 1; i >= 0; i--) { 979 if (haystack.byteAt(i) == needle) { 980 return i; 981 } 982 } 983 return -1; 984} 985 986RawObject METH(str, __le__)(Thread* thread, Arguments args) { 987 HandleScope scope(thread); 988 Object self_obj(&scope, args.get(0)); 989 Object other_obj(&scope, args.get(1)); 990 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 991 return thread->raiseRequiresType(self_obj, ID(str)); 992 } 993 if (!thread->runtime()->isInstanceOfStr(*other_obj)) { 994 return NotImplementedType::object(); 995 } 996 Str self(&scope, strUnderlying(*self_obj)); 997 Str other(&scope, strUnderlying(*other_obj)); 998 return Bool::fromBool(self.compare(*other) <= 0); 999} 1000 1001RawObject METH(str, __len__)(Thread* thread, Arguments args) { 1002 HandleScope scope(thread); 1003 Object self_obj(&scope, args.get(0)); 1004 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1005 return thread->raiseRequiresType(self_obj, ID(str)); 1006 } 1007 Str self(&scope, strUnderlying(*self_obj)); 1008 return SmallInt::fromWord(self.codePointLength()); 1009} 1010 1011static RawObject strLowerASCII(Thread* thread, Object& str_obj, Str& str, 1012 word length) { 1013 if (str.isSmallStr()) { 1014 byte buf[SmallStr::kMaxLength]; 1015 for (word i = 0; i < length; i++) { 1016 buf[i] = ASCII::toLower(str.byteAt(i)); 1017 } 1018 return SmallStr::fromBytes({buf, length}); 1019 } 1020 // Search for the first uppercase character. 1021 word first_uppercase = 0; 1022 for (; first_uppercase < length; first_uppercase++) { 1023 if (ASCII::isUpper(str.byteAt(first_uppercase))) { 1024 break; 1025 } 1026 } 1027 if (first_uppercase >= length && str_obj.isStr()) { 1028 return *str_obj; 1029 } 1030 1031 HandleScope scope(thread); 1032 Runtime* runtime = thread->runtime(); 1033 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); 1034 result.replaceFromWithStr(0, *str, first_uppercase); 1035 for (word i = first_uppercase; i < length; i++) { 1036 byte lower = ASCII::toLower(str.byteAt(i)); 1037 result.byteAtPut(i, lower); 1038 } 1039 return result.becomeStr(); 1040} 1041 1042RawObject METH(str, casefold)(Thread* thread, Arguments args) { 1043 HandleScope scope(thread); 1044 Object self_obj(&scope, args.get(0)); 1045 Runtime* runtime = thread->runtime(); 1046 if (!runtime->isInstanceOfStr(*self_obj)) { 1047 return thread->raiseRequiresType(self_obj, ID(str)); 1048 } 1049 Str self(&scope, strUnderlying(*self_obj)); 1050 word length = self.length(); 1051 if (self.isASCII()) { 1052 return strLowerASCII(thread, self_obj, self, length); 1053 } 1054 1055 // Search for the first uppercase character. 1056 word first_uppercase = 0; 1057 for (word char_length; first_uppercase < length; 1058 first_uppercase += char_length) { 1059 int32_t code_point = self.codePointAt(first_uppercase, &char_length); 1060 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point) || 1061 Unicode::isUnfolded(code_point)) { 1062 break; 1063 } 1064 } 1065 if (first_uppercase >= length && self_obj.isStr()) { 1066 return *self_obj; 1067 } 1068 1069 StrArray result(&scope, runtime->newStrArray()); 1070 runtime->strArrayEnsureCapacity(thread, result, length); 1071 // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write 1072 // directly to the underlying MutableBytes. 1073 MutableBytes result_bytes(&scope, result.items()); 1074 result_bytes.replaceFromWithStr(0, *self, first_uppercase); 1075 result.setNumItems(first_uppercase); 1076 for (word char_length, i = first_uppercase; i < length; i += char_length) { 1077 struct FullCasing casefold = 1078 Unicode::toFolded(self.codePointAt(i, &char_length)); 1079 for (word j = 0; j < 3; j++) { 1080 int32_t decoded = casefold.code_points[j]; 1081 if (decoded == -1) break; 1082 runtime->strArrayAddCodePoint(thread, result, decoded); 1083 } 1084 } 1085 return runtime->strFromStrArray(result); 1086} 1087 1088RawObject METH(str, lower)(Thread* thread, Arguments args) { 1089 HandleScope scope(thread); 1090 Object self_obj(&scope, args.get(0)); 1091 Runtime* runtime = thread->runtime(); 1092 if (!runtime->isInstanceOfStr(*self_obj)) { 1093 return thread->raiseRequiresType(self_obj, ID(str)); 1094 } 1095 Str self(&scope, strUnderlying(*self_obj)); 1096 word length = self.length(); 1097 if (self.isASCII()) { 1098 return strLowerASCII(thread, self_obj, self, length); 1099 } 1100 1101 // Search for the first uppercase character. 1102 word first_uppercase = 0; 1103 for (word char_length; first_uppercase < length; 1104 first_uppercase += char_length) { 1105 int32_t code_point = self.codePointAt(first_uppercase, &char_length); 1106 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) { 1107 break; 1108 } 1109 } 1110 if (first_uppercase >= length && self_obj.isStr()) { 1111 return *self_obj; 1112 } 1113 1114 StrArray result(&scope, runtime->newStrArray()); 1115 runtime->strArrayEnsureCapacity(thread, result, length); 1116 // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write 1117 // directly to the underlying MutableBytes. 1118 MutableBytes result_bytes(&scope, result.items()); 1119 result_bytes.replaceFromWithStr(0, *self, first_uppercase); 1120 result.setNumItems(first_uppercase); 1121 for (word char_length, i = first_uppercase; i < length; i += char_length) { 1122 struct FullCasing lower = 1123 Unicode::toLower(self.codePointAt(i, &char_length)); 1124 for (word j = 0; j < 3; j++) { 1125 int32_t decoded = lower.code_points[j]; 1126 if (decoded == -1) break; 1127 runtime->strArrayAddCodePoint(thread, result, decoded); 1128 } 1129 } 1130 return runtime->strFromStrArray(result); 1131} 1132 1133RawObject METH(str, title)(Thread* thread, Arguments args) { 1134 Runtime* runtime = thread->runtime(); 1135 HandleScope scope(thread); 1136 Object self_obj(&scope, args.get(0)); 1137 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1138 return thread->raiseRequiresType(self_obj, ID(str)); 1139 } 1140 1141 Str self(&scope, strUnderlying(*self_obj)); 1142 word char_length = self.length(); 1143 1144 bool previous_is_cased = false; 1145 StrArray result(&scope, runtime->newStrArray()); 1146 for (word i = 0, len; i < char_length; i += len) { 1147 int32_t code_point = self.codePointAt(i, &len); 1148 1149 FullCasing mapped = previous_is_cased ? Unicode::toLower(code_point) 1150 : Unicode::toTitle(code_point); 1151 for (word j = 0; mapped.code_points[j] != -1; j++) { 1152 runtime->strArrayAddCodePoint(thread, result, mapped.code_points[j]); 1153 } 1154 1155 previous_is_cased = Unicode::isCased(code_point); 1156 } 1157 1158 return runtime->strFromStrArray(result); 1159} 1160 1161RawObject METH(str, upper)(Thread* thread, Arguments args) { 1162 HandleScope scope(thread); 1163 Object self_obj(&scope, args.get(0)); 1164 Runtime* runtime = thread->runtime(); 1165 if (!runtime->isInstanceOfStr(*self_obj)) { 1166 return thread->raiseRequiresType(self_obj, ID(str)); 1167 } 1168 Str self(&scope, strUnderlying(*self_obj)); 1169 word length = self.length(); 1170 if (self.isASCII()) { 1171 if (self.isSmallStr()) { 1172 byte buf[SmallStr::kMaxLength]; 1173 for (word i = 0; i < length; i++) { 1174 buf[i] = ASCII::toUpper(self.byteAt(i)); 1175 } 1176 return SmallStr::fromBytes({buf, length}); 1177 } 1178 // Search for the first lowercase character. 1179 word first_lowercase = 0; 1180 for (; first_lowercase < length; first_lowercase++) { 1181 if (ASCII::isLower(self.byteAt(first_lowercase))) { 1182 break; 1183 } 1184 } 1185 if (first_lowercase >= length && self_obj.isStr()) { 1186 return *self_obj; 1187 } 1188 1189 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); 1190 result.replaceFromWithStr(0, *self, first_lowercase); 1191 for (word i = first_lowercase; i < length; i++) { 1192 byte lower = ASCII::toUpper(self.byteAt(i)); 1193 result.byteAtPut(i, lower); 1194 } 1195 return result.becomeStr(); 1196 } 1197 1198 // Search for the first lowercase character. 1199 word first_lowercase = 0; 1200 for (word char_length; first_lowercase < length; 1201 first_lowercase += char_length) { 1202 int32_t code_point = self.codePointAt(first_lowercase, &char_length); 1203 if (Unicode::isLower(code_point) || Unicode::isTitle(code_point)) { 1204 break; 1205 } 1206 } 1207 if (first_lowercase >= length && self_obj.isStr()) { 1208 return *self_obj; 1209 } 1210 1211 StrArray result(&scope, runtime->newStrArray()); 1212 runtime->strArrayEnsureCapacity(thread, result, length); 1213 // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write 1214 // directly to the underlying MutableBytes. 1215 MutableBytes result_bytes(&scope, result.items()); 1216 result_bytes.replaceFromWithStr(0, *self, first_lowercase); 1217 result.setNumItems(first_lowercase); 1218 for (word char_length, i = first_lowercase; i < length; i += char_length) { 1219 struct FullCasing upper = 1220 Unicode::toUpper(self.codePointAt(i, &char_length)); 1221 for (word j = 0; j < 3; j++) { 1222 int32_t decoded = upper.code_points[j]; 1223 if (decoded == -1) break; 1224 runtime->strArrayAddCodePoint(thread, result, decoded); 1225 } 1226 } 1227 return runtime->strFromStrArray(result); 1228} 1229 1230RawObject METH(str, __lt__)(Thread* thread, Arguments args) { 1231 HandleScope scope(thread); 1232 Object self_obj(&scope, args.get(0)); 1233 Object other_obj(&scope, args.get(1)); 1234 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1235 return thread->raiseRequiresType(self_obj, ID(str)); 1236 } 1237 if (!thread->runtime()->isInstanceOfStr(*other_obj)) { 1238 return NotImplementedType::object(); 1239 } 1240 Str self(&scope, strUnderlying(*self_obj)); 1241 Str other(&scope, strUnderlying(*other_obj)); 1242 return Bool::fromBool(self.compare(*other) < 0); 1243} 1244 1245RawObject METH(str, __mul__)(Thread* thread, Arguments args) { 1246 Runtime* runtime = thread->runtime(); 1247 HandleScope scope(thread); 1248 Object self_obj(&scope, args.get(0)); 1249 if (!runtime->isInstanceOfStr(*self_obj)) { 1250 return thread->raiseRequiresType(self_obj, ID(str)); 1251 } 1252 Object count_index(&scope, args.get(1)); 1253 Object count_obj(&scope, intFromIndex(thread, count_index)); 1254 if (count_obj.isError()) return *count_obj; 1255 word count = intUnderlying(*count_obj).asWordSaturated(); 1256 if (!SmallInt::isValid(count)) { 1257 return thread->raiseWithFmt(LayoutId::kOverflowError, 1258 "cannot fit '%T' into an index-sized integer", 1259 &count_obj); 1260 } 1261 Str self(&scope, *self_obj); 1262 word length = self.length(); 1263 if (count <= 0 || length == 0) { 1264 return Str::empty(); 1265 } 1266 word new_length; 1267 if (__builtin_mul_overflow(length, count, &new_length) || 1268 !SmallInt::isValid(new_length)) { 1269 return thread->raiseWithFmt(LayoutId::kOverflowError, 1270 "repeated string is too long"); 1271 } 1272 return runtime->strRepeat(thread, self, count); 1273} 1274 1275RawObject METH(str, __ne__)(Thread* thread, Arguments args) { 1276 HandleScope scope(thread); 1277 Object self_obj(&scope, args.get(0)); 1278 Object other_obj(&scope, args.get(1)); 1279 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1280 return thread->raiseRequiresType(self_obj, ID(str)); 1281 } 1282 if (!thread->runtime()->isInstanceOfStr(*other_obj)) { 1283 return NotImplementedType::object(); 1284 } 1285 Str self(&scope, strUnderlying(*self_obj)); 1286 Str other(&scope, strUnderlying(*other_obj)); 1287 return Bool::fromBool(!self.equals(*other)); 1288} 1289 1290RawObject METH(str, __iter__)(Thread* thread, Arguments args) { 1291 HandleScope scope(thread); 1292 Object self_obj(&scope, args.get(0)); 1293 Runtime* runtime = thread->runtime(); 1294 if (!runtime->isInstanceOfStr(*self_obj)) { 1295 return thread->raiseRequiresType(self_obj, ID(str)); 1296 } 1297 Str self(&scope, strUnderlying(*self_obj)); 1298 return runtime->newStrIterator(self); 1299} 1300 1301RawObject METH(str, __repr__)(Thread* thread, Arguments args) { 1302 Runtime* runtime = thread->runtime(); 1303 HandleScope scope(thread); 1304 Object self_obj(&scope, args.get(0)); 1305 if (!runtime->isInstanceOfStr(*self_obj)) { 1306 return thread->raiseRequiresType(self_obj, ID(str)); 1307 } 1308 Str self(&scope, strUnderlying(*self_obj)); 1309 const word self_len = self.length(); 1310 word result_len = 0; 1311 word squote = 0; 1312 word dquote = 0; 1313 // Precompute the size so that only one allocation is necessary. 1314 for (word i = 0, char_len; i < self_len; i += char_len) { 1315 int32_t code_point = self.codePointAt(i, &char_len); 1316 if (code_point == '\'') { 1317 squote++; 1318 result_len += 1; 1319 } else if (code_point == '"') { 1320 dquote++; 1321 result_len += 1; 1322 } else if (code_point == '\\' || code_point == '\t' || code_point == '\r' || 1323 code_point == '\n') { 1324 result_len += 2; 1325 } else if (Unicode::isPrintable(code_point)) { 1326 result_len += char_len; 1327 } else if (code_point < 0x100) { 1328 result_len += 4; 1329 } else if (code_point < 0x10000) { 1330 result_len += 6; 1331 } else { 1332 result_len += 10; 1333 } 1334 } 1335 1336 byte quote = '\''; 1337 bool unchanged = (result_len == self_len); 1338 if (squote > 0) { 1339 unchanged = false; 1340 // If there are both single quotes and double quotes, the outer quote will 1341 // be singles, and all internal quotes will need to be escaped. 1342 if (dquote > 0) { 1343 // Add the size of the escape backslashes on the single quotes. 1344 result_len += squote; 1345 } else { 1346 quote = '"'; 1347 } 1348 } 1349 result_len += 2; // quotes 1350 1351 MutableBytes buf(&scope, runtime->newMutableBytesUninitialized(result_len)); 1352 buf.byteAtPut(0, quote); 1353 buf.byteAtPut(result_len - 1, quote); 1354 if (unchanged) { 1355 // Remaining characters were unmodified, so copy them directly. 1356 buf.replaceFromWithStr(1, *self, self_len); 1357 return buf.becomeStr(); 1358 } 1359 word out = 1; 1360 for (word in = 0, char_len; in < self_len; in += char_len) { 1361 int32_t code_point = self.codePointAt(in, &char_len); 1362 if (code_point == quote) { 1363 buf.byteAtPut(out++, '\\'); 1364 buf.byteAtPut(out++, quote); 1365 } else if (code_point == '\\') { 1366 buf.byteAtPut(out++, '\\'); 1367 buf.byteAtPut(out++, '\\'); 1368 } else if (code_point == '\t') { 1369 buf.byteAtPut(out++, '\\'); 1370 buf.byteAtPut(out++, 't'); 1371 } else if (code_point == '\r') { 1372 buf.byteAtPut(out++, '\\'); 1373 buf.byteAtPut(out++, 'r'); 1374 } else if (code_point == '\n') { 1375 buf.byteAtPut(out++, '\\'); 1376 buf.byteAtPut(out++, 'n'); 1377 } else if (' ' <= code_point && code_point < kMaxASCII) { 1378 buf.byteAtPut(out++, code_point); 1379 } else if (code_point <= kMaxASCII) { 1380 buf.byteAtPut(out++, '\\'); 1381 buf.byteAtPut(out++, 'x'); 1382 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/2, 1383 code_point); 1384 out += 2; 1385 } else if (Unicode::isPrintable(code_point)) { 1386 for (word i = 0; i < char_len; i++) { 1387 buf.byteAtPut(out + i, self.byteAt(in + i)); 1388 } 1389 out += char_len; 1390 } else if (code_point <= 0xff) { 1391 buf.byteAtPut(out++, '\\'); 1392 buf.byteAtPut(out++, 'x'); 1393 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/2, 1394 code_point); 1395 out += 2; 1396 } else if (code_point <= 0xffff) { 1397 buf.byteAtPut(out++, '\\'); 1398 buf.byteAtPut(out++, 'u'); 1399 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/4, 1400 code_point); 1401 out += 4; 1402 } else { 1403 buf.byteAtPut(out++, '\\'); 1404 buf.byteAtPut(out++, 'U'); 1405 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, 1406 /*num_digits=*/8, code_point); 1407 out += 8; 1408 } 1409 } 1410 DCHECK(out == result_len - 1, "wrote %ld characters, expected %ld", out - 1, 1411 result_len - 2); 1412 return buf.becomeStr(); 1413} 1414 1415bool METH(str, _mod_convert_number_int_intrinsic)(Thread* thread) { 1416 RawObject arg = thread->stackTop(); 1417 if (arg.isBool()) { 1418 thread->stackDrop(2); 1419 thread->stackSetTop(convertBoolToInt(arg)); 1420 return true; 1421 } 1422 if (arg.isInt()) { 1423 thread->stackDrop(2); 1424 thread->stackSetTop(arg); 1425 return true; 1426 } 1427 return false; 1428} 1429 1430bool METH(str, _mod_convert_number_index_intrinsic)(Thread* thread) { 1431 RawObject arg = thread->stackTop(); 1432 if (arg.isBool()) { 1433 thread->stackDrop(2); 1434 thread->stackSetTop(convertBoolToInt(arg)); 1435 return true; 1436 } 1437 if (arg.isInt()) { 1438 thread->stackDrop(2); 1439 thread->stackSetTop(arg); 1440 return true; 1441 } 1442 return false; 1443} 1444 1445bool METH(str, _mod_check_single_arg_intrinsic)(Thread* thread) { 1446 Runtime* runtime = thread->runtime(); 1447 RawObject arg = thread->stackTop(); 1448 if (runtime->isInstanceOfTuple(arg)) { 1449 RawTuple arg_tuple = tupleUnderlying(arg); 1450 if (arg_tuple.length() != 1) return false; 1451 thread->stackDrop(2); 1452 thread->stackSetTop(arg_tuple); 1453 return true; 1454 } 1455 RawMutableTuple result = MutableTuple::cast(runtime->newMutableTuple(1)); 1456 // Note that we need to re-fetch stackTop() since newMutableTuple() may 1457 // have triggered GC. 1458 result.atPut(0, thread->stackTop()); 1459 thread->stackDrop(2); 1460 thread->stackSetTop(result.becomeImmutable()); 1461 return true; 1462} 1463 1464RawObject METH(str, isalnum)(Thread* thread, Arguments args) { 1465 HandleScope scope(thread); 1466 Object self_obj(&scope, args.get(0)); 1467 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1468 return thread->raiseRequiresType(self_obj, ID(str)); 1469 } 1470 Str self(&scope, strUnderlying(*self_obj)); 1471 word char_length = self.length(); 1472 if (char_length == 0) { 1473 return Bool::falseObj(); 1474 } 1475 for (word i = 0, len; i < char_length; i += len) { 1476 int32_t code_point = self.codePointAt(i, &len); 1477 if (!Unicode::isAlnum(code_point)) { 1478 return Bool::falseObj(); 1479 } 1480 } 1481 return Bool::trueObj(); 1482} 1483 1484RawObject METH(str, isalpha)(Thread* thread, Arguments args) { 1485 HandleScope scope(thread); 1486 Object self_obj(&scope, args.get(0)); 1487 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1488 return thread->raiseRequiresType(self_obj, ID(str)); 1489 } 1490 Str self(&scope, strUnderlying(*self_obj)); 1491 word char_length = self.length(); 1492 if (char_length == 0) { 1493 return Bool::falseObj(); 1494 } 1495 for (word i = 0, len; i < char_length; i += len) { 1496 int32_t code_point = self.codePointAt(i, &len); 1497 if (!Unicode::isAlpha(code_point)) { 1498 return Bool::falseObj(); 1499 } 1500 } 1501 return Bool::trueObj(); 1502} 1503 1504RawObject METH(str, isascii)(Thread* thread, Arguments args) { 1505 HandleScope scope(thread); 1506 Object self_obj(&scope, args.get(0)); 1507 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1508 return thread->raiseRequiresType(self_obj, ID(str)); 1509 } 1510 Str self(&scope, strUnderlying(*self_obj)); 1511 return Bool::fromBool(self.isASCII()); 1512} 1513 1514RawObject METH(str, isdecimal)(Thread* thread, Arguments args) { 1515 HandleScope scope(thread); 1516 Object self_obj(&scope, args.get(0)); 1517 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1518 return thread->raiseRequiresType(self_obj, ID(str)); 1519 } 1520 Str self(&scope, strUnderlying(*self_obj)); 1521 word char_length = self.length(); 1522 if (char_length == 0) { 1523 return Bool::falseObj(); 1524 } 1525 for (word i = 0, len; i < char_length; i += len) { 1526 int32_t code_point = self.codePointAt(i, &len); 1527 if (!Unicode::isDecimal(code_point)) { 1528 return Bool::falseObj(); 1529 } 1530 } 1531 return Bool::trueObj(); 1532} 1533 1534RawObject METH(str, isdigit)(Thread* thread, Arguments args) { 1535 HandleScope scope(thread); 1536 Object self_obj(&scope, args.get(0)); 1537 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1538 return thread->raiseRequiresType(self_obj, ID(str)); 1539 } 1540 Str self(&scope, strUnderlying(*self_obj)); 1541 word char_length = self.length(); 1542 if (char_length == 0) { 1543 return Bool::falseObj(); 1544 } 1545 for (word i = 0, len; i < char_length; i += len) { 1546 int32_t code_point = self.codePointAt(i, &len); 1547 if (!Unicode::isDigit(code_point)) { 1548 return Bool::falseObj(); 1549 } 1550 } 1551 return Bool::trueObj(); 1552} 1553 1554bool strIsIdentifier(const Str& str) { 1555 word char_length = str.length(); 1556 if (char_length == 0) { 1557 return false; 1558 } 1559 word len; 1560 int32_t first = str.codePointAt(0, &len); 1561 if (!Unicode::isXidStart(first) && first != '_') { 1562 return false; 1563 } 1564 for (word i = len; i < char_length; i += len) { 1565 int32_t code_point = str.codePointAt(i, &len); 1566 if (!Unicode::isXidContinue(code_point)) { 1567 return false; 1568 } 1569 } 1570 return true; 1571} 1572 1573RawObject METH(str, isidentifier)(Thread* thread, Arguments args) { 1574 HandleScope scope(thread); 1575 Object self_obj(&scope, args.get(0)); 1576 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1577 return thread->raiseRequiresType(self_obj, ID(str)); 1578 } 1579 Str self(&scope, strUnderlying(*self_obj)); 1580 return Bool::fromBool(strIsIdentifier(self)); 1581} 1582 1583RawObject METH(str, islower)(Thread* thread, Arguments args) { 1584 HandleScope scope(thread); 1585 Object self_obj(&scope, args.get(0)); 1586 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1587 return thread->raiseRequiresType(self_obj, ID(str)); 1588 } 1589 Str self(&scope, strUnderlying(*self_obj)); 1590 word char_length = self.length(); 1591 bool cased = false; 1592 for (word i = 0, len; i < char_length; i += len) { 1593 int32_t code_point = self.codePointAt(i, &len); 1594 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) { 1595 return Bool::falseObj(); 1596 } 1597 if (!cased && Unicode::isLower(code_point)) { 1598 cased = true; 1599 } 1600 } 1601 return Bool::fromBool(cased); 1602} 1603 1604RawObject METH(str, isnumeric)(Thread* thread, Arguments args) { 1605 HandleScope scope(thread); 1606 Object self_obj(&scope, args.get(0)); 1607 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1608 return thread->raiseRequiresType(self_obj, ID(str)); 1609 } 1610 Str self(&scope, strUnderlying(*self_obj)); 1611 word char_length = self.length(); 1612 if (char_length == 0) { 1613 return Bool::falseObj(); 1614 } 1615 for (word i = 0, len; i < char_length; i += len) { 1616 int32_t code_point = self.codePointAt(i, &len); 1617 if (!Unicode::isNumeric(code_point)) { 1618 return Bool::falseObj(); 1619 } 1620 } 1621 return Bool::trueObj(); 1622} 1623 1624RawObject METH(str, isprintable)(Thread* thread, Arguments args) { 1625 HandleScope scope(thread); 1626 Object self_obj(&scope, args.get(0)); 1627 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1628 return thread->raiseRequiresType(self_obj, ID(str)); 1629 } 1630 Str self(&scope, strUnderlying(*self_obj)); 1631 word char_length = self.length(); 1632 for (word i = 0, len; i < char_length; i += len) { 1633 int32_t code_point = self.codePointAt(i, &len); 1634 if (!Unicode::isPrintable(code_point)) { 1635 return Bool::falseObj(); 1636 } 1637 } 1638 return Bool::trueObj(); 1639} 1640 1641RawObject METH(str, isspace)(Thread* thread, Arguments args) { 1642 HandleScope scope(thread); 1643 Object self_obj(&scope, args.get(0)); 1644 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1645 return thread->raiseRequiresType(self_obj, ID(str)); 1646 } 1647 Str self(&scope, strUnderlying(*self_obj)); 1648 word char_length = self.length(); 1649 if (char_length == 0) { 1650 return Bool::falseObj(); 1651 } 1652 if (char_length == 1) { 1653 return ASCII::isSpace(self.byteAt(0)) ? Bool::trueObj() : Bool::falseObj(); 1654 } 1655 word byte_index = 0; 1656 do { 1657 word num_bytes; 1658 int32_t codepoint = self.codePointAt(byte_index, &num_bytes); 1659 if (!Unicode::isSpace(codepoint)) { 1660 return Bool::falseObj(); 1661 } 1662 byte_index += num_bytes; 1663 } while (byte_index < char_length); 1664 return Bool::trueObj(); 1665} 1666 1667RawObject METH(str, istitle)(Thread* thread, Arguments args) { 1668 HandleScope scope(thread); 1669 Object self_obj(&scope, args.get(0)); 1670 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1671 return thread->raiseRequiresType(self_obj, ID(str)); 1672 } 1673 Str self(&scope, strUnderlying(*self_obj)); 1674 bool cased = false; 1675 bool previous_is_cased = false; 1676 word char_length = self.length(); 1677 for (word i = 0, len; i < char_length; i += len) { 1678 int32_t code_point = self.codePointAt(i, &len); 1679 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) { 1680 if (previous_is_cased) return Bool::falseObj(); 1681 cased = true; 1682 previous_is_cased = true; 1683 } else if (Unicode::isLower(code_point)) { 1684 if (!previous_is_cased) return Bool::falseObj(); 1685 previous_is_cased = true; 1686 cased = true; 1687 } else { 1688 previous_is_cased = false; 1689 } 1690 } 1691 return Bool::fromBool(cased); 1692} 1693 1694RawObject METH(str, isupper)(Thread* thread, Arguments args) { 1695 HandleScope scope(thread); 1696 Object self_obj(&scope, args.get(0)); 1697 if (!thread->runtime()->isInstanceOfStr(*self_obj)) { 1698 return thread->raiseRequiresType(self_obj, ID(str)); 1699 } 1700 Str self(&scope, strUnderlying(*self_obj)); 1701 word char_length = self.length(); 1702 bool cased = false; 1703 for (word i = 0, len; i < char_length; i += len) { 1704 int32_t code_point = self.codePointAt(i, &len); 1705 if (Unicode::isLower(code_point) || Unicode::isTitle(code_point)) { 1706 return Bool::falseObj(); 1707 } 1708 if (!cased && Unicode::isUpper(code_point)) { 1709 cased = true; 1710 } 1711 } 1712 return Bool::fromBool(cased); 1713} 1714 1715RawObject METH(str, lstrip)(Thread* thread, Arguments args) { 1716 Runtime* runtime = thread->runtime(); 1717 HandleScope scope(thread); 1718 Object self_obj(&scope, args.get(0)); 1719 if (!runtime->isInstanceOfStr(*self_obj)) { 1720 return thread->raiseRequiresType(self_obj, ID(str)); 1721 } 1722 Str str(&scope, strUnderlying(*self_obj)); 1723 Object other_obj(&scope, args.get(1)); 1724 if (other_obj.isNoneType()) { 1725 return strStripSpaceLeft(thread, str); 1726 } 1727 if (!runtime->isInstanceOfStr(*other_obj)) { 1728 return thread->raiseWithFmt(LayoutId::kTypeError, 1729 "str.lstrip() arg must be None or str"); 1730 } 1731 Str chars(&scope, strUnderlying(*other_obj)); 1732 return strStripLeft(thread, str, chars); 1733} 1734 1735RawObject METH(str, rstrip)(Thread* thread, Arguments args) { 1736 Runtime* runtime = thread->runtime(); 1737 HandleScope scope(thread); 1738 Object self_obj(&scope, args.get(0)); 1739 if (!runtime->isInstanceOfStr(*self_obj)) { 1740 return thread->raiseRequiresType(self_obj, ID(str)); 1741 } 1742 Str str(&scope, strUnderlying(*self_obj)); 1743 Object other_obj(&scope, args.get(1)); 1744 if (other_obj.isNoneType()) { 1745 return strStripSpaceRight(thread, str); 1746 } 1747 if (!runtime->isInstanceOfStr(*other_obj)) { 1748 return thread->raiseWithFmt(LayoutId::kTypeError, 1749 "str.rstrip() arg must be None or str"); 1750 } 1751 Str chars(&scope, strUnderlying(*other_obj)); 1752 return strStripRight(thread, str, chars); 1753} 1754 1755RawObject METH(str, strip)(Thread* thread, Arguments args) { 1756 Runtime* runtime = thread->runtime(); 1757 HandleScope scope(thread); 1758 Object self_obj(&scope, args.get(0)); 1759 if (!runtime->isInstanceOfStr(*self_obj)) { 1760 return thread->raiseRequiresType(self_obj, ID(str)); 1761 } 1762 Str str(&scope, strUnderlying(*self_obj)); 1763 Object other_obj(&scope, args.get(1)); 1764 if (other_obj.isNoneType()) { 1765 return strStripSpace(thread, str); 1766 } 1767 if (!runtime->isInstanceOfStr(*other_obj)) { 1768 return thread->raiseWithFmt(LayoutId::kTypeError, 1769 "str.strip() arg must be None or str"); 1770 } 1771 Str chars(&scope, strUnderlying(*other_obj)); 1772 return strStrip(thread, str, chars); 1773} 1774 1775static int32_t handleCapitalSigma(const Str& str, word i) { 1776 bool final_sigma = false; 1777 for (word j = str.offsetByCodePoints(i, -1), len; j >= 0; 1778 j = str.offsetByCodePoints(j, -1)) { 1779 int32_t code_point = str.codePointAt(j, &len); 1780 if (!Unicode::isCaseIgnorable(code_point)) { 1781 final_sigma = Unicode::isCased(code_point); 1782 break; 1783 } 1784 } 1785 if (!final_sigma) { 1786 return 0x03C3; 1787 } 1788 1789 word char_length = str.length(); 1790 for (word j = str.offsetByCodePoints(i, 1), len; j < char_length; j += len) { 1791 int32_t code_point = str.codePointAt(j, &len); 1792 if (!Unicode::isCaseIgnorable(code_point)) { 1793 return Unicode::isCased(code_point) ? 0x03C3 : 0x03C2; 1794 } 1795 } 1796 return 0x03C2; 1797} 1798 1799RawObject METH(str, swapcase)(Thread* thread, Arguments args) { 1800 HandleScope scope(thread); 1801 Runtime* runtime = thread->runtime(); 1802 Object self_obj(&scope, args.get(0)); 1803 if (!runtime->isInstanceOfStr(*self_obj)) { 1804 return thread->raiseRequiresType(self_obj, ID(str)); 1805 } 1806 Str self(&scope, strUnderlying(*self_obj)); 1807 word char_length = self.length(); 1808 1809 // Most of the time, this will be sufficient. However, due to Unicode casing, 1810 // it's possible that we could need up to 3 times as much space as the input. 1811 StrArray result(&scope, runtime->newStrArray()); 1812 runtime->strArrayEnsureCapacity(thread, result, char_length); 1813 for (word i = 0, len; i < char_length; i += len) { 1814 int32_t code_point = self.codePointAt(i, &len); 1815 if (Unicode::isUpper(code_point)) { 1816 if (code_point == 0x03a3) { 1817 runtime->strArrayAddCodePoint(thread, result, 1818 handleCapitalSigma(self, i)); 1819 } else { 1820 FullCasing lower = Unicode::toLower(code_point); 1821 for (word j = 0; j < 3; j++) { 1822 int32_t decoded = lower.code_points[j]; 1823 if (decoded == -1) break; 1824 runtime->strArrayAddCodePoint(thread, result, decoded); 1825 } 1826 } 1827 } else { 1828 FullCasing upper = Unicode::toUpper(code_point); 1829 for (word j = 0; j < 3; j++) { 1830 int32_t decoded = upper.code_points[j]; 1831 if (decoded == -1) break; 1832 runtime->strArrayAddCodePoint(thread, result, decoded); 1833 } 1834 } 1835 } 1836 return runtime->strFromStrArray(result); 1837} 1838 1839RawObject METH(str_iterator, __iter__)(Thread* thread, Arguments args) { 1840 HandleScope scope(thread); 1841 Object self(&scope, args.get(0)); 1842 if (!self.isStrIterator()) { 1843 return thread->raiseRequiresType(self, ID(str_iterator)); 1844 } 1845 return *self; 1846} 1847 1848// TODO(T35578204) Implement this for UTF-8. This probably means keeping extra 1849// state and logic so that __next__() will advance to the next codepoint. 1850 1851RawObject METH(str_iterator, __next__)(Thread* thread, Arguments args) { 1852 HandleScope scope(thread); 1853 Object self(&scope, args.get(0)); 1854 if (!self.isStrIterator()) { 1855 return thread->raiseRequiresType(self, ID(str_iterator)); 1856 } 1857 StrIterator iter(&scope, *self); 1858 Object value(&scope, strIteratorNext(thread, iter)); 1859 if (value.isError()) { 1860 return thread->raise(LayoutId::kStopIteration, NoneType::object()); 1861 } 1862 return *value; 1863} 1864 1865RawObject METH(str_iterator, __length_hint__)(Thread* thread, Arguments args) { 1866 HandleScope scope(thread); 1867 Object self(&scope, args.get(0)); 1868 if (!self.isStrIterator()) { 1869 return thread->raiseRequiresType(self, ID(str_iterator)); 1870 } 1871 StrIterator str_iterator(&scope, *self); 1872 Str str(&scope, str_iterator.iterable()); 1873 return SmallInt::fromWord(str.length() - str_iterator.index()); 1874} 1875 1876} // namespace py