this repo has no description
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
2#include "str-builtins.h"
3
4#include <cctype>
5#include <cwchar>
6
7#include "builtins.h"
8#include "formatter-utils.h"
9#include "formatter.h"
10#include "frame.h"
11#include "globals.h"
12#include "int-builtins.h"
13#include "objects.h"
14#include "runtime.h"
15#include "set-builtins.h"
16#include "slice-builtins.h"
17#include "thread.h"
18#include "tuple-builtins.h"
19#include "type-builtins.h"
20#include "unicode.h"
21#include "utils.h"
22
23namespace py {
24
25word adjustedStrIndex(const Str& str, word index) {
26 word len = str.length();
27 if (index >= 0) {
28 return str.offsetByCodePoints(0, index);
29 }
30 if (-len < index) {
31 return Utils::maximum(0l, str.offsetByCodePoints(len, index));
32 }
33 return 0;
34}
35
36RawObject dataArraySubstr(Thread* thread, const DataArray& data, word start,
37 word length) {
38 word data_len = data.length();
39 DCHECK_BOUND(start, data_len);
40 DCHECK_BOUND(start, data_len - length);
41 // SmallStr result
42 if (length <= RawSmallStr::kMaxLength) {
43 byte buffer[RawSmallStr::kMaxLength];
44 data.copyToStartAt(buffer, length, start);
45 return SmallStr::fromBytes(View<byte>(buffer, length));
46 }
47 // LargeStr result
48 HandleScope scope(thread);
49 MutableBytes result(&scope,
50 thread->runtime()->newMutableBytesUninitialized(length));
51 result.replaceFromWithStartAt(0, *data, length, start);
52 return result.becomeStr();
53}
54
55RawObject newStrFromWideChar(Thread* thread, const wchar_t* wc_str) {
56 return newStrFromWideCharWithLength(thread, wc_str, std::wcslen(wc_str));
57}
58
59RawObject newStrFromWideCharWithLength(Thread* thread, const wchar_t* wc_str,
60 word length) {
61 static_assert(sizeof(*wc_str) * kBitsPerByte == 32,
62 "only 32bit wchar_t supported.");
63
64 for (word i = 0; i < length; ++i) {
65 if (wc_str[i] < 0 || wc_str[i] > kMaxUnicode) {
66 return thread->raiseWithFmt(LayoutId::kValueError,
67 "character is not in range");
68 }
69 }
70
71 return thread->runtime()->newStrFromUTF32(
72 View<int32_t>(reinterpret_cast<const int32_t*>(wc_str), length));
73}
74
75void strCopyToWCStr(wchar_t* buf, size_t buf_length, const Str& str) {
76 uword wchar_index = 0;
77 for (word byte_index = 0, num_bytes = 0, byte_count = str.length();
78 byte_index < byte_count && wchar_index < buf_length;
79 byte_index += num_bytes, wchar_index += 1) {
80 int32_t cp = Str::cast(*str).codePointAt(byte_index, &num_bytes);
81 static_assert(sizeof(*buf) == sizeof(cp), "Requires 32bit wchar_t");
82 buf[wchar_index] = static_cast<wchar_t>(cp);
83 }
84 buf[wchar_index] = '\0';
85}
86
87static word strCountCharFromTo(const Str& haystack, byte needle, word start,
88 word end) {
89 word result = 0;
90 if (haystack.isSmallStr()) {
91 RawSmallStr small = SmallStr::cast(*haystack);
92 for (word i = start; i < end; i++) {
93 if (small.byteAt(i) == needle) result++;
94 }
95 return result;
96 }
97 byte* ptr = reinterpret_cast<byte*>(LargeStr::cast(*haystack).address());
98 for (word i = start; i < end; i++) {
99 if (ptr[i] == needle) result++;
100 }
101 return result;
102}
103
104RawObject strCount(const Str& haystack, const Str& needle, word start,
105 word end) {
106 if (end < 0 || start < 0) {
107 // N.B.: If end is negative we may be able to cheaply walk backward. We
108 // should avoid calling adjustSearchIndices here since the underlying
109 // container is not O(1) and replace it with something that preserves some
110 // of the signals that would be useful to lower the cost of the O(n)
111 // traversal.
112 // TODO(T41400083): Use a different search algorithm
113 Slice::adjustSearchIndices(&start, &end, haystack.codePointLength());
114 }
115
116 word start_index = start == 0 ? 0 : haystack.offsetByCodePoints(0, start);
117 if (start_index == haystack.length() && needle.length() > 0) {
118 // Haystack is too small; fast early return
119 return SmallInt::fromWord(0);
120 }
121
122 word end_index = end == kMaxWord
123 ? haystack.length()
124 : haystack.offsetByCodePoints(start_index, end - start);
125 if ((end_index - start_index) < needle.length() || start_index > end_index) {
126 // Haystack is too small; fast early return
127 return SmallInt::fromWord(0);
128 }
129
130 if (needle.length() == 1) {
131 return SmallInt::fromWord(strCountCharFromTo(
132 haystack, SmallStr::cast(*needle).byteAt(0), start_index, end_index));
133 }
134
135 // TODO(T41400083): Use a different search algorithm
136 return SmallInt::fromWord(strCountSubStrFromTo(haystack, needle, start_index,
137 end_index, haystack.length()));
138}
139
140word strCountSubStrFromTo(const Str& haystack, const Str& needle, word start,
141 word end, word max_count) {
142 DCHECK(max_count >= 0, "max_count must be non-negative");
143 word needle_len = needle.length();
144 word num_match = 0;
145 // Loop is in byte space, not code point space
146 for (word i = start; i <= end - needle_len && num_match < max_count;) {
147 if (strHasPrefix(haystack, needle, i)) {
148 i += needle_len;
149 num_match++;
150 continue;
151 }
152 i++;
153 }
154 return num_match;
155}
156
157word strCountSubStr(const Str& haystack, const Str& needle, word max_count) {
158 return strCountSubStrFromTo(haystack, needle, 0, haystack.length(),
159 max_count);
160}
161
162RawObject strEncodeASCII(Thread* thread, const Str& str) {
163 HandleScope scope(thread);
164 if (!str.isASCII()) {
165 return Unbound::object();
166 }
167 if (str.isSmallStr()) {
168 return SmallStr::cast(*str).becomeBytes();
169 }
170 word str_len = str.length();
171 MutableBytes bytes(&scope,
172 thread->runtime()->newMutableBytesUninitialized(str_len));
173 bytes.replaceFromWithStr(0, *str, str_len);
174 return bytes.becomeImmutable();
175}
176
177RawObject strEscapeNonASCII(Thread* thread, const Str& str) {
178 if (str.isASCII()) return *str;
179 // Slow implementation using `codecs`.
180 HandleScope scope(thread);
181 Runtime* runtime = thread->runtime();
182 Symbols* symbols = runtime->symbols();
183 Object ascii(&scope, symbols->at(ID(ascii)));
184 Object backslashreplace(&scope, symbols->at(ID(backslashreplace)));
185 Bytes result_bytes(
186 &scope, thread->invokeMethod3(str, ID(encode), ascii, backslashreplace));
187 word length = result_bytes.length();
188 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length));
189 result.replaceFromWithBytes(0, *result_bytes, length);
190 return result.becomeStr();
191}
192
193RawObject strJoinWithTupleOrList(Thread* thread, const Str& sep,
194 const Object& iterable) {
195 Runtime* runtime = thread->runtime();
196 HandleScope scope(thread);
197 Tuple tuple(&scope, runtime->emptyTuple());
198 word length = 0;
199 if (iterable.isTuple()) {
200 tuple = *iterable;
201 length = tuple.length();
202 } else if (iterable.isList()) {
203 tuple = List::cast(*iterable).items();
204 length = List::cast(*iterable).numItems();
205 } else {
206 // Slow path: collect items into list in Python and call again
207 return Unbound::object();
208 }
209 Object elt(&scope, NoneType::object());
210 for (word i = 0; i < length; i++) {
211 elt = tuple.at(i);
212 if (!runtime->isInstanceOfStr(*elt)) {
213 return thread->raiseWithFmt(
214 LayoutId::kTypeError,
215 "sequence item %w: expected str instance, %T found", i, &elt);
216 }
217 }
218 return runtime->strJoin(thread, sep, tuple, length);
219}
220
221word strSpan(const Str& src, const Str& str) {
222 word length = src.length();
223 word str_length = str.length();
224 word first = 0;
225 for (; first < length; first++) {
226 bool has_match = false;
227 byte ch = src.byteAt(first);
228 for (word j = 0; j < str_length; j++) {
229 if (ch == str.byteAt(j)) {
230 has_match = true;
231 break;
232 }
233 }
234 if (!has_match) {
235 break;
236 }
237 }
238 return first;
239}
240
241RawObject strSubstr(Thread* thread, const Str& str, word start, word length) {
242 word str_len = str.length();
243 DCHECK_BOUND(start, str_len);
244 DCHECK_BOUND(length, str_len - start);
245 if (length == str_len) {
246 return *str;
247 }
248 // SmallStr result
249 if (length <= RawSmallStr::kMaxLength) {
250 byte buffer[RawSmallStr::kMaxLength];
251 str.copyToStartAt(buffer, length, start);
252 return SmallStr::fromBytes(View<byte>(buffer, length));
253 }
254 // LargeStr result
255 HandleScope scope(thread);
256 MutableBytes result(&scope,
257 thread->runtime()->newMutableBytesUninitialized(length));
258 result.replaceFromWithStartAt(0, LargeStr::cast(*str), length, start);
259 return result.becomeStr();
260}
261
262word strRSpan(const Str& src, const Str& str, word rend) {
263 DCHECK(rend >= 0, "string index underflow");
264 word length = src.length();
265 word str_length = str.length();
266 word result = 0;
267 for (word i = length - 1; i >= rend; i--, result++) {
268 byte ch = src.byteAt(i);
269 bool has_match = false;
270 for (word j = 0; j < str_length; j++) {
271 if (ch == str.byteAt(j)) {
272 has_match = true;
273 break;
274 }
275 }
276 if (!has_match) {
277 break;
278 }
279 }
280 return result;
281}
282
283static bool isLineBreak(int32_t c) {
284 switch (c) {
285 // Common cases
286 case '\n':
287 case '\r':
288 // Less common cases
289 case '\f':
290 case '\v':
291 case 0x1c:
292 case 0x1d:
293 case 0x1e:
294 case 0x85:
295 case 0x2028:
296 case 0x2029:
297 return true;
298 default:
299 return false;
300 }
301}
302
303RawObject strSplit(Thread* thread, const Str& str, const Str& sep,
304 word maxsplit) {
305 Runtime* runtime = thread->runtime();
306 HandleScope scope(thread);
307 word num_splits = strCountSubStr(str, sep, maxsplit);
308 word result_len = num_splits + 1;
309 MutableTuple result_items(&scope, runtime->newMutableTuple(result_len));
310 word last_idx = 0;
311 word sep_len = sep.length();
312 for (word i = 0, result_idx = 0; result_idx < num_splits;) {
313 if (strHasPrefix(str, sep, i)) {
314 result_items.atPut(result_idx++,
315 strSubstr(thread, str, last_idx, i - last_idx));
316 i += sep_len;
317 last_idx = i;
318 } else {
319 i = str.offsetByCodePoints(i, 1);
320 }
321 }
322 result_items.atPut(num_splits,
323 strSubstr(thread, str, last_idx, str.length() - last_idx));
324 List result(&scope, runtime->newList());
325 result.setItems(*result_items);
326 result.setNumItems(result_len);
327 return *result;
328}
329
330RawObject strSplitlines(Thread* thread, const Str& str, bool keepends) {
331 HandleScope scope(thread);
332 Runtime* runtime = thread->runtime();
333 List result(&scope, runtime->newList());
334 // Looping over code points, not bytes, but i is a byte offset
335 for (word i = 0, j = 0; i < str.length(); j = i) {
336 // Skip newline chars
337 word num_bytes;
338 while (i < str.length() && !isLineBreak(str.codePointAt(i, &num_bytes))) {
339 i += num_bytes;
340 }
341
342 word eol_pos = i;
343 if (i < str.length()) {
344 int32_t cp = str.codePointAt(i, &num_bytes);
345 word next = i + num_bytes;
346 word next_num_bytes;
347 // Check for \r\n specifically
348 if (cp == '\r' && next < str.length() &&
349 str.codePointAt(next, &next_num_bytes) == '\n') {
350 i += next_num_bytes;
351 }
352 i += num_bytes;
353 if (keepends) {
354 eol_pos = i;
355 }
356 }
357
358 // If there are no newlines, the str returned should be identity-equal
359 if (j == 0 && eol_pos == str.length() && str.isStr()) {
360 runtime->listAdd(thread, result, str);
361 return *result;
362 }
363
364 Str substr(&scope, strSubstr(thread, str, j, eol_pos - j));
365 runtime->listAdd(thread, result, substr);
366 }
367
368 return *result;
369}
370
371RawObject strStripSpace(Thread* thread, const Str& src) {
372 word length = src.length();
373 if (length == 0) {
374 return *src;
375 }
376 if (length == 1 && ASCII::isSpace(src.byteAt(0))) {
377 return Str::empty();
378 }
379 word first = 0;
380 while (first < length) {
381 word num_bytes;
382 int32_t ch = src.codePointAt(first, &num_bytes);
383 if (!Unicode::isSpace(ch)) {
384 break;
385 }
386 first += num_bytes;
387 }
388 word last = length;
389 while (last > first) {
390 last = src.offsetByCodePoints(last, -1);
391 word num_bytes;
392 int32_t ch = src.codePointAt(last, &num_bytes);
393 if (!Unicode::isSpace(ch)) {
394 last += num_bytes;
395 break;
396 }
397 }
398 return strSubstr(thread, src, first, last - first);
399}
400
401RawObject strStripSpaceLeft(Thread* thread, const Str& src) {
402 word length = src.length();
403 if (length == 0) {
404 return *src;
405 }
406 if (length == 1 && ASCII::isSpace(src.byteAt(0))) {
407 return Str::empty();
408 }
409 word first = 0;
410 while (first < length) {
411 word num_bytes;
412 int32_t ch = src.codePointAt(first, &num_bytes);
413 if (!Unicode::isSpace(ch)) {
414 break;
415 }
416 first += num_bytes;
417 }
418 return strSubstr(thread, src, first, length - first);
419}
420
421RawObject strStripSpaceRight(Thread* thread, const Str& src) {
422 word length = src.length();
423 if (length == 0) {
424 return *src;
425 }
426 if (length == 1 && ASCII::isSpace(src.byteAt(0))) {
427 return Str::empty();
428 }
429 word last = length;
430 while (last > 0) {
431 last = src.offsetByCodePoints(last, -1);
432 word num_bytes;
433 int32_t ch = src.codePointAt(last, &num_bytes);
434 if (!Unicode::isSpace(ch)) {
435 last += num_bytes;
436 break;
437 }
438 }
439 return strSubstr(thread, src, 0, last);
440}
441
442RawObject strStrip(Thread* thread, const Str& src, const Str& str) {
443 word length = src.length();
444 if (length == 0 || str.length() == 0) {
445 return *src;
446 }
447 word first = strSpan(src, str);
448 word last = strRSpan(src, str, first);
449 return strSubstr(thread, src, first, length - first - last);
450}
451
452RawObject strStripLeft(Thread* thread, const Str& src, const Str& str) {
453 word length = src.length();
454 if (length == 0 || str.length() == 0) {
455 return *src;
456 }
457 word first = strSpan(src, str);
458 return strSubstr(thread, src, first, length - first);
459}
460
461RawObject strStripRight(Thread* thread, const Str& src, const Str& str) {
462 word length = src.length();
463 if (length == 0 || str.length() == 0) {
464 return *src;
465 }
466 word first = 0;
467 word last = strRSpan(src, str, first);
468 return strSubstr(thread, src, 0, length - last);
469}
470
471RawObject strTranslateASCII(Thread* thread, const Str& src, const Str& table) {
472 if (table.length() > kMaxASCII || !table.isASCII()) {
473 return Unbound::object();
474 }
475 word src_len = src.length();
476 word table_len = table.length();
477 HandleScope scope(thread);
478 MutableBytes result(&scope,
479 thread->runtime()->newMutableBytesUninitialized(src_len));
480 // Since all non-ASCII bytes in UTF-8 have a 1 in front, we can iterate by
481 // bytes instead of codepoints
482 for (word i = 0; i < src.length(); i++) {
483 byte to_translate = src.byteAt(i);
484 if (to_translate > table_len) {
485 result.byteAtPut(i, to_translate);
486 } else {
487 result.byteAtPut(i, table.byteAt(to_translate));
488 }
489 }
490 return result.becomeStr();
491}
492
493RawObject strIteratorNext(Thread* thread, const StrIterator& iter) {
494 HandleScope scope(thread);
495 word byte_offset = iter.index();
496 Str underlying(&scope, iter.iterable());
497 if (byte_offset >= underlying.length()) {
498 return Error::noMoreItems();
499 }
500 word num_bytes = 0;
501 word code_point = underlying.codePointAt(byte_offset, &num_bytes);
502 iter.setIndex(byte_offset + num_bytes);
503 return RawSmallStr::fromCodePoint(code_point);
504}
505
506static const BuiltinAttribute kUserStrBaseAttributes[] = {
507 {ID(_UserStr__value), RawUserStrBase::kValueOffset,
508 AttributeFlags::kHidden},
509};
510
511static const BuiltinAttribute kStrIteratorAttributes[] = {
512 {ID(_str_iterator__iterable), RawStrIterator::kIterableOffset,
513 AttributeFlags::kHidden},
514 {ID(_str_iterator__index), RawStrIterator::kIndexOffset,
515 AttributeFlags::kHidden},
516};
517
518void initializeStrTypes(Thread* thread) {
519 HandleScope scope(thread);
520 Runtime* runtime = thread->runtime();
521
522 Type str(&scope, addBuiltinType(thread, ID(str), LayoutId::kStr,
523 /*superclass_id=*/LayoutId::kObject,
524 kUserStrBaseAttributes, UserStrBase::kSize,
525 /*basetype=*/true));
526
527 {
528 Type type(&scope,
529 addImmediateBuiltinType(thread, ID(largestr), LayoutId::kLargeStr,
530 /*builtin_base=*/LayoutId::kStr,
531 /*superclass_id=*/LayoutId::kObject,
532 /*basetype=*/false));
533 Layout::cast(type.instanceLayout()).setDescribedType(*str);
534 runtime->setLargeStrType(type);
535 }
536
537 {
538 Type type(&scope,
539 addImmediateBuiltinType(thread, ID(smallstr), LayoutId::kSmallStr,
540 /*builtin_base=*/LayoutId::kStr,
541 /*superclass_id=*/LayoutId::kObject,
542 /*basetype=*/false));
543 Layout::cast(type.instanceLayout()).setDescribedType(*str);
544 runtime->setSmallStrType(type);
545 }
546
547 addBuiltinType(thread, ID(str_iterator), LayoutId::kStrIterator,
548 /*superclass_id=*/LayoutId::kObject, kStrIteratorAttributes,
549 StrIterator::kSize, /*basetype=*/false);
550}
551
552RawObject METH(str, __add__)(Thread* thread, Arguments args) {
553 Runtime* runtime = thread->runtime();
554 HandleScope scope(thread);
555 Object self_obj(&scope, args.get(0));
556 Object other_obj(&scope, args.get(1));
557 if (!runtime->isInstanceOfStr(*self_obj)) {
558 return thread->raiseRequiresType(self_obj, ID(str));
559 }
560 if (!runtime->isInstanceOfStr(*other_obj)) {
561 return NotImplementedType::object();
562 }
563 Str self(&scope, strUnderlying(*self_obj));
564 Str other(&scope, strUnderlying(*other_obj));
565 return runtime->strConcat(thread, self, other);
566}
567
568RawObject METH(str, __bool__)(Thread* thread, Arguments args) {
569 HandleScope scope(thread);
570 Object self_obj(&scope, args.get(0));
571 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
572 return thread->raiseRequiresType(self_obj, ID(str));
573 }
574 Str self(&scope, strUnderlying(*self_obj));
575 return Bool::fromBool(*self != Str::empty());
576}
577
578RawObject METH(str, __contains__)(Thread* thread, Arguments args) {
579 RawObject self = args.get(0);
580 RawObject other = args.get(1);
581 Runtime* runtime = thread->runtime();
582 bool is_self_invalid = true;
583 if (runtime->isInstanceOfStr(self)) {
584 is_self_invalid = false;
585 if (runtime->isInstanceOfStr(other)) {
586 return Bool::fromBool(strUnderlying(self).includes(strUnderlying(other)));
587 }
588 }
589 HandleScope scope(thread);
590 Object invalid(&scope, is_self_invalid ? self : other);
591 return thread->raiseRequiresType(invalid, ID(str));
592}
593
594RawObject METH(str, __eq__)(Thread* thread, Arguments args) {
595 HandleScope scope(thread);
596 Object self_obj(&scope, args.get(0));
597 Object other_obj(&scope, args.get(1));
598 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
599 return thread->raiseRequiresType(self_obj, ID(str));
600 }
601 if (!thread->runtime()->isInstanceOfStr(*other_obj)) {
602 return NotImplementedType::object();
603 }
604 Str self(&scope, strUnderlying(*self_obj));
605 Str other(&scope, strUnderlying(*other_obj));
606 return Bool::fromBool(self.equals(*other));
607}
608
609RawObject METH(str, __format__)(Thread* thread, Arguments args) {
610 HandleScope scope(thread);
611 Object self_obj(&scope, args.get(0));
612 Runtime* runtime = thread->runtime();
613 if (!runtime->isInstanceOfStr(*self_obj)) {
614 return thread->raiseRequiresType(self_obj, ID(str));
615 }
616 Str self(&scope, strUnderlying(*self_obj));
617
618 Object spec_obj(&scope, args.get(1));
619 if (!runtime->isInstanceOfStr(*spec_obj)) {
620 return thread->raiseWithFmt(LayoutId::kTypeError,
621 "__format__() argument 1 must be str, not %T",
622 &spec_obj);
623 }
624 Str spec(&scope, strUnderlying(*spec_obj));
625
626 if (spec.length() == 0) {
627 return *self;
628 }
629
630 FormatSpec format;
631 Object possible_error(&scope,
632 parseFormatSpec(thread, spec,
633 /*default_type=*/'s',
634 /*default_align=*/'<', &format));
635 if (!possible_error.isNoneType()) {
636 DCHECK(possible_error.isErrorException(), "expected exception");
637 return *possible_error;
638 }
639 if (format.type != 's') {
640 return raiseUnknownFormatError(thread, format.type, self_obj);
641 }
642 if (format.positive_sign != '\0') {
643 return thread->raiseWithFmt(LayoutId::kValueError,
644 "Sign not allowed in string format specifier");
645 }
646 if (format.alternate) {
647 return thread->raiseWithFmt(
648 LayoutId::kValueError,
649 "Alternate form (#) not allowed in string format specifier");
650 }
651 if (format.alignment == '=') {
652 return thread->raiseWithFmt(
653 LayoutId::kValueError,
654 "'=' alignment not allowed in string format specifier");
655 }
656
657 return formatStr(thread, self, &format);
658}
659
660RawObject METH(str, __ge__)(Thread* thread, Arguments args) {
661 HandleScope scope(thread);
662 Object self_obj(&scope, args.get(0));
663 Object other_obj(&scope, args.get(1));
664 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
665 return thread->raiseRequiresType(self_obj, ID(str));
666 }
667 if (!thread->runtime()->isInstanceOfStr(*other_obj)) {
668 return NotImplementedType::object();
669 }
670 Str self(&scope, strUnderlying(*self_obj));
671 Str other(&scope, strUnderlying(*other_obj));
672 return Bool::fromBool(self.compare(*other) >= 0);
673}
674
675bool METH(str, __getitem___intrinsic)(Thread* thread) {
676 RawObject arg0 = thread->stackPeek(1);
677 if (!arg0.isStr()) {
678 return false;
679 }
680 RawObject arg1 = thread->stackPeek(0);
681 word idx;
682 if (arg1.isSmallInt()) {
683 idx = RawSmallInt::cast(arg1).value();
684 } else if (arg1.isBool()) {
685 idx = RawBool::cast(arg1).value();
686 } else {
687 word start, stop;
688 if (!tryUnpackSlice(arg1, &start, &stop)) {
689 return false;
690 }
691 {
692 // Manually adjust slice bounds to avoid an extra call to codePointLength
693 HandleScope scope(thread);
694 Str self(&scope, arg0);
695 word start_index = adjustedStrIndex(self, start);
696 word stop_index = adjustedStrIndex(self, stop);
697 word length = stop_index - start_index;
698
699 thread->stackDrop(2);
700 thread->stackSetTop(length <= 0
701 ? Str::empty()
702 : strSubstr(thread, self, start_index, length));
703 }
704 return true;
705 }
706 RawStr self = Str::cast(arg0);
707 word len = self.length();
708 if (0 <= idx && idx < len) {
709 word offset = self.offsetByCodePoints(0, idx);
710 if (offset < len) {
711 word ignored;
712 thread->stackDrop(2);
713 thread->stackSetTop(
714 RawSmallStr::fromCodePoint(self.codePointAt(offset, &ignored)));
715 return true;
716 }
717 }
718 if (0 > idx) {
719 word offset = self.offsetByCodePoints(len, idx);
720 if (offset < len && offset != -1) {
721 word ignored;
722 thread->stackDrop(2);
723 thread->stackSetTop(
724 RawSmallStr::fromCodePoint(self.codePointAt(offset, &ignored)));
725 return true;
726 }
727 }
728 return false;
729}
730
731RawObject METH(str, __gt__)(Thread* thread, Arguments args) {
732 HandleScope scope(thread);
733 Object self_obj(&scope, args.get(0));
734 Object other_obj(&scope, args.get(1));
735 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
736 return thread->raiseRequiresType(self_obj, ID(str));
737 }
738 if (!thread->runtime()->isInstanceOfStr(*other_obj)) {
739 return NotImplementedType::object();
740 }
741 Str self(&scope, strUnderlying(*self_obj));
742 Str other(&scope, strUnderlying(*other_obj));
743 return Bool::fromBool(self.compare(*other) > 0);
744}
745
746RawObject METH(str, __hash__)(Thread* thread, Arguments args) {
747 HandleScope scope(thread);
748 Object self(&scope, args.get(0));
749 Runtime* runtime = thread->runtime();
750 if (!runtime->isInstanceOfStr(*self)) {
751 return thread->raiseRequiresType(self, ID(str));
752 }
753 Str self_str(&scope, strUnderlying(*self));
754 return SmallInt::fromWord(strHash(thread, *self_str));
755}
756
757void strInternInTuple(Thread* thread, const Object& items) {
758 HandleScope scope(thread);
759 Runtime* runtime = thread->runtime();
760 DCHECK(runtime->isInstanceOfTuple(*items), "items must be a tuple instance");
761 Tuple tuple(&scope, tupleUnderlying(*items));
762 Str str(&scope, Str::empty());
763 Object result(&scope, NoneType::object());
764 for (word i = 0; i < tuple.length(); i++) {
765 str = tuple.at(i);
766 result = Runtime::internStr(thread, str);
767 if (result.isError()) continue;
768 if (result != str) {
769 tuple.atPut(i, *result);
770 }
771 }
772}
773
774static bool allNameChars(const Str& str) {
775 for (word i = 0; i < str.length(); i++) {
776 byte c = str.byteAt(i);
777 if (!ASCII::isAlnum(c) && c != '_') {
778 return false;
779 }
780 }
781 return true;
782}
783
784static bool strInternConstant(Thread* thread, Object* object) {
785 if (object->isStr()) {
786 HandleScope scope(thread);
787 Str str(&scope, **object);
788 if (allNameChars(str)) {
789 str = Runtime::internStr(thread, str);
790 if (str != *object) {
791 *object = *str;
792 return true;
793 }
794 }
795 return false;
796 }
797 if (object->isTuple()) {
798 HandleScope scope(thread);
799 Tuple tuple(&scope, **object);
800 Object item(&scope, NoneType::object());
801 bool modified = false;
802 for (word i = 0, length = tuple.length(); i < length; i++) {
803 item = tuple.at(i);
804 if (strInternConstant(thread, &item)) {
805 modified = true;
806 tuple.atPut(i, *item);
807 }
808 }
809 if (!modified) return false;
810 *object = *tuple;
811 return true;
812 }
813 if (object->isFrozenSet()) {
814 HandleScope scope(thread);
815 FrozenSet set(&scope, **object);
816 FrozenSet new_set(&scope, thread->runtime()->newFrozenSet());
817 Object value(&scope, NoneType::object());
818 Object hash(&scope, NoneType::object());
819 bool modified = false;
820 for (word idx = 0; setNextItem(set, &idx, &value);) {
821 if (strInternConstant(thread, &value)) {
822 modified = true;
823 }
824 hash = Interpreter::hash(thread, value);
825 if (hash.isErrorException()) return false;
826 setAdd(thread, new_set, value, SmallInt::cast(*hash).value());
827 }
828 if (!modified) return false;
829 *object = *new_set;
830 return true;
831 }
832 return false;
833}
834
835bool strInternConstants(Thread* thread, const Object& items) {
836 HandleScope scope(thread);
837 DCHECK(thread->runtime()->isInstanceOfTuple(*items),
838 "items must be a tuple instance");
839 Object tuple(&scope, tupleUnderlying(*items));
840 return strInternConstant(thread, &tuple);
841}
842
843word strFind(const Str& haystack, const Str& needle) {
844 word haystack_len = haystack.length();
845 word needle_len = needle.length();
846 if (needle_len > haystack_len) {
847 return -1;
848 }
849 if (needle_len == 0) {
850 return 0;
851 }
852 if (needle_len == 1 && haystack.isASCII()) {
853 return strFindAsciiChar(haystack, needle.byteAt(0));
854 }
855 // Loop is in byte space, not code point space
856 word result = 0;
857 // TODO(T41400083): Use a different search algorithm
858 for (word i = 0; i <= haystack_len - needle_len; result++) {
859 if (strHasPrefix(haystack, needle, i)) {
860 return result;
861 }
862 i = haystack.offsetByCodePoints(i, 1);
863 }
864 return -1;
865}
866
867word strFindWithRange(const Str& haystack, const Str& needle, word start,
868 word end) {
869 if (end < 0 || start < 0) {
870 Slice::adjustSearchIndices(&start, &end, haystack.codePointLength());
871 }
872
873 word start_index = haystack.offsetByCodePoints(0, start);
874 if (start_index == haystack.length() && needle.length() > 0) {
875 // Haystack is too small; fast early return
876 return -1;
877 }
878 word end_index = haystack.offsetByCodePoints(start_index, end - start);
879
880 if ((end_index - start_index) < needle.length() || start_index > end_index) {
881 // Haystack is too small; fast early return
882 return -1;
883 }
884
885 // Loop is in byte space, not code point space
886 word result = start;
887 // TODO(T41400083): Use a different search algorithm
888 for (word i = start_index; i <= end_index - needle.length(); result++) {
889 bool has_match = strHasPrefix(haystack, needle, i);
890 word next = haystack.offsetByCodePoints(i, 1);
891 if (i == next) {
892 // We've reached a fixpoint; offsetByCodePoints will not advance past the
893 // length of the string.
894 if (start_index >= i) {
895 // The start is greater than the length of the string.
896 return -1;
897 }
898 // If the start is within bounds, just return the last found index.
899 break;
900 }
901 if (has_match) {
902 return result;
903 }
904 i = next;
905 }
906 return -1;
907}
908
909word strFindAsciiChar(const Str& haystack, byte needle) {
910 DCHECK(needle <= kMaxASCII, "must only be called for ASCII `needle`");
911 for (word i = 0, length = haystack.length(); i < length; i++) {
912 if (haystack.byteAt(i) == needle) {
913 return i;
914 }
915 }
916 return -1;
917}
918
919word strFindFirstNonWhitespace(const Str& str) {
920 word i = 0;
921 for (word codepoint_len, length = str.length(); i < length;
922 i += codepoint_len) {
923 if (!Unicode::isSpace(str.codePointAt(i, &codepoint_len))) return i;
924 }
925 return i;
926}
927
928bool strHasPrefix(const Str& str, const Str& prefix, word start) {
929 word str_len = str.length();
930 word prefix_len = prefix.length();
931 if (str_len - start < prefix_len) {
932 return false;
933 }
934 for (word i = 0; i < prefix_len; i++) {
935 if (str.byteAt(start + i) != prefix.byteAt(i)) {
936 return false;
937 }
938 }
939 return true;
940}
941
942bool strHasSurrogate(const Str& str) {
943 return str.includesByte(UTF8::kSurrogateLeadByte);
944}
945
946word strRFind(const Str& haystack, const Str& needle, word start, word end) {
947 // Haystack slice is empty; fast early return
948 if (start > end) return -1;
949 // Needle is empty
950 if (needle == Str::empty()) return end;
951 word start_index = haystack.offsetByCodePoints(0, start);
952 if (start_index == haystack.length()) {
953 // Haystack is too small; fast early return
954 return -1;
955 }
956 word end_index = haystack.offsetByCodePoints(start_index, end - start);
957 if ((end_index - start_index) < needle.length() || start_index > end_index) {
958 // Haystack is too small; fast early return
959 return -1;
960 }
961 // Loop is in byte space, not code point space
962 // Invariant: cp_offset and byte_offset describe the same offset into the
963 // string, but one is in code point space and the other is in byte space
964 // TODO(T41400083): Use a different search algorithm
965 for (word cp_offset = end - 1,
966 byte_offset = haystack.offsetByCodePoints(end_index, -1);
967 byte_offset >= 0; cp_offset--,
968 byte_offset = haystack.offsetByCodePoints(byte_offset, -1)) {
969 if (strHasPrefix(haystack, needle, byte_offset)) {
970 return cp_offset;
971 }
972 }
973 return -1;
974}
975
976word strRFindAsciiChar(const Str& haystack, byte needle) {
977 DCHECK(needle <= kMaxASCII, "must only be called for ASCII `needle`");
978 for (word i = haystack.length() - 1; i >= 0; i--) {
979 if (haystack.byteAt(i) == needle) {
980 return i;
981 }
982 }
983 return -1;
984}
985
986RawObject METH(str, __le__)(Thread* thread, Arguments args) {
987 HandleScope scope(thread);
988 Object self_obj(&scope, args.get(0));
989 Object other_obj(&scope, args.get(1));
990 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
991 return thread->raiseRequiresType(self_obj, ID(str));
992 }
993 if (!thread->runtime()->isInstanceOfStr(*other_obj)) {
994 return NotImplementedType::object();
995 }
996 Str self(&scope, strUnderlying(*self_obj));
997 Str other(&scope, strUnderlying(*other_obj));
998 return Bool::fromBool(self.compare(*other) <= 0);
999}
1000
1001RawObject METH(str, __len__)(Thread* thread, Arguments args) {
1002 HandleScope scope(thread);
1003 Object self_obj(&scope, args.get(0));
1004 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1005 return thread->raiseRequiresType(self_obj, ID(str));
1006 }
1007 Str self(&scope, strUnderlying(*self_obj));
1008 return SmallInt::fromWord(self.codePointLength());
1009}
1010
1011static RawObject strLowerASCII(Thread* thread, Object& str_obj, Str& str,
1012 word length) {
1013 if (str.isSmallStr()) {
1014 byte buf[SmallStr::kMaxLength];
1015 for (word i = 0; i < length; i++) {
1016 buf[i] = ASCII::toLower(str.byteAt(i));
1017 }
1018 return SmallStr::fromBytes({buf, length});
1019 }
1020 // Search for the first uppercase character.
1021 word first_uppercase = 0;
1022 for (; first_uppercase < length; first_uppercase++) {
1023 if (ASCII::isUpper(str.byteAt(first_uppercase))) {
1024 break;
1025 }
1026 }
1027 if (first_uppercase >= length && str_obj.isStr()) {
1028 return *str_obj;
1029 }
1030
1031 HandleScope scope(thread);
1032 Runtime* runtime = thread->runtime();
1033 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length));
1034 result.replaceFromWithStr(0, *str, first_uppercase);
1035 for (word i = first_uppercase; i < length; i++) {
1036 byte lower = ASCII::toLower(str.byteAt(i));
1037 result.byteAtPut(i, lower);
1038 }
1039 return result.becomeStr();
1040}
1041
1042RawObject METH(str, casefold)(Thread* thread, Arguments args) {
1043 HandleScope scope(thread);
1044 Object self_obj(&scope, args.get(0));
1045 Runtime* runtime = thread->runtime();
1046 if (!runtime->isInstanceOfStr(*self_obj)) {
1047 return thread->raiseRequiresType(self_obj, ID(str));
1048 }
1049 Str self(&scope, strUnderlying(*self_obj));
1050 word length = self.length();
1051 if (self.isASCII()) {
1052 return strLowerASCII(thread, self_obj, self, length);
1053 }
1054
1055 // Search for the first uppercase character.
1056 word first_uppercase = 0;
1057 for (word char_length; first_uppercase < length;
1058 first_uppercase += char_length) {
1059 int32_t code_point = self.codePointAt(first_uppercase, &char_length);
1060 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point) ||
1061 Unicode::isUnfolded(code_point)) {
1062 break;
1063 }
1064 }
1065 if (first_uppercase >= length && self_obj.isStr()) {
1066 return *self_obj;
1067 }
1068
1069 StrArray result(&scope, runtime->newStrArray());
1070 runtime->strArrayEnsureCapacity(thread, result, length);
1071 // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write
1072 // directly to the underlying MutableBytes.
1073 MutableBytes result_bytes(&scope, result.items());
1074 result_bytes.replaceFromWithStr(0, *self, first_uppercase);
1075 result.setNumItems(first_uppercase);
1076 for (word char_length, i = first_uppercase; i < length; i += char_length) {
1077 struct FullCasing casefold =
1078 Unicode::toFolded(self.codePointAt(i, &char_length));
1079 for (word j = 0; j < 3; j++) {
1080 int32_t decoded = casefold.code_points[j];
1081 if (decoded == -1) break;
1082 runtime->strArrayAddCodePoint(thread, result, decoded);
1083 }
1084 }
1085 return runtime->strFromStrArray(result);
1086}
1087
1088RawObject METH(str, lower)(Thread* thread, Arguments args) {
1089 HandleScope scope(thread);
1090 Object self_obj(&scope, args.get(0));
1091 Runtime* runtime = thread->runtime();
1092 if (!runtime->isInstanceOfStr(*self_obj)) {
1093 return thread->raiseRequiresType(self_obj, ID(str));
1094 }
1095 Str self(&scope, strUnderlying(*self_obj));
1096 word length = self.length();
1097 if (self.isASCII()) {
1098 return strLowerASCII(thread, self_obj, self, length);
1099 }
1100
1101 // Search for the first uppercase character.
1102 word first_uppercase = 0;
1103 for (word char_length; first_uppercase < length;
1104 first_uppercase += char_length) {
1105 int32_t code_point = self.codePointAt(first_uppercase, &char_length);
1106 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) {
1107 break;
1108 }
1109 }
1110 if (first_uppercase >= length && self_obj.isStr()) {
1111 return *self_obj;
1112 }
1113
1114 StrArray result(&scope, runtime->newStrArray());
1115 runtime->strArrayEnsureCapacity(thread, result, length);
1116 // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write
1117 // directly to the underlying MutableBytes.
1118 MutableBytes result_bytes(&scope, result.items());
1119 result_bytes.replaceFromWithStr(0, *self, first_uppercase);
1120 result.setNumItems(first_uppercase);
1121 for (word char_length, i = first_uppercase; i < length; i += char_length) {
1122 struct FullCasing lower =
1123 Unicode::toLower(self.codePointAt(i, &char_length));
1124 for (word j = 0; j < 3; j++) {
1125 int32_t decoded = lower.code_points[j];
1126 if (decoded == -1) break;
1127 runtime->strArrayAddCodePoint(thread, result, decoded);
1128 }
1129 }
1130 return runtime->strFromStrArray(result);
1131}
1132
1133RawObject METH(str, title)(Thread* thread, Arguments args) {
1134 Runtime* runtime = thread->runtime();
1135 HandleScope scope(thread);
1136 Object self_obj(&scope, args.get(0));
1137 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1138 return thread->raiseRequiresType(self_obj, ID(str));
1139 }
1140
1141 Str self(&scope, strUnderlying(*self_obj));
1142 word char_length = self.length();
1143
1144 bool previous_is_cased = false;
1145 StrArray result(&scope, runtime->newStrArray());
1146 for (word i = 0, len; i < char_length; i += len) {
1147 int32_t code_point = self.codePointAt(i, &len);
1148
1149 FullCasing mapped = previous_is_cased ? Unicode::toLower(code_point)
1150 : Unicode::toTitle(code_point);
1151 for (word j = 0; mapped.code_points[j] != -1; j++) {
1152 runtime->strArrayAddCodePoint(thread, result, mapped.code_points[j]);
1153 }
1154
1155 previous_is_cased = Unicode::isCased(code_point);
1156 }
1157
1158 return runtime->strFromStrArray(result);
1159}
1160
1161RawObject METH(str, upper)(Thread* thread, Arguments args) {
1162 HandleScope scope(thread);
1163 Object self_obj(&scope, args.get(0));
1164 Runtime* runtime = thread->runtime();
1165 if (!runtime->isInstanceOfStr(*self_obj)) {
1166 return thread->raiseRequiresType(self_obj, ID(str));
1167 }
1168 Str self(&scope, strUnderlying(*self_obj));
1169 word length = self.length();
1170 if (self.isASCII()) {
1171 if (self.isSmallStr()) {
1172 byte buf[SmallStr::kMaxLength];
1173 for (word i = 0; i < length; i++) {
1174 buf[i] = ASCII::toUpper(self.byteAt(i));
1175 }
1176 return SmallStr::fromBytes({buf, length});
1177 }
1178 // Search for the first lowercase character.
1179 word first_lowercase = 0;
1180 for (; first_lowercase < length; first_lowercase++) {
1181 if (ASCII::isLower(self.byteAt(first_lowercase))) {
1182 break;
1183 }
1184 }
1185 if (first_lowercase >= length && self_obj.isStr()) {
1186 return *self_obj;
1187 }
1188
1189 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length));
1190 result.replaceFromWithStr(0, *self, first_lowercase);
1191 for (word i = first_lowercase; i < length; i++) {
1192 byte lower = ASCII::toUpper(self.byteAt(i));
1193 result.byteAtPut(i, lower);
1194 }
1195 return result.becomeStr();
1196 }
1197
1198 // Search for the first lowercase character.
1199 word first_lowercase = 0;
1200 for (word char_length; first_lowercase < length;
1201 first_lowercase += char_length) {
1202 int32_t code_point = self.codePointAt(first_lowercase, &char_length);
1203 if (Unicode::isLower(code_point) || Unicode::isTitle(code_point)) {
1204 break;
1205 }
1206 }
1207 if (first_lowercase >= length && self_obj.isStr()) {
1208 return *self_obj;
1209 }
1210
1211 StrArray result(&scope, runtime->newStrArray());
1212 runtime->strArrayEnsureCapacity(thread, result, length);
1213 // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write
1214 // directly to the underlying MutableBytes.
1215 MutableBytes result_bytes(&scope, result.items());
1216 result_bytes.replaceFromWithStr(0, *self, first_lowercase);
1217 result.setNumItems(first_lowercase);
1218 for (word char_length, i = first_lowercase; i < length; i += char_length) {
1219 struct FullCasing upper =
1220 Unicode::toUpper(self.codePointAt(i, &char_length));
1221 for (word j = 0; j < 3; j++) {
1222 int32_t decoded = upper.code_points[j];
1223 if (decoded == -1) break;
1224 runtime->strArrayAddCodePoint(thread, result, decoded);
1225 }
1226 }
1227 return runtime->strFromStrArray(result);
1228}
1229
1230RawObject METH(str, __lt__)(Thread* thread, Arguments args) {
1231 HandleScope scope(thread);
1232 Object self_obj(&scope, args.get(0));
1233 Object other_obj(&scope, args.get(1));
1234 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1235 return thread->raiseRequiresType(self_obj, ID(str));
1236 }
1237 if (!thread->runtime()->isInstanceOfStr(*other_obj)) {
1238 return NotImplementedType::object();
1239 }
1240 Str self(&scope, strUnderlying(*self_obj));
1241 Str other(&scope, strUnderlying(*other_obj));
1242 return Bool::fromBool(self.compare(*other) < 0);
1243}
1244
1245RawObject METH(str, __mul__)(Thread* thread, Arguments args) {
1246 Runtime* runtime = thread->runtime();
1247 HandleScope scope(thread);
1248 Object self_obj(&scope, args.get(0));
1249 if (!runtime->isInstanceOfStr(*self_obj)) {
1250 return thread->raiseRequiresType(self_obj, ID(str));
1251 }
1252 Object count_index(&scope, args.get(1));
1253 Object count_obj(&scope, intFromIndex(thread, count_index));
1254 if (count_obj.isError()) return *count_obj;
1255 word count = intUnderlying(*count_obj).asWordSaturated();
1256 if (!SmallInt::isValid(count)) {
1257 return thread->raiseWithFmt(LayoutId::kOverflowError,
1258 "cannot fit '%T' into an index-sized integer",
1259 &count_obj);
1260 }
1261 Str self(&scope, *self_obj);
1262 word length = self.length();
1263 if (count <= 0 || length == 0) {
1264 return Str::empty();
1265 }
1266 word new_length;
1267 if (__builtin_mul_overflow(length, count, &new_length) ||
1268 !SmallInt::isValid(new_length)) {
1269 return thread->raiseWithFmt(LayoutId::kOverflowError,
1270 "repeated string is too long");
1271 }
1272 return runtime->strRepeat(thread, self, count);
1273}
1274
1275RawObject METH(str, __ne__)(Thread* thread, Arguments args) {
1276 HandleScope scope(thread);
1277 Object self_obj(&scope, args.get(0));
1278 Object other_obj(&scope, args.get(1));
1279 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1280 return thread->raiseRequiresType(self_obj, ID(str));
1281 }
1282 if (!thread->runtime()->isInstanceOfStr(*other_obj)) {
1283 return NotImplementedType::object();
1284 }
1285 Str self(&scope, strUnderlying(*self_obj));
1286 Str other(&scope, strUnderlying(*other_obj));
1287 return Bool::fromBool(!self.equals(*other));
1288}
1289
1290RawObject METH(str, __iter__)(Thread* thread, Arguments args) {
1291 HandleScope scope(thread);
1292 Object self_obj(&scope, args.get(0));
1293 Runtime* runtime = thread->runtime();
1294 if (!runtime->isInstanceOfStr(*self_obj)) {
1295 return thread->raiseRequiresType(self_obj, ID(str));
1296 }
1297 Str self(&scope, strUnderlying(*self_obj));
1298 return runtime->newStrIterator(self);
1299}
1300
1301RawObject METH(str, __repr__)(Thread* thread, Arguments args) {
1302 Runtime* runtime = thread->runtime();
1303 HandleScope scope(thread);
1304 Object self_obj(&scope, args.get(0));
1305 if (!runtime->isInstanceOfStr(*self_obj)) {
1306 return thread->raiseRequiresType(self_obj, ID(str));
1307 }
1308 Str self(&scope, strUnderlying(*self_obj));
1309 const word self_len = self.length();
1310 word result_len = 0;
1311 word squote = 0;
1312 word dquote = 0;
1313 // Precompute the size so that only one allocation is necessary.
1314 for (word i = 0, char_len; i < self_len; i += char_len) {
1315 int32_t code_point = self.codePointAt(i, &char_len);
1316 if (code_point == '\'') {
1317 squote++;
1318 result_len += 1;
1319 } else if (code_point == '"') {
1320 dquote++;
1321 result_len += 1;
1322 } else if (code_point == '\\' || code_point == '\t' || code_point == '\r' ||
1323 code_point == '\n') {
1324 result_len += 2;
1325 } else if (Unicode::isPrintable(code_point)) {
1326 result_len += char_len;
1327 } else if (code_point < 0x100) {
1328 result_len += 4;
1329 } else if (code_point < 0x10000) {
1330 result_len += 6;
1331 } else {
1332 result_len += 10;
1333 }
1334 }
1335
1336 byte quote = '\'';
1337 bool unchanged = (result_len == self_len);
1338 if (squote > 0) {
1339 unchanged = false;
1340 // If there are both single quotes and double quotes, the outer quote will
1341 // be singles, and all internal quotes will need to be escaped.
1342 if (dquote > 0) {
1343 // Add the size of the escape backslashes on the single quotes.
1344 result_len += squote;
1345 } else {
1346 quote = '"';
1347 }
1348 }
1349 result_len += 2; // quotes
1350
1351 MutableBytes buf(&scope, runtime->newMutableBytesUninitialized(result_len));
1352 buf.byteAtPut(0, quote);
1353 buf.byteAtPut(result_len - 1, quote);
1354 if (unchanged) {
1355 // Remaining characters were unmodified, so copy them directly.
1356 buf.replaceFromWithStr(1, *self, self_len);
1357 return buf.becomeStr();
1358 }
1359 word out = 1;
1360 for (word in = 0, char_len; in < self_len; in += char_len) {
1361 int32_t code_point = self.codePointAt(in, &char_len);
1362 if (code_point == quote) {
1363 buf.byteAtPut(out++, '\\');
1364 buf.byteAtPut(out++, quote);
1365 } else if (code_point == '\\') {
1366 buf.byteAtPut(out++, '\\');
1367 buf.byteAtPut(out++, '\\');
1368 } else if (code_point == '\t') {
1369 buf.byteAtPut(out++, '\\');
1370 buf.byteAtPut(out++, 't');
1371 } else if (code_point == '\r') {
1372 buf.byteAtPut(out++, '\\');
1373 buf.byteAtPut(out++, 'r');
1374 } else if (code_point == '\n') {
1375 buf.byteAtPut(out++, '\\');
1376 buf.byteAtPut(out++, 'n');
1377 } else if (' ' <= code_point && code_point < kMaxASCII) {
1378 buf.byteAtPut(out++, code_point);
1379 } else if (code_point <= kMaxASCII) {
1380 buf.byteAtPut(out++, '\\');
1381 buf.byteAtPut(out++, 'x');
1382 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/2,
1383 code_point);
1384 out += 2;
1385 } else if (Unicode::isPrintable(code_point)) {
1386 for (word i = 0; i < char_len; i++) {
1387 buf.byteAtPut(out + i, self.byteAt(in + i));
1388 }
1389 out += char_len;
1390 } else if (code_point <= 0xff) {
1391 buf.byteAtPut(out++, '\\');
1392 buf.byteAtPut(out++, 'x');
1393 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/2,
1394 code_point);
1395 out += 2;
1396 } else if (code_point <= 0xffff) {
1397 buf.byteAtPut(out++, '\\');
1398 buf.byteAtPut(out++, 'u');
1399 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/4,
1400 code_point);
1401 out += 4;
1402 } else {
1403 buf.byteAtPut(out++, '\\');
1404 buf.byteAtPut(out++, 'U');
1405 uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out,
1406 /*num_digits=*/8, code_point);
1407 out += 8;
1408 }
1409 }
1410 DCHECK(out == result_len - 1, "wrote %ld characters, expected %ld", out - 1,
1411 result_len - 2);
1412 return buf.becomeStr();
1413}
1414
1415bool METH(str, _mod_convert_number_int_intrinsic)(Thread* thread) {
1416 RawObject arg = thread->stackTop();
1417 if (arg.isBool()) {
1418 thread->stackDrop(2);
1419 thread->stackSetTop(convertBoolToInt(arg));
1420 return true;
1421 }
1422 if (arg.isInt()) {
1423 thread->stackDrop(2);
1424 thread->stackSetTop(arg);
1425 return true;
1426 }
1427 return false;
1428}
1429
1430bool METH(str, _mod_convert_number_index_intrinsic)(Thread* thread) {
1431 RawObject arg = thread->stackTop();
1432 if (arg.isBool()) {
1433 thread->stackDrop(2);
1434 thread->stackSetTop(convertBoolToInt(arg));
1435 return true;
1436 }
1437 if (arg.isInt()) {
1438 thread->stackDrop(2);
1439 thread->stackSetTop(arg);
1440 return true;
1441 }
1442 return false;
1443}
1444
1445bool METH(str, _mod_check_single_arg_intrinsic)(Thread* thread) {
1446 Runtime* runtime = thread->runtime();
1447 RawObject arg = thread->stackTop();
1448 if (runtime->isInstanceOfTuple(arg)) {
1449 RawTuple arg_tuple = tupleUnderlying(arg);
1450 if (arg_tuple.length() != 1) return false;
1451 thread->stackDrop(2);
1452 thread->stackSetTop(arg_tuple);
1453 return true;
1454 }
1455 RawMutableTuple result = MutableTuple::cast(runtime->newMutableTuple(1));
1456 // Note that we need to re-fetch stackTop() since newMutableTuple() may
1457 // have triggered GC.
1458 result.atPut(0, thread->stackTop());
1459 thread->stackDrop(2);
1460 thread->stackSetTop(result.becomeImmutable());
1461 return true;
1462}
1463
1464RawObject METH(str, isalnum)(Thread* thread, Arguments args) {
1465 HandleScope scope(thread);
1466 Object self_obj(&scope, args.get(0));
1467 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1468 return thread->raiseRequiresType(self_obj, ID(str));
1469 }
1470 Str self(&scope, strUnderlying(*self_obj));
1471 word char_length = self.length();
1472 if (char_length == 0) {
1473 return Bool::falseObj();
1474 }
1475 for (word i = 0, len; i < char_length; i += len) {
1476 int32_t code_point = self.codePointAt(i, &len);
1477 if (!Unicode::isAlnum(code_point)) {
1478 return Bool::falseObj();
1479 }
1480 }
1481 return Bool::trueObj();
1482}
1483
1484RawObject METH(str, isalpha)(Thread* thread, Arguments args) {
1485 HandleScope scope(thread);
1486 Object self_obj(&scope, args.get(0));
1487 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1488 return thread->raiseRequiresType(self_obj, ID(str));
1489 }
1490 Str self(&scope, strUnderlying(*self_obj));
1491 word char_length = self.length();
1492 if (char_length == 0) {
1493 return Bool::falseObj();
1494 }
1495 for (word i = 0, len; i < char_length; i += len) {
1496 int32_t code_point = self.codePointAt(i, &len);
1497 if (!Unicode::isAlpha(code_point)) {
1498 return Bool::falseObj();
1499 }
1500 }
1501 return Bool::trueObj();
1502}
1503
1504RawObject METH(str, isascii)(Thread* thread, Arguments args) {
1505 HandleScope scope(thread);
1506 Object self_obj(&scope, args.get(0));
1507 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1508 return thread->raiseRequiresType(self_obj, ID(str));
1509 }
1510 Str self(&scope, strUnderlying(*self_obj));
1511 return Bool::fromBool(self.isASCII());
1512}
1513
1514RawObject METH(str, isdecimal)(Thread* thread, Arguments args) {
1515 HandleScope scope(thread);
1516 Object self_obj(&scope, args.get(0));
1517 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1518 return thread->raiseRequiresType(self_obj, ID(str));
1519 }
1520 Str self(&scope, strUnderlying(*self_obj));
1521 word char_length = self.length();
1522 if (char_length == 0) {
1523 return Bool::falseObj();
1524 }
1525 for (word i = 0, len; i < char_length; i += len) {
1526 int32_t code_point = self.codePointAt(i, &len);
1527 if (!Unicode::isDecimal(code_point)) {
1528 return Bool::falseObj();
1529 }
1530 }
1531 return Bool::trueObj();
1532}
1533
1534RawObject METH(str, isdigit)(Thread* thread, Arguments args) {
1535 HandleScope scope(thread);
1536 Object self_obj(&scope, args.get(0));
1537 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1538 return thread->raiseRequiresType(self_obj, ID(str));
1539 }
1540 Str self(&scope, strUnderlying(*self_obj));
1541 word char_length = self.length();
1542 if (char_length == 0) {
1543 return Bool::falseObj();
1544 }
1545 for (word i = 0, len; i < char_length; i += len) {
1546 int32_t code_point = self.codePointAt(i, &len);
1547 if (!Unicode::isDigit(code_point)) {
1548 return Bool::falseObj();
1549 }
1550 }
1551 return Bool::trueObj();
1552}
1553
1554bool strIsIdentifier(const Str& str) {
1555 word char_length = str.length();
1556 if (char_length == 0) {
1557 return false;
1558 }
1559 word len;
1560 int32_t first = str.codePointAt(0, &len);
1561 if (!Unicode::isXidStart(first) && first != '_') {
1562 return false;
1563 }
1564 for (word i = len; i < char_length; i += len) {
1565 int32_t code_point = str.codePointAt(i, &len);
1566 if (!Unicode::isXidContinue(code_point)) {
1567 return false;
1568 }
1569 }
1570 return true;
1571}
1572
1573RawObject METH(str, isidentifier)(Thread* thread, Arguments args) {
1574 HandleScope scope(thread);
1575 Object self_obj(&scope, args.get(0));
1576 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1577 return thread->raiseRequiresType(self_obj, ID(str));
1578 }
1579 Str self(&scope, strUnderlying(*self_obj));
1580 return Bool::fromBool(strIsIdentifier(self));
1581}
1582
1583RawObject METH(str, islower)(Thread* thread, Arguments args) {
1584 HandleScope scope(thread);
1585 Object self_obj(&scope, args.get(0));
1586 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1587 return thread->raiseRequiresType(self_obj, ID(str));
1588 }
1589 Str self(&scope, strUnderlying(*self_obj));
1590 word char_length = self.length();
1591 bool cased = false;
1592 for (word i = 0, len; i < char_length; i += len) {
1593 int32_t code_point = self.codePointAt(i, &len);
1594 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) {
1595 return Bool::falseObj();
1596 }
1597 if (!cased && Unicode::isLower(code_point)) {
1598 cased = true;
1599 }
1600 }
1601 return Bool::fromBool(cased);
1602}
1603
1604RawObject METH(str, isnumeric)(Thread* thread, Arguments args) {
1605 HandleScope scope(thread);
1606 Object self_obj(&scope, args.get(0));
1607 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1608 return thread->raiseRequiresType(self_obj, ID(str));
1609 }
1610 Str self(&scope, strUnderlying(*self_obj));
1611 word char_length = self.length();
1612 if (char_length == 0) {
1613 return Bool::falseObj();
1614 }
1615 for (word i = 0, len; i < char_length; i += len) {
1616 int32_t code_point = self.codePointAt(i, &len);
1617 if (!Unicode::isNumeric(code_point)) {
1618 return Bool::falseObj();
1619 }
1620 }
1621 return Bool::trueObj();
1622}
1623
1624RawObject METH(str, isprintable)(Thread* thread, Arguments args) {
1625 HandleScope scope(thread);
1626 Object self_obj(&scope, args.get(0));
1627 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1628 return thread->raiseRequiresType(self_obj, ID(str));
1629 }
1630 Str self(&scope, strUnderlying(*self_obj));
1631 word char_length = self.length();
1632 for (word i = 0, len; i < char_length; i += len) {
1633 int32_t code_point = self.codePointAt(i, &len);
1634 if (!Unicode::isPrintable(code_point)) {
1635 return Bool::falseObj();
1636 }
1637 }
1638 return Bool::trueObj();
1639}
1640
1641RawObject METH(str, isspace)(Thread* thread, Arguments args) {
1642 HandleScope scope(thread);
1643 Object self_obj(&scope, args.get(0));
1644 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1645 return thread->raiseRequiresType(self_obj, ID(str));
1646 }
1647 Str self(&scope, strUnderlying(*self_obj));
1648 word char_length = self.length();
1649 if (char_length == 0) {
1650 return Bool::falseObj();
1651 }
1652 if (char_length == 1) {
1653 return ASCII::isSpace(self.byteAt(0)) ? Bool::trueObj() : Bool::falseObj();
1654 }
1655 word byte_index = 0;
1656 do {
1657 word num_bytes;
1658 int32_t codepoint = self.codePointAt(byte_index, &num_bytes);
1659 if (!Unicode::isSpace(codepoint)) {
1660 return Bool::falseObj();
1661 }
1662 byte_index += num_bytes;
1663 } while (byte_index < char_length);
1664 return Bool::trueObj();
1665}
1666
1667RawObject METH(str, istitle)(Thread* thread, Arguments args) {
1668 HandleScope scope(thread);
1669 Object self_obj(&scope, args.get(0));
1670 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1671 return thread->raiseRequiresType(self_obj, ID(str));
1672 }
1673 Str self(&scope, strUnderlying(*self_obj));
1674 bool cased = false;
1675 bool previous_is_cased = false;
1676 word char_length = self.length();
1677 for (word i = 0, len; i < char_length; i += len) {
1678 int32_t code_point = self.codePointAt(i, &len);
1679 if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) {
1680 if (previous_is_cased) return Bool::falseObj();
1681 cased = true;
1682 previous_is_cased = true;
1683 } else if (Unicode::isLower(code_point)) {
1684 if (!previous_is_cased) return Bool::falseObj();
1685 previous_is_cased = true;
1686 cased = true;
1687 } else {
1688 previous_is_cased = false;
1689 }
1690 }
1691 return Bool::fromBool(cased);
1692}
1693
1694RawObject METH(str, isupper)(Thread* thread, Arguments args) {
1695 HandleScope scope(thread);
1696 Object self_obj(&scope, args.get(0));
1697 if (!thread->runtime()->isInstanceOfStr(*self_obj)) {
1698 return thread->raiseRequiresType(self_obj, ID(str));
1699 }
1700 Str self(&scope, strUnderlying(*self_obj));
1701 word char_length = self.length();
1702 bool cased = false;
1703 for (word i = 0, len; i < char_length; i += len) {
1704 int32_t code_point = self.codePointAt(i, &len);
1705 if (Unicode::isLower(code_point) || Unicode::isTitle(code_point)) {
1706 return Bool::falseObj();
1707 }
1708 if (!cased && Unicode::isUpper(code_point)) {
1709 cased = true;
1710 }
1711 }
1712 return Bool::fromBool(cased);
1713}
1714
1715RawObject METH(str, lstrip)(Thread* thread, Arguments args) {
1716 Runtime* runtime = thread->runtime();
1717 HandleScope scope(thread);
1718 Object self_obj(&scope, args.get(0));
1719 if (!runtime->isInstanceOfStr(*self_obj)) {
1720 return thread->raiseRequiresType(self_obj, ID(str));
1721 }
1722 Str str(&scope, strUnderlying(*self_obj));
1723 Object other_obj(&scope, args.get(1));
1724 if (other_obj.isNoneType()) {
1725 return strStripSpaceLeft(thread, str);
1726 }
1727 if (!runtime->isInstanceOfStr(*other_obj)) {
1728 return thread->raiseWithFmt(LayoutId::kTypeError,
1729 "str.lstrip() arg must be None or str");
1730 }
1731 Str chars(&scope, strUnderlying(*other_obj));
1732 return strStripLeft(thread, str, chars);
1733}
1734
1735RawObject METH(str, rstrip)(Thread* thread, Arguments args) {
1736 Runtime* runtime = thread->runtime();
1737 HandleScope scope(thread);
1738 Object self_obj(&scope, args.get(0));
1739 if (!runtime->isInstanceOfStr(*self_obj)) {
1740 return thread->raiseRequiresType(self_obj, ID(str));
1741 }
1742 Str str(&scope, strUnderlying(*self_obj));
1743 Object other_obj(&scope, args.get(1));
1744 if (other_obj.isNoneType()) {
1745 return strStripSpaceRight(thread, str);
1746 }
1747 if (!runtime->isInstanceOfStr(*other_obj)) {
1748 return thread->raiseWithFmt(LayoutId::kTypeError,
1749 "str.rstrip() arg must be None or str");
1750 }
1751 Str chars(&scope, strUnderlying(*other_obj));
1752 return strStripRight(thread, str, chars);
1753}
1754
1755RawObject METH(str, strip)(Thread* thread, Arguments args) {
1756 Runtime* runtime = thread->runtime();
1757 HandleScope scope(thread);
1758 Object self_obj(&scope, args.get(0));
1759 if (!runtime->isInstanceOfStr(*self_obj)) {
1760 return thread->raiseRequiresType(self_obj, ID(str));
1761 }
1762 Str str(&scope, strUnderlying(*self_obj));
1763 Object other_obj(&scope, args.get(1));
1764 if (other_obj.isNoneType()) {
1765 return strStripSpace(thread, str);
1766 }
1767 if (!runtime->isInstanceOfStr(*other_obj)) {
1768 return thread->raiseWithFmt(LayoutId::kTypeError,
1769 "str.strip() arg must be None or str");
1770 }
1771 Str chars(&scope, strUnderlying(*other_obj));
1772 return strStrip(thread, str, chars);
1773}
1774
1775static int32_t handleCapitalSigma(const Str& str, word i) {
1776 bool final_sigma = false;
1777 for (word j = str.offsetByCodePoints(i, -1), len; j >= 0;
1778 j = str.offsetByCodePoints(j, -1)) {
1779 int32_t code_point = str.codePointAt(j, &len);
1780 if (!Unicode::isCaseIgnorable(code_point)) {
1781 final_sigma = Unicode::isCased(code_point);
1782 break;
1783 }
1784 }
1785 if (!final_sigma) {
1786 return 0x03C3;
1787 }
1788
1789 word char_length = str.length();
1790 for (word j = str.offsetByCodePoints(i, 1), len; j < char_length; j += len) {
1791 int32_t code_point = str.codePointAt(j, &len);
1792 if (!Unicode::isCaseIgnorable(code_point)) {
1793 return Unicode::isCased(code_point) ? 0x03C3 : 0x03C2;
1794 }
1795 }
1796 return 0x03C2;
1797}
1798
1799RawObject METH(str, swapcase)(Thread* thread, Arguments args) {
1800 HandleScope scope(thread);
1801 Runtime* runtime = thread->runtime();
1802 Object self_obj(&scope, args.get(0));
1803 if (!runtime->isInstanceOfStr(*self_obj)) {
1804 return thread->raiseRequiresType(self_obj, ID(str));
1805 }
1806 Str self(&scope, strUnderlying(*self_obj));
1807 word char_length = self.length();
1808
1809 // Most of the time, this will be sufficient. However, due to Unicode casing,
1810 // it's possible that we could need up to 3 times as much space as the input.
1811 StrArray result(&scope, runtime->newStrArray());
1812 runtime->strArrayEnsureCapacity(thread, result, char_length);
1813 for (word i = 0, len; i < char_length; i += len) {
1814 int32_t code_point = self.codePointAt(i, &len);
1815 if (Unicode::isUpper(code_point)) {
1816 if (code_point == 0x03a3) {
1817 runtime->strArrayAddCodePoint(thread, result,
1818 handleCapitalSigma(self, i));
1819 } else {
1820 FullCasing lower = Unicode::toLower(code_point);
1821 for (word j = 0; j < 3; j++) {
1822 int32_t decoded = lower.code_points[j];
1823 if (decoded == -1) break;
1824 runtime->strArrayAddCodePoint(thread, result, decoded);
1825 }
1826 }
1827 } else {
1828 FullCasing upper = Unicode::toUpper(code_point);
1829 for (word j = 0; j < 3; j++) {
1830 int32_t decoded = upper.code_points[j];
1831 if (decoded == -1) break;
1832 runtime->strArrayAddCodePoint(thread, result, decoded);
1833 }
1834 }
1835 }
1836 return runtime->strFromStrArray(result);
1837}
1838
1839RawObject METH(str_iterator, __iter__)(Thread* thread, Arguments args) {
1840 HandleScope scope(thread);
1841 Object self(&scope, args.get(0));
1842 if (!self.isStrIterator()) {
1843 return thread->raiseRequiresType(self, ID(str_iterator));
1844 }
1845 return *self;
1846}
1847
1848// TODO(T35578204) Implement this for UTF-8. This probably means keeping extra
1849// state and logic so that __next__() will advance to the next codepoint.
1850
1851RawObject METH(str_iterator, __next__)(Thread* thread, Arguments args) {
1852 HandleScope scope(thread);
1853 Object self(&scope, args.get(0));
1854 if (!self.isStrIterator()) {
1855 return thread->raiseRequiresType(self, ID(str_iterator));
1856 }
1857 StrIterator iter(&scope, *self);
1858 Object value(&scope, strIteratorNext(thread, iter));
1859 if (value.isError()) {
1860 return thread->raise(LayoutId::kStopIteration, NoneType::object());
1861 }
1862 return *value;
1863}
1864
1865RawObject METH(str_iterator, __length_hint__)(Thread* thread, Arguments args) {
1866 HandleScope scope(thread);
1867 Object self(&scope, args.get(0));
1868 if (!self.isStrIterator()) {
1869 return thread->raiseRequiresType(self, ID(str_iterator));
1870 }
1871 StrIterator str_iterator(&scope, *self);
1872 Str str(&scope, str_iterator.iterable());
1873 return SmallInt::fromWord(str.length() - str_iterator.index());
1874}
1875
1876} // namespace py