this repo has no description
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
2// unicodeobject.c implementation
3#include <cerrno>
4#include <cstdarg>
5#include <cstring>
6#include <cwchar>
7
8#include "cpython-data.h"
9#include "cpython-func.h"
10
11#include "api-handle.h"
12#include "bytearray-builtins.h"
13#include "bytes-builtins.h"
14#include "handles.h"
15#include "modules.h"
16#include "objects.h"
17#include "runtime.h"
18#include "str-builtins.h"
19#include "unicode.h"
20#include "utils.h"
21
// Name of the encoding used for file system paths.
const char* Py_FileSystemDefaultEncoding = "utf-8";
// Flag paired with Py_FileSystemDefaultEncoding; always set to 1 here.
// NOTE(review): presumably mirrors the CPython global of the same name, where
// non-zero means the encoding is fixed by the runtime -- confirm.
int Py_HasFileSystemDefaultEncoding = 1;
// Name of the error handler used together with the file system encoding.
const char* Py_FileSystemDefaultEncodeErrors = "surrogatepass";
25
26namespace py {
27
28typedef byte Py_UCS1;
29typedef uint16_t Py_UCS2;
30
31static const int kMaxLongLongChars = 19; // len(str(2**63-1))
32static const int kOverallocateFactor = 4;
33
34PY_EXPORT PyTypeObject* PyUnicodeIter_Type_Ptr() {
35 Runtime* runtime = Thread::current()->runtime();
36 return reinterpret_cast<PyTypeObject*>(ApiHandle::borrowedReference(
37 runtime, runtime->typeAt(LayoutId::kStrIterator)));
38}
39
40static RawObject symbolFromError(Thread* thread, const char* error) {
41 Runtime* runtime = thread->runtime();
42 Symbols* symbols = runtime->symbols();
43 if (error == nullptr || std::strcmp(error, "strict") == 0) {
44 return symbols->at(ID(strict));
45 }
46 if (std::strcmp(error, "ignore") == 0) {
47 return symbols->at(ID(ignore));
48 }
49 if (std::strcmp(error, "replace") == 0) {
50 return symbols->at(ID(replace));
51 }
52 return Runtime::internStrFromCStr(thread, error);
53}
54
55PY_EXPORT void PyUnicode_WRITE_Func(enum PyUnicode_Kind kind, void* data,
56 Py_ssize_t index, Py_UCS4 value) {
57 if (kind == PyUnicode_1BYTE_KIND) {
58 static_cast<Py_UCS1*>(data)[index] = static_cast<Py_UCS1>(value);
59 } else if (kind == PyUnicode_2BYTE_KIND) {
60 static_cast<Py_UCS2*>(data)[index] = static_cast<Py_UCS2>(value);
61 } else {
62 DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND");
63 static_cast<Py_UCS4*>(data)[index] = static_cast<Py_UCS4>(value);
64 }
65}
66
67PY_EXPORT void _PyUnicodeWriter_Dealloc(_PyUnicodeWriter* writer) {
68 PyMem_Free(writer->data);
69}
70
71PY_EXPORT PyObject* _PyUnicodeWriter_Finish(_PyUnicodeWriter* writer) {
72 Thread* thread = Thread::current();
73 HandleScope scope(thread);
74 Runtime* runtime = thread->runtime();
75 Str str(&scope, runtime->newStrFromUTF32(View<int32_t>(
76 static_cast<int32_t*>(writer->data), writer->pos)));
77 PyMem_Free(writer->data);
78 return ApiHandle::newReference(runtime, *str);
79}
80
81PY_EXPORT void _PyUnicodeWriter_Init(_PyUnicodeWriter* writer) {
82 std::memset(writer, 0, sizeof(*writer));
83 writer->kind = PyUnicode_4BYTE_KIND;
84}
85
86static int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter* writer,
87 Py_ssize_t length,
88 Py_UCS4 /* maxchar */) {
89 writer->maxchar = kMaxUnicode;
90 if (length > kMaxWord - writer->pos) {
91 Thread::current()->raiseMemoryError();
92 return -1;
93 }
94 Py_ssize_t newlen = writer->pos + length;
95 if (writer->data == nullptr) {
96 if (writer->overallocate &&
97 newlen <= (kMaxWord - newlen / kOverallocateFactor)) {
98 // overallocate to limit the number of realloc()
99 newlen += newlen / kOverallocateFactor;
100 }
101 writer->data = PyMem_Malloc(newlen * sizeof(int32_t));
102 if (writer->data == nullptr) return -1;
103 } else if (newlen > writer->size) {
104 if (writer->overallocate &&
105 newlen <= (kMaxWord - newlen / kOverallocateFactor)) {
106 // overallocate to limit the number of realloc()
107 newlen += newlen / kOverallocateFactor;
108 }
109 writer->data = PyMem_Realloc(writer->data, newlen * sizeof(int32_t));
110 if (writer->data == nullptr) return -1;
111 }
112 writer->size = newlen;
113 return 0;
114}
115
116PY_EXPORT int _PyUnicodeWriter_Prepare(_PyUnicodeWriter* writer,
117 Py_ssize_t length, Py_UCS4 maxchar) {
118 if (length <= writer->size - writer->pos || length == 0) return 0;
119 return _PyUnicodeWriter_PrepareInternal(writer, length, maxchar);
120}
121
122PY_EXPORT int _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter* writer,
123 const char* ascii,
124 Py_ssize_t len) {
125 if (len == -1) len = std::strlen(ascii);
126 if (writer->data == nullptr && !writer->overallocate) {
127 writer->data = PyMem_Malloc(len * sizeof(int32_t));
128 writer->size = len;
129 }
130
131 if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1;
132 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
133 for (Py_ssize_t i = 0; i < len; ++i) {
134 CHECK(ascii[i] >= 0, "_PyUnicodeWriter_WriteASCIIString only takes ASCII");
135 data[writer->pos++] = static_cast<uint8_t>(ascii[i]);
136 }
137 return 0;
138}
139
140PY_EXPORT int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter* writer,
141 Py_UCS4 ch) {
142 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) return -1;
143 PyUnicode_WRITE(PyUnicode_4BYTE_KIND, writer->data, writer->pos, ch);
144 writer->pos++;
145 return 0;
146}
147
148PY_EXPORT int _PyUnicodeWriter_WriteChar(_PyUnicodeWriter* writer, Py_UCS4 ch) {
149 return _PyUnicodeWriter_WriteCharInline(writer, ch);
150}
151
152PY_EXPORT int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter* writer,
153 const char* str,
154 Py_ssize_t len) {
155 if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1;
156 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
157 for (Py_ssize_t i = 0; i < len; ++i) {
158 data[writer->pos++] = static_cast<uint8_t>(str[i]);
159 }
160 return 0;
161}
162
163PY_EXPORT int _PyUnicodeWriter_WriteStr(_PyUnicodeWriter* writer,
164 PyObject* str) {
165 Thread* thread = Thread::current();
166 HandleScope scope(thread);
167 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
168 Str src(&scope, strUnderlying(*obj));
169 Py_ssize_t codepoints = src.codePointLength();
170 if (_PyUnicodeWriter_Prepare(writer, codepoints, kMaxUnicode) == -1) {
171 return -1;
172 }
173 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
174 for (word i = 0, len = src.length(), cp_len; i < len; i += cp_len) {
175 int32_t cp = src.codePointAt(i, &cp_len);
176 data[writer->pos++] = cp;
177 }
178 return 0;
179}
180
181PY_EXPORT int _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter* writer,
182 PyObject* str, Py_ssize_t start,
183 Py_ssize_t end) {
184 if (end == 0) return 0;
185 Py_ssize_t len = end - start;
186 if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) < 0) return -1;
187
188 Thread* thread = Thread::current();
189 HandleScope scope(thread);
190 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
191 Str src(&scope, strUnderlying(*obj));
192 word start_index = thread->strOffset(src, start);
193 DCHECK_BOUND(start_index, src.length());
194 word end_index = thread->strOffset(src, end);
195 DCHECK_BOUND(end_index, src.length());
196 Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
197 for (word i = start_index, cp_len; i < end_index; i += cp_len) {
198 int32_t cp = src.codePointAt(i, &cp_len);
199 data[writer->pos++] = cp;
200 }
201 return 0;
202}
203
204// Facebook: D13491655
205// Most of the following helper functions, along with PyUnicode_FromFormat and
206// PyUnicode_FromFormatV are directly imported from CPython. The following
207// modifications have been made:
208//
209// - Since our internal strings are always UTF-8, we don't need maxchar or any
210// of the helper functions required to calculate it
211//
212// - Since our strings are immutable, we can't use PyUnicode_Fill. However,
213// since the helper functions always use it to append to strings, we can get
214// away with just writing characters in a loop.
215//
216// - Since our internal strings are always UTF-8, there is no need to check
217// a character's 'Kind' before writing it to a string
218static int writeStr(_PyUnicodeWriter* writer, PyObject* str, Py_ssize_t width,
219 Py_ssize_t precision) {
220 if (PyUnicode_READY(str) == -1) return -1;
221
222 Py_ssize_t length = PyUnicode_GET_LENGTH(str);
223 if ((precision == -1 || precision >= length) && width <= length) {
224 return _PyUnicodeWriter_WriteStr(writer, str);
225 }
226
227 if (precision != -1) length = Py_MIN(precision, length);
228
229 Py_ssize_t arglen = Py_MAX(length, width);
230 // Facebook: Our internal strings are always UTF-8, don't need maxchar
231 // (D13491655)
232 if (_PyUnicodeWriter_Prepare(writer, arglen, 0) == -1) return -1;
233
234 if (width > length) {
235 Py_ssize_t fill = width - length;
236 // Facebook: Our internal strings are immutable, can't use PyUnicode_Fill
237 // (D13491655)
238 for (Py_ssize_t i = 0; i < fill; ++i) {
239 if (_PyUnicodeWriter_WriteCharInline(writer, ' ') == -1) return -1;
240 }
241 }
242 // Facebook: Since we only have one internal representation, we don't have
243 // to worry about changing a string's 'Kind' (D13491655)
244 return _PyUnicodeWriter_WriteSubstring(writer, str, 0, length);
245}
246
247static int writeCStr(_PyUnicodeWriter* writer, const char* str,
248 Py_ssize_t width, Py_ssize_t precision) {
249 Py_ssize_t length = std::strlen(str);
250 if (precision != -1) length = Py_MIN(length, precision);
251 PyObject* unicode =
252 PyUnicode_DecodeUTF8Stateful(str, length, "replace", nullptr);
253 if (unicode == nullptr) return -1;
254
255 int res = writeStr(writer, unicode, width, -1);
256 Py_DECREF(unicode);
257 return res;
258}
259
260static const char* writeArg(_PyUnicodeWriter* writer, const char* f,
261 va_list* vargs) {
262 const char* p = f;
263 f++;
264 int zeropad = 0;
265 if (*f == '0') {
266 zeropad = 1;
267 f++;
268 }
269
270 // parse the width.precision part, e.g. "%2.5s" => width=2, precision=5
271 Py_ssize_t width = -1;
272 if (Py_ISDIGIT(static_cast<unsigned>(*f))) {
273 width = *f - '0';
274 f++;
275 while (Py_ISDIGIT(static_cast<unsigned>(*f))) {
276 if (width > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) {
277 Thread::current()->raiseWithFmt(LayoutId::kValueError, "width too big");
278 return nullptr;
279 }
280 width = (width * 10) + (*f - '0');
281 f++;
282 }
283 }
284 Py_ssize_t precision = -1;
285 if (*f == '.') {
286 f++;
287 if (Py_ISDIGIT(static_cast<unsigned>(*f))) {
288 precision = (*f - '0');
289 f++;
290 while (Py_ISDIGIT(static_cast<unsigned>(*f))) {
291 if (precision > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) {
292 Thread::current()->raiseWithFmt(LayoutId::kValueError,
293 "precision too big");
294 return nullptr;
295 }
296 precision = (precision * 10) + (*f - '0');
297 f++;
298 }
299 }
300 if (*f == '%') {
301 // "%.3%s" => f points to "3"
302 f--;
303 }
304 }
305 if (*f == '\0') {
306 // bogus format "%.123" => go backward, f points to "3"
307 f--;
308 }
309
310 // Handle %ld, %lu, %lld and %llu.
311 int longflag = 0;
312 int longlongflag = 0;
313 int size_tflag = 0;
314 if (*f == 'l') {
315 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
316 longflag = 1;
317 ++f;
318 } else if (f[1] == 'l' && (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
319 longlongflag = 1;
320 f += 2;
321 }
322 }
323 // handle the size_t flag.
324 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
325 size_tflag = 1;
326 ++f;
327 }
328
329 if (f[1] == '\0') writer->overallocate = 0;
330
331 switch (*f) {
332 case 'c': {
333 int ordinal = va_arg(*vargs, int);
334 if (ordinal < 0 || ordinal > kMaxUnicode) {
335 Thread::current()->raiseWithFmt(
336 LayoutId::kOverflowError,
337 "character argument not in range(0x110000)");
338 return nullptr;
339 }
340 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) return nullptr;
341 break;
342 }
343
344 case 'i':
345 case 'd':
346 case 'u':
347 case 'x': {
348 // used by sprintf
349 char buffer[kMaxLongLongChars];
350 Py_ssize_t len;
351
352 if (*f == 'u') {
353 if (longflag) {
354 len = std::sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
355 } else if (longlongflag) {
356 len =
357 std::sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
358 } else if (size_tflag) {
359 len = std::sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
360 va_arg(*vargs, size_t));
361 } else {
362 len = std::sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
363 }
364 } else if (*f == 'x') {
365 len = std::sprintf(buffer, "%x", va_arg(*vargs, int));
366 } else {
367 if (longflag) {
368 len = std::sprintf(buffer, "%li", va_arg(*vargs, long));
369 } else if (longlongflag) {
370 len = std::sprintf(buffer, "%lli", va_arg(*vargs, long long));
371 } else if (size_tflag) {
372 len = std::sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
373 va_arg(*vargs, Py_ssize_t));
374 } else {
375 len = std::sprintf(buffer, "%i", va_arg(*vargs, int));
376 }
377 }
378 DCHECK(len >= 0, "len must be >= 0");
379
380 if (precision < len) precision = len;
381
382 Py_ssize_t arglen = Py_MAX(precision, width);
383 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) return nullptr;
384
385 if (width > precision) {
386 Py_ssize_t fill = width - precision;
387 Py_UCS4 fillchar = zeropad ? '0' : ' ';
388 // Facebook: Our internal strings are immutable, can't use
389 // PyUnicode_Fill (D13491655)
390 for (Py_ssize_t i = 0; i < fill; ++i) {
391 if (_PyUnicodeWriter_WriteCharInline(writer, fillchar) == -1) {
392 return nullptr;
393 }
394 }
395 }
396 if (precision > len) {
397 Py_ssize_t fill = precision - len;
398 // Facebook: Our internal strings are immutable, can't use
399 // PyUnicode_Fill (D13491655)
400 for (Py_ssize_t i = 0; i < fill; ++i) {
401 if (_PyUnicodeWriter_WriteCharInline(writer, '0') == -1) {
402 return nullptr;
403 }
404 }
405 }
406
407 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) {
408 return nullptr;
409 }
410 break;
411 }
412
413 case 'p': {
414 char number[kMaxLongLongChars];
415
416 Py_ssize_t len = std::sprintf(number, "%p", va_arg(*vargs, void*));
417 DCHECK(len >= 0, "len must be >= 0");
418
419 // %p is ill-defined: ensure leading 0x.
420 if (number[1] == 'X') {
421 number[1] = 'x';
422 } else if (number[1] != 'x') {
423 std::memmove(number + 2, number, std::strlen(number) + 1);
424 number[0] = '0';
425 number[1] = 'x';
426 len += 2;
427 }
428
429 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) {
430 return nullptr;
431 }
432 break;
433 }
434
435 case 's': {
436 // UTF-8
437 const char* s = va_arg(*vargs, const char*);
438 if (writeCStr(writer, s, width, precision) < 0) {
439 return nullptr;
440 }
441 break;
442 }
443
444 case 'U': {
445 PyObject* obj = va_arg(*vargs, PyObject*);
446 // This used to call _PyUnicode_CHECK, which is deprecated, and which we
447 // have not imported.
448 DCHECK(obj, "obj must not be null");
449
450 if (writeStr(writer, obj, width, precision) == -1) {
451 return nullptr;
452 }
453 break;
454 }
455
456 case 'V': {
457 PyObject* obj = va_arg(*vargs, PyObject*);
458 const char* str = va_arg(*vargs, const char*);
459 if (obj) {
460 // This used to DCHECK _PyUnicode_CHECK, which is deprecated, and which
461 // we have not imported.
462 if (writeStr(writer, obj, width, precision) == -1) {
463 return nullptr;
464 }
465 } else {
466 DCHECK(str != nullptr, "str must not be null");
467 if (writeCStr(writer, str, width, precision) < 0) {
468 return nullptr;
469 }
470 }
471 break;
472 }
473
474 case 'S': {
475 PyObject* obj = va_arg(*vargs, PyObject*);
476 DCHECK(obj, "obj must not be null");
477 PyObject* str = PyObject_Str(obj);
478 if (!str) return nullptr;
479 if (writeStr(writer, str, width, precision) == -1) {
480 Py_DECREF(str);
481 return nullptr;
482 }
483 Py_DECREF(str);
484 break;
485 }
486
487 case 'R': {
488 PyObject* obj = va_arg(*vargs, PyObject*);
489 DCHECK(obj, "obj must not be null");
490 PyObject* repr = PyObject_Repr(obj);
491 if (!repr) return nullptr;
492 if (writeStr(writer, repr, width, precision) == -1) {
493 Py_DECREF(repr);
494 return nullptr;
495 }
496 Py_DECREF(repr);
497 break;
498 }
499
500 case 'A': {
501 PyObject* obj = va_arg(*vargs, PyObject*);
502 DCHECK(obj, "obj must not be null");
503 PyObject* ascii = PyObject_ASCII(obj);
504 if (!ascii) return nullptr;
505 if (writeStr(writer, ascii, width, precision) == -1) {
506 Py_DECREF(ascii);
507 return nullptr;
508 }
509 Py_DECREF(ascii);
510 break;
511 }
512
513 case '%':
514 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) return nullptr;
515 break;
516
517 default: {
518 // if we stumble upon an unknown formatting code, copy the rest
519 // of the format string to the output string. (we cannot just
520 // skip the code, since there's no way to know what's in the
521 // argument list)
522 Py_ssize_t len = std::strlen(p);
523 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) {
524 return nullptr;
525 }
526 f = p + len;
527 return f;
528 }
529 }
530
531 f++;
532 return f;
533}
534
535PY_EXPORT int _PyUnicode_EqualToASCIIString(PyObject* unicode,
536 const char* c_str) {
537 DCHECK(unicode, "nullptr argument");
538 DCHECK(c_str, "nullptr argument");
539 RawObject obj = ApiHandle::asObject(ApiHandle::fromPyObject(unicode));
540 DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj),
541 "non-str argument");
542 return strUnderlying(obj).equalsCStr(c_str);
543}
544
545PY_EXPORT int _PyUnicode_EQ(PyObject* aa, PyObject* bb) {
546 Thread* thread = Thread::current();
547 HandleScope scope(thread);
548 Object obj_aa(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(aa)));
549 Object obj_bb(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(bb)));
550 Str lhs(&scope, strUnderlying(*obj_aa));
551 Str rhs(&scope, strUnderlying(*obj_bb));
552 return lhs.equals(*rhs);
553}
554
555PY_EXPORT size_t Py_UNICODE_strlen(const Py_UNICODE* u) {
556 DCHECK(u != nullptr, "u should not be null");
557 return std::wcslen(u);
558}
559
560PY_EXPORT int _PyUnicode_Ready(PyObject* /* unicode */) { return 0; }
561
562PY_EXPORT int PyUnicode_CheckExact_Func(PyObject* obj) {
563 return ApiHandle::asObject(ApiHandle::fromPyObject(obj)).isStr();
564}
565
566PY_EXPORT int PyUnicode_Check_Func(PyObject* obj) {
567 return Thread::current()->runtime()->isInstanceOfStr(
568 ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
569}
570
571PY_EXPORT PyObject* PyUnicode_FromString(const char* c_string) {
572 Runtime* runtime = Thread::current()->runtime();
573 return ApiHandle::newReference(runtime, runtime->newStrFromCStr(c_string));
574}
575
576// Look for a surrogate codepoint in str[start:]. Note that start is a byte
577// offset. Return the first index found in that range, or -1 if not found.
578static word strFindSurrogateCodepoint(const Str& str, word start) {
579 word length = str.length();
580 word byte_index = start;
581 while (byte_index < length) {
582 word num_bytes;
583 int32_t codepoint = str.codePointAt(byte_index, &num_bytes);
584 if (Unicode::isSurrogate(codepoint)) {
585 return byte_index;
586 }
587 byte_index += num_bytes;
588 }
589 return -1;
590}
591
592PY_EXPORT const char* PyUnicode_AsUTF8AndSize(PyObject* pyunicode,
593 Py_ssize_t* size) {
594 Thread* thread = Thread::current();
595 if (pyunicode == nullptr) {
596 thread->raiseBadArgument();
597 return nullptr;
598 }
599
600 HandleScope scope(thread);
601 ApiHandle* handle = ApiHandle::fromPyObject(pyunicode);
602 Object obj(&scope, ApiHandle::asObject(handle));
603 Runtime* runtime = thread->runtime();
604 if (!runtime->isInstanceOfStr(*obj)) {
605 thread->raiseBadInternalCall();
606 return nullptr;
607 }
608
609 Str str(&scope, strUnderlying(*obj));
610 word length = str.length();
611 if (size != nullptr) *size = length;
612 if (void* cache = ApiHandle::cache(runtime, handle)) {
613 return static_cast<char*>(cache);
614 }
615
616 word surr_index = strFindSurrogateCodepoint(str, 0);
617 if (surr_index != -1) {
618 Object encoding(&scope, SmallStr::fromCStr("utf-8"));
619 Object start(&scope, SmallInt::fromWord(surr_index));
620 Object end(&scope, SmallInt::fromWord(surr_index + 1));
621 Object reason(&scope, runtime->newStrFromCStr("surrogates not allowed"));
622 Object exc(&scope,
623 thread->invokeFunction5(ID(builtins), ID(UnicodeEncodeError),
624 encoding, str, start, end, reason));
625 Object err(&scope,
626 thread->invokeFunction1(ID(_codecs), ID(strict_errors), exc));
627 DCHECK(err.isErrorException(),
628 "_codecs.strict_errors should raise an exception");
629 return nullptr;
630 }
631
632 byte* result = static_cast<byte*>(std::malloc(length + 1));
633 str.copyTo(result, length);
634 result[length] = '\0';
635 ApiHandle::setCache(runtime, handle, result);
636 ApiHandle::setBorrowedNoImmediate(handle);
637 return reinterpret_cast<char*>(result);
638}
639
640PY_EXPORT const char* PyUnicode_AsUTF8(PyObject* unicode) {
641 return PyUnicode_AsUTF8AndSize(unicode, nullptr);
642}
643
644PY_EXPORT PyObject* PyUnicode_FromStringAndSize(const char* u,
645 Py_ssize_t size) {
646 Thread* thread = Thread::current();
647
648 if (size < 0) {
649 thread->raiseWithFmt(LayoutId::kSystemError,
650 "Negative size passed to PyUnicode_FromStringAndSize");
651 return nullptr;
652 }
653 if (u == nullptr && size != 0) {
654 // TODO(T36562134): Implement _PyUnicode_New
655 UNIMPLEMENTED("_PyUnicode_New");
656 }
657 const byte* data = reinterpret_cast<const byte*>(u);
658 Runtime* runtime = thread->runtime();
659 return ApiHandle::newReference(
660 runtime, runtime->newStrWithAll(View<byte>(data, size)));
661}
662
663PY_EXPORT PyObject* PyUnicode_EncodeFSDefault(PyObject* unicode) {
664 // TODO(T40363016): Allow arbitrary encodings instead of defaulting to utf-8
665 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
666}
667
668PY_EXPORT PyObject* PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) {
669 Thread* thread = Thread::current();
670 // Since CPython optimizes for empty string, we must do so as well to make
671 // sure we don't fail if maxchar is invalid
672 if (size == 0) {
673 return ApiHandle::newReference(thread->runtime(), Str::empty());
674 }
675 if (maxchar > kMaxUnicode) {
676 thread->raiseWithFmt(LayoutId::kSystemError,
677 "invalid maximum character passed to PyUnicode_New");
678 return nullptr;
679 }
680 if (size < 0) {
681 thread->raiseWithFmt(LayoutId::kSystemError,
682 "Negative size passed to PyUnicode_New");
683 return nullptr;
684 }
685 // TODO(T41498010): Add modifiable string state
686 UNIMPLEMENTED("Cannot create mutable strings yet");
687}
688
689PY_EXPORT void PyUnicode_Append(PyObject** p_left, PyObject* right) {
690 if (p_left == nullptr) {
691 if (!PyErr_Occurred()) {
692 PyErr_BadInternalCall();
693 }
694 return;
695 }
696
697 PyObject* left = *p_left;
698 if (left == nullptr || right == nullptr || !PyUnicode_Check(left) ||
699 !PyUnicode_Check(right)) {
700 if (!PyErr_Occurred()) {
701 PyErr_BadInternalCall();
702 }
703 Py_CLEAR(*p_left);
704 return;
705 }
706 *p_left = PyUnicode_Concat(left, right);
707 Py_DECREF(left);
708}
709
710PY_EXPORT void PyUnicode_AppendAndDel(PyObject** p_left, PyObject* right) {
711 PyUnicode_Append(p_left, right);
712 Py_XDECREF(right);
713}
714
715PY_EXPORT PyObject* _PyUnicode_AsASCIIString(PyObject* unicode,
716 const char* errors) {
717 DCHECK(unicode != nullptr, "unicode cannot be null");
718 Thread* thread = Thread::current();
719 HandleScope scope(thread);
720 Runtime* runtime = thread->runtime();
721 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
722 if (!runtime->isInstanceOfStr(*str)) {
723 thread->raiseBadArgument();
724 return nullptr;
725 }
726 Object errors_obj(&scope, symbolFromError(thread, errors));
727 Object tuple_obj(&scope, thread->invokeFunction2(
728 ID(_codecs), ID(ascii_encode), str, errors_obj));
729 if (tuple_obj.isError()) {
730 return nullptr;
731 }
732 Tuple tuple(&scope, *tuple_obj);
733 return ApiHandle::newReference(runtime, tuple.at(0));
734}
735
736PY_EXPORT PyObject* PyUnicode_AsASCIIString(PyObject* unicode) {
737 return _PyUnicode_AsASCIIString(unicode, "strict");
738}
739
// Not supported yet: both arguments are ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsCharmapString(PyObject* /* e */,
                                              PyObject* /* g */) {
  UNIMPLEMENTED("PyUnicode_AsCharmapString");
}
744
// Not supported yet: all arguments are ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsDecodedObject(PyObject* /* e */,
                                              const char* /* g */,
                                              const char* /* s */) {
  UNIMPLEMENTED("PyUnicode_AsDecodedObject");
}
750
// Not supported yet: all arguments are ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsDecodedUnicode(PyObject* /* e */,
                                               const char* /* g */,
                                               const char* /* s */) {
  UNIMPLEMENTED("PyUnicode_AsDecodedUnicode");
}
756
// Not supported yet: all arguments are ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsEncodedObject(PyObject* /* e */,
                                              const char* /* g */,
                                              const char* /* s */) {
  UNIMPLEMENTED("PyUnicode_AsEncodedObject");
}
762
763PY_EXPORT PyObject* PyUnicode_AsEncodedString(PyObject* unicode,
764 const char* encoding,
765 const char* errors) {
766 DCHECK(unicode != nullptr, "unicode cannot be null");
767 if (encoding == nullptr) {
768 return _PyUnicode_AsUTF8String(unicode, errors);
769 }
770 Thread* thread = Thread::current();
771 HandleScope scope(thread);
772 Runtime* runtime = thread->runtime();
773 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
774 if (!runtime->isInstanceOfStr(*str)) {
775 thread->raiseBadArgument();
776 return nullptr;
777 }
778 Object encoding_obj(&scope, runtime->newStrFromCStr(encoding));
779 Object errors_obj(&scope, errors == nullptr
780 ? Unbound::object()
781 : symbolFromError(thread, errors));
782 Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(encode), str,
783 encoding_obj, errors_obj));
784 if (result.isError()) {
785 return nullptr;
786 }
787 if (runtime->isInstanceOfBytes(*result)) {
788 return ApiHandle::newReference(runtime, *result);
789 }
790 if (runtime->isInstanceOfBytearray(*result)) {
791 // Equivalent to calling PyErr_WarnFormat
792 if (!ensureBuiltinModuleById(thread, ID(warnings)).isErrorException()) {
793 Object category(&scope, runtime->typeAt(LayoutId::kRuntimeWarning));
794 Object message(&scope,
795 runtime->newStrFromFmt(
796 "encoder %s returned bytearray instead of bytes; "
797 "use codecs.encode() to encode to arbitrary types",
798 encoding));
799 Object stack_level(&scope, runtime->newInt(1));
800 Object source(&scope, NoneType::object());
801 Object err(&scope,
802 thread->invokeFunction4(ID(warnings), ID(warn), message,
803 category, stack_level, source));
804 if (err.isErrorException()) {
805 thread->clearPendingException();
806 }
807 }
808 Bytearray result_bytearray(&scope, *result);
809 return ApiHandle::newReference(runtime,
810 bytearrayAsBytes(thread, result_bytearray));
811 }
812 thread->raiseWithFmt(LayoutId::kTypeError,
813 "'%s' encoder returned '%T' instead of 'bytes'; "
814 "use codecs.encode() to encode to arbitrary types",
815 encoding, *result);
816 return nullptr;
817}
818
// Not supported yet: all arguments are ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsEncodedUnicode(PyObject* /* e */,
                                               const char* /* g */,
                                               const char* /* s */) {
  UNIMPLEMENTED("PyUnicode_AsEncodedUnicode");
}
824
825PY_EXPORT PyObject* _PyUnicode_AsLatin1String(PyObject* unicode,
826 const char* errors) {
827 DCHECK(unicode != nullptr, "unicode cannot be null");
828 Thread* thread = Thread::current();
829 HandleScope scope(thread);
830 Runtime* runtime = thread->runtime();
831 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
832 if (!runtime->isInstanceOfStr(*str)) {
833 thread->raiseBadArgument();
834 return nullptr;
835 }
836 Object errors_obj(&scope, symbolFromError(thread, errors));
837 Object tuple_obj(&scope,
838 thread->invokeFunction2(ID(_codecs), ID(latin_1_encode), str,
839 errors_obj));
840 if (tuple_obj.isError()) {
841 return nullptr;
842 }
843 Tuple tuple(&scope, *tuple_obj);
844 return ApiHandle::newReference(runtime, tuple.at(0));
845}
846
847PY_EXPORT PyObject* PyUnicode_AsLatin1String(PyObject* unicode) {
848 return _PyUnicode_AsLatin1String(unicode, "strict");
849}
850
// Not supported yet: the argument is ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsMBCSString(PyObject* /* e */) {
  UNIMPLEMENTED("PyUnicode_AsMBCSString");
}
854
// Not supported yet: the argument is ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsRawUnicodeEscapeString(PyObject* /* e */) {
  UNIMPLEMENTED("PyUnicode_AsRawUnicodeEscapeString");
}
858
859PY_EXPORT Py_UCS4* PyUnicode_AsUCS4(PyObject* u, Py_UCS4* buffer,
860 Py_ssize_t buflen, int copy_null) {
861 if (buffer == nullptr || buflen < 0) {
862 PyErr_BadInternalCall();
863 return nullptr;
864 }
865
866 Thread* thread = Thread::current();
867 HandleScope scope(thread);
868 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(u)));
869 if (!thread->runtime()->isInstanceOfStr(*obj)) {
870 thread->raiseBadArgument();
871 }
872
873 Str str(&scope, strUnderlying(*obj));
874 word num_codepoints = str.codePointLength();
875 word target_buflen = copy_null ? num_codepoints + 1 : num_codepoints;
876 if (buflen < target_buflen) {
877 thread->raiseWithFmt(LayoutId::kSystemError,
878 "string is longer than the buffer");
879 if (copy_null != 0 && 0 < buflen) {
880 buffer[0] = 0;
881 }
882 return nullptr;
883 }
884
885 for (word i = 0, offset = 0; i < num_codepoints; i++) {
886 word num_bytes;
887 buffer[i] = str.codePointAt(offset, &num_bytes);
888 offset += num_bytes;
889 }
890 if (copy_null != 0) buffer[num_codepoints] = 0;
891
892 return buffer;
893}
894
895PY_EXPORT Py_UCS4* PyUnicode_AsUCS4Copy(PyObject* str) {
896 Py_ssize_t len = PyUnicode_GET_LENGTH(str) + 1;
897 Py_UCS4* result = static_cast<Py_UCS4*>(PyMem_Malloc(len * sizeof(Py_UCS4)));
898 if (result == nullptr) {
899 PyErr_NoMemory();
900 return nullptr;
901 }
902 return PyUnicode_AsUCS4(str, result, len, 1);
903}
904
905PY_EXPORT PyObject* PyUnicode_AsUTF16String(PyObject* unicode) {
906 return _PyUnicode_EncodeUTF16(unicode, nullptr, 0);
907}
908
909PY_EXPORT PyObject* PyUnicode_AsUTF32String(PyObject* unicode) {
910 return _PyUnicode_EncodeUTF32(unicode, nullptr, 0);
911}
912
913PY_EXPORT PyObject* PyUnicode_AsUTF8String(PyObject* unicode) {
914 return _PyUnicode_AsUTF8String(unicode, "strict");
915}
916
// Not supported yet: the argument is ignored and any call reaches
// UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsUnicodeEscapeString(PyObject* /* e */) {
  UNIMPLEMENTED("PyUnicode_AsUnicodeEscapeString");
}
920
921PY_EXPORT Py_ssize_t PyUnicode_AsWideChar(PyObject* str, wchar_t* result,
922 Py_ssize_t size) {
923 Thread* thread = Thread::current();
924 if (str == nullptr) {
925 thread->raiseBadInternalCall();
926 return -1;
927 }
928 HandleScope scope(thread);
929 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
930 Runtime* runtime = thread->runtime();
931 if (!runtime->isInstanceOfStr(*str_obj)) {
932 thread->raiseWithFmt(
933 LayoutId::kTypeError,
934 "PyUnicode_AsWideChar requires 'str' object but received a '%T'",
935 &str_obj);
936 return -1;
937 }
938 Str str_str(&scope, strUnderlying(*str_obj));
939 Py_ssize_t num_code_points = str_str.codePointLength();
940 if (size > num_code_points) {
941 size = num_code_points + 1;
942 } else {
943 num_code_points = size;
944 }
945
946 {
947 word byte_count = str_str.length();
948 for (word byte_index = 0, wchar_index = 0, num_bytes = 0;
949 byte_index < byte_count && wchar_index < size;
950 byte_index += num_bytes, wchar_index += 1) {
951 int32_t cp = str_str.codePointAt(byte_index, &num_bytes);
952 static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t");
953 if (result != nullptr) {
954 result[wchar_index] = static_cast<wchar_t>(cp);
955 }
956 }
957 if (num_code_points < size) {
958 result[num_code_points] = '\0';
959 }
960 }
961
962 return num_code_points;
963}
964
965PY_EXPORT wchar_t* PyUnicode_AsWideCharString(PyObject* str,
966 Py_ssize_t* result_len) {
967 Thread* thread = Thread::current();
968 if (str == nullptr) {
969 thread->raiseBadInternalCall();
970 return nullptr;
971 }
972 HandleScope scope(thread);
973 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
974 Runtime* runtime = thread->runtime();
975 if (!runtime->isInstanceOfStr(*str_obj)) {
976 thread->raiseWithFmt(
977 LayoutId::kTypeError,
978 "PyUnicode_AsWideChar requires 'str' object but received a '%T'",
979 &str_obj);
980 return nullptr;
981 }
982 Str str_str(&scope, strUnderlying(*str_obj));
983 word length = str_str.codePointLength();
984 wchar_t* result =
985 static_cast<wchar_t*>(PyMem_Malloc((length + 1) * sizeof(wchar_t)));
986 if (result == nullptr) {
987 thread->raiseMemoryError();
988 return nullptr;
989 }
990
991 {
992 word byte_count = str_str.length();
993 for (word byte_index = 0, wchar_index = 0, num_bytes = 0;
994 byte_index < byte_count && wchar_index < length + 1;
995 byte_index += num_bytes, wchar_index += 1) {
996 int32_t cp = str_str.codePointAt(byte_index, &num_bytes);
997 if (cp == '\0') {
998 PyMem_Free(result);
999 thread->raiseWithFmt(LayoutId::kValueError, "embedded null character");
1000 return nullptr;
1001 }
1002 static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t");
1003 result[wchar_index] = static_cast<wchar_t>(cp);
1004 }
1005 result[length] = '\0';
1006 }
1007
1008 if (result_len != nullptr) {
1009 *result_len = length;
1010 }
1011 return result;
1012}
1013
1014PY_EXPORT PyObject* PyUnicode_BuildEncodingMap(PyObject* /* g */) {
1015 UNIMPLEMENTED("PyUnicode_BuildEncodingMap");
1016}
1017
1018PY_EXPORT int PyUnicode_Compare(PyObject* left, PyObject* right) {
1019 Thread* thread = Thread::current();
1020 if (left == nullptr || right == nullptr) {
1021 thread->raiseBadInternalCall();
1022 return -1;
1023 }
1024
1025 Runtime* runtime = thread->runtime();
1026 HandleScope scope(thread);
1027 Object left_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(left)));
1028 Object right_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(right)));
1029 if (runtime->isInstanceOfStr(*left_obj) &&
1030 runtime->isInstanceOfStr(*right_obj)) {
1031 Str left_str(&scope, strUnderlying(*left_obj));
1032 Str right_str(&scope, strUnderlying(*right_obj));
1033 word result = left_str.compare(*right_str);
1034 return result > 0 ? 1 : (result < 0 ? -1 : 0);
1035 }
1036 thread->raiseWithFmt(LayoutId::kTypeError, "Can't compare %T and %T",
1037 &left_obj, &right_obj);
1038 return -1;
1039}
1040
1041PY_EXPORT int PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) {
1042 Thread* thread = Thread::current();
1043 HandleScope scope(thread);
1044 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(uni)));
1045 Str str_obj(&scope, strUnderlying(*obj));
1046 // TODO(atalaba): Allow for proper comparison against Latin-1 strings. For
1047 // example, in CPython: "\xC3\xA9" (UTF-8) == "\xE9" (Latin-1), and
1048 // "\xE9 longer" > "\xC3\xA9".
1049 return str_obj.compareCStr(str);
1050}
1051
1052PY_EXPORT PyObject* PyUnicode_Concat(PyObject* left, PyObject* right) {
1053 Thread* thread = Thread::current();
1054 HandleScope scope(thread);
1055 Runtime* runtime = thread->runtime();
1056
1057 Object left_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(left)));
1058 Object right_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(right)));
1059 if (!runtime->isInstanceOfStr(*left_obj) ||
1060 !runtime->isInstanceOfStr(*right_obj)) {
1061 thread->raiseWithFmt(LayoutId::kTypeError,
1062 "can only concatenate str to str");
1063 return nullptr;
1064 }
1065 Str left_str(&scope, strUnderlying(*left_obj));
1066 Str right_str(&scope, strUnderlying(*right_obj));
1067 word dummy;
1068 if (__builtin_add_overflow(left_str.length(), right_str.length(), &dummy)) {
1069 thread->raiseWithFmt(LayoutId::kOverflowError,
1070 "strings are too large to concat");
1071 return nullptr;
1072 }
1073 return ApiHandle::newReference(
1074 runtime, runtime->strConcat(thread, left_str, right_str));
1075}
1076
1077PY_EXPORT int PyUnicode_Contains(PyObject* str, PyObject* substr) {
1078 DCHECK(str != nullptr, "str should not be null");
1079 DCHECK(substr != nullptr, "substr should not be null");
1080 Thread* thread = Thread::current();
1081 HandleScope scope(thread);
1082 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1083 Object substr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
1084 Object result(&scope,
1085 thread->invokeMethodStatic2(LayoutId::kStr, ID(__contains__),
1086 str_obj, substr_obj));
1087 if (result.isError()) {
1088 if (result.isErrorNotFound()) {
1089 thread->raiseWithFmt(LayoutId::kTypeError,
1090 "could not call str.__contains__");
1091 }
1092 return -1;
1093 }
1094 DCHECK(result.isBool(), "result of __contains__ should be bool");
1095 return Bool::cast(*result).value();
1096}
1097
1098PY_EXPORT Py_ssize_t PyUnicode_CopyCharacters(PyObject*, Py_ssize_t, PyObject*,
1099 Py_ssize_t, Py_ssize_t) {
1100 UNIMPLEMENTED("PyUnicode_CopyCharacters");
1101}
1102
1103PY_EXPORT Py_ssize_t PyUnicode_Count(PyObject* /* r */, PyObject* /* r */,
1104 Py_ssize_t /* t */, Py_ssize_t /* d */) {
1105 UNIMPLEMENTED("PyUnicode_Count");
1106}
1107
1108PY_EXPORT PyObject* PyUnicode_Decode(const char* c_str, Py_ssize_t size,
1109 const char* encoding, const char* errors) {
1110 DCHECK(c_str != nullptr, "c_str cannot be null");
1111 if (encoding == nullptr) {
1112 return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr);
1113 }
1114
1115 Thread* thread = Thread::current();
1116 Runtime* runtime = thread->runtime();
1117 HandleScope scope(thread);
1118 Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>(
1119 reinterpret_cast<const byte*>(c_str), size)));
1120 Object errors_obj(&scope, symbolFromError(thread, errors));
1121 Object encoding_obj(&scope, runtime->newStrFromCStr(encoding));
1122 Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(decode), bytes,
1123 encoding_obj, errors_obj));
1124 if (result.isError()) {
1125 return nullptr;
1126 }
1127 return ApiHandle::newReference(runtime, *result);
1128}
1129
1130PY_EXPORT PyObject* PyUnicode_DecodeASCII(const char* c_str, Py_ssize_t size,
1131 const char* errors) {
1132 Thread* thread = Thread::current();
1133 Runtime* runtime = thread->runtime();
1134 HandleScope scope(thread);
1135 Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>(
1136 reinterpret_cast<const byte*>(c_str), size)));
1137 Str errors_obj(&scope, symbolFromError(thread, errors));
1138 Object result_obj(
1139 &scope, thread->invokeFunction2(ID(_codecs), ID(ascii_decode), bytes,
1140 errors_obj));
1141 if (result_obj.isError()) {
1142 if (result_obj.isErrorNotFound()) {
1143 thread->raiseWithFmt(LayoutId::kSystemError,
1144 "could not call _codecs.ascii_decode");
1145 }
1146 return nullptr;
1147 }
1148 Tuple result(&scope, *result_obj);
1149 return ApiHandle::newReference(runtime, result.at(0));
1150}
1151
1152PY_EXPORT PyObject* PyUnicode_DecodeCharmap(const char* /* s */,
1153 Py_ssize_t /* e */,
1154 PyObject* /* g */,
1155 const char* /* s */) {
1156 UNIMPLEMENTED("PyUnicode_DecodeCharmap");
1157}
1158
1159PY_EXPORT PyObject* PyUnicode_DecodeCodePageStateful(int /* e */,
1160 const char* /* s */,
1161 Py_ssize_t /* e */,
1162 const char* /* s */,
1163 Py_ssize_t* /* d */) {
1164 UNIMPLEMENTED("PyUnicode_DecodeCodePageStateful");
1165}
1166
1167PY_EXPORT PyObject* PyUnicode_DecodeFSDefault(const char* c_str) {
1168 Runtime* runtime = Thread::current()->runtime();
1169 return ApiHandle::newReference(runtime, runtime->newStrFromCStr(c_str));
1170}
1171
1172PY_EXPORT PyObject* PyUnicode_DecodeFSDefaultAndSize(const char* c_str,
1173 Py_ssize_t size) {
1174 Runtime* runtime = Thread::current()->runtime();
1175 View<byte> str(reinterpret_cast<const byte*>(c_str), size);
1176 return ApiHandle::newReference(runtime, runtime->newStrWithAll(str));
1177}
1178
1179PY_EXPORT PyObject* PyUnicode_DecodeLatin1(const char* c_str, Py_ssize_t size,
1180 const char* /* errors */) {
1181 Thread* thread = Thread::current();
1182 Runtime* runtime = thread->runtime();
1183 HandleScope scope(thread);
1184 Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>(
1185 reinterpret_cast<const byte*>(c_str), size)));
1186 Object result_obj(
1187 &scope, thread->invokeFunction1(ID(_codecs), ID(latin_1_decode), bytes));
1188 if (result_obj.isError()) {
1189 if (result_obj.isErrorNotFound()) {
1190 thread->raiseWithFmt(LayoutId::kSystemError,
1191 "could not call _codecs.latin_1_decode");
1192 }
1193 return nullptr;
1194 }
1195 Tuple result(&scope, *result_obj);
1196 return ApiHandle::newReference(runtime, result.at(0));
1197}
1198
1199PY_EXPORT PyObject* PyUnicode_DecodeLocale(const char* str,
1200 const char* errors) {
1201 return PyUnicode_DecodeLocaleAndSize(str, std::strlen(str), errors);
1202}
1203
1204PY_EXPORT PyObject* PyUnicode_DecodeLocaleAndSize(const char* str,
1205 Py_ssize_t len,
1206 const char* errors) {
1207 _Py_error_handler surrogateescape;
1208 if (errors == nullptr || std::strcmp(errors, "strict") == 0) {
1209 surrogateescape = _Py_ERROR_STRICT;
1210 } else if (std::strcmp(errors, "surrogateescape") == 0) {
1211 surrogateescape = _Py_ERROR_SURROGATEESCAPE;
1212 } else {
1213 Thread::current()->raiseWithFmt(
1214 LayoutId::kValueError,
1215 "only 'strict' and 'surrogateescape' error handlers "
1216 "are supported, not '%s'",
1217 errors);
1218 return nullptr;
1219 }
1220
1221 if (str[len] != '\0' || static_cast<size_t>(len) != std::strlen(str)) {
1222 Thread::current()->raiseWithFmt(LayoutId::kValueError,
1223 "embedded null byte");
1224 return nullptr;
1225 }
1226
1227 wchar_t* wstr;
1228 size_t wlen;
1229 const char* reason;
1230 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, 1, surrogateescape);
1231 if (res != 0) {
1232 if (res == -2) {
1233 PyObject* exc =
1234 PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", "locale",
1235 str, len, wlen, wlen + 1, reason);
1236 if (exc != nullptr) {
1237 PyCodec_StrictErrors(exc);
1238 Py_DECREF(exc);
1239 }
1240 } else {
1241 PyErr_NoMemory();
1242 }
1243 return nullptr;
1244 }
1245
1246 PyObject* unicode = PyUnicode_FromWideChar(wstr, wlen);
1247 PyMem_RawFree(wstr);
1248 return unicode;
1249}
1250
1251PY_EXPORT PyObject* PyUnicode_DecodeMBCS(const char* /* s */,
1252 Py_ssize_t /* e */,
1253 const char* /* s */) {
1254 UNIMPLEMENTED("PyUnicode_DecodeMBCS");
1255}
1256
1257PY_EXPORT PyObject* PyUnicode_DecodeMBCSStateful(const char* /* s */,
1258 Py_ssize_t /* e */,
1259 const char* /* s */,
1260 Py_ssize_t* /* d */) {
1261 UNIMPLEMENTED("PyUnicode_DecodeMBCSStateful");
1262}
1263
1264PY_EXPORT PyObject* PyUnicode_DecodeRawUnicodeEscape(const char* /* s */,
1265 Py_ssize_t /* e */,
1266 const char* /* s */) {
1267 UNIMPLEMENTED("PyUnicode_DecodeRawUnicodeEscape");
1268}
1269
1270PY_EXPORT PyObject* PyUnicode_DecodeUTF16(const char* /* s */,
1271 Py_ssize_t /* e */,
1272 const char* /* s */, int* /* r */) {
1273 UNIMPLEMENTED("PyUnicode_DecodeUTF16");
1274}
1275
1276PY_EXPORT PyObject* PyUnicode_DecodeUTF16Stateful(const char* /* s */,
1277 Py_ssize_t /* e */,
1278 const char* /* s */,
1279 int* /* r */,
1280 Py_ssize_t* /* d */) {
1281 UNIMPLEMENTED("PyUnicode_DecodeUTF16Stateful");
1282}
1283
1284PY_EXPORT PyObject* PyUnicode_DecodeUTF32(const char* /* s */,
1285 Py_ssize_t /* e */,
1286 const char* /* s */, int* /* r */) {
1287 UNIMPLEMENTED("PyUnicode_DecodeUTF32");
1288}
1289
1290PY_EXPORT PyObject* PyUnicode_DecodeUTF32Stateful(const char* /* s */,
1291 Py_ssize_t /* e */,
1292 const char* /* s */,
1293 int* /* r */,
1294 Py_ssize_t* /* d */) {
1295 UNIMPLEMENTED("PyUnicode_DecodeUTF32Stateful");
1296}
1297
1298PY_EXPORT PyObject* PyUnicode_DecodeUTF7(const char* /* s */,
1299 Py_ssize_t /* e */,
1300 const char* /* s */) {
1301 UNIMPLEMENTED("PyUnicode_DecodeUTF7");
1302}
1303
1304PY_EXPORT PyObject* PyUnicode_DecodeUTF7Stateful(const char* /* s */,
1305 Py_ssize_t /* e */,
1306 const char* /* s */,
1307 Py_ssize_t* /* d */) {
1308 UNIMPLEMENTED("PyUnicode_DecodeUTF7Stateful");
1309}
1310
1311PY_EXPORT PyObject* PyUnicode_DecodeUTF8(const char* c_str, Py_ssize_t size,
1312 const char* errors) {
1313 return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr);
1314}
1315
1316PY_EXPORT PyObject* PyUnicode_DecodeUTF8Stateful(const char* c_str,
1317 Py_ssize_t size,
1318 const char* errors,
1319 Py_ssize_t* consumed) {
1320 DCHECK(c_str != nullptr, "c_str cannot be null");
1321
1322 Thread* thread = Thread::current();
1323 HandleScope scope(thread);
1324 Runtime* runtime = thread->runtime();
1325 word i = 0;
1326 const byte* byte_str = reinterpret_cast<const byte*>(c_str);
1327 for (; i < size; ++i) {
1328 if (byte_str[i] > kMaxASCII) break;
1329 }
1330 if (i == size) {
1331 if (consumed != nullptr) {
1332 *consumed = size;
1333 }
1334 return ApiHandle::newReference(runtime,
1335 runtime->newStrWithAll({byte_str, size}));
1336 }
1337 Object bytes(&scope, runtime->newBytesWithAll(View<byte>({byte_str, size})));
1338 Object errors_obj(&scope, symbolFromError(thread, errors));
1339 Object is_final(&scope, Bool::fromBool(consumed == nullptr));
1340 Object result_obj(
1341 &scope, thread->invokeFunction3(ID(_codecs), ID(utf_8_decode), bytes,
1342 errors_obj, is_final));
1343 if (result_obj.isError()) {
1344 if (result_obj.isErrorNotFound()) {
1345 thread->raiseWithFmt(LayoutId::kSystemError,
1346 "could not call _codecs._utf_8_decode_stateful");
1347 }
1348 return nullptr;
1349 }
1350 Tuple result(&scope, *result_obj);
1351 if (consumed != nullptr) {
1352 *consumed = Int::cast(result.at(1)).asWord();
1353 }
1354 return ApiHandle::newReference(runtime, result.at(0));
1355}
1356
1357PY_EXPORT PyObject* PyUnicode_DecodeUnicodeEscape(const char* c_str,
1358 Py_ssize_t size,
1359 const char* errors) {
1360 DCHECK(c_str != nullptr, "c_str cannot be null");
1361 const char* first_invalid_escape;
1362 PyObject* result = _PyUnicode_DecodeUnicodeEscape(c_str, size, errors,
1363 &first_invalid_escape);
1364 if (result == nullptr) {
1365 return nullptr;
1366 }
1367 if (first_invalid_escape != nullptr) {
1368 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
1369 "invalid escape sequence '\\%c'",
1370 static_cast<byte>(*first_invalid_escape)) < 0) {
1371 Py_DECREF(result);
1372 return nullptr;
1373 }
1374 }
1375 return result;
1376}
1377
1378PY_EXPORT PyObject* _PyUnicode_DecodeUnicodeEscape(
1379 const char* c_str, Py_ssize_t size, const char* errors,
1380 const char** first_invalid_escape) {
1381 DCHECK(c_str != nullptr, "c_str cannot be null");
1382 DCHECK(first_invalid_escape != nullptr,
1383 "first_invalid_escape cannot be null");
1384
1385 // So we can remember if we've seen an invalid escape char or not
1386 *first_invalid_escape = nullptr;
1387
1388 Thread* thread = Thread::current();
1389 HandleScope scope(thread);
1390 Runtime* runtime = thread->runtime();
1391 Object bytes(&scope, runtime->newBytesWithAll(View<byte>(
1392 reinterpret_cast<const byte*>(c_str), size)));
1393 Object errors_obj(&scope, symbolFromError(thread, errors));
1394 Object result_obj(
1395 &scope,
1396 thread->invokeFunction2(ID(_codecs), ID(_unicode_escape_decode_stateful),
1397 bytes, errors_obj));
1398 if (result_obj.isError()) {
1399 if (result_obj.isErrorNotFound()) {
1400 thread->raiseWithFmt(LayoutId::kSystemError,
1401 "could not call _codecs.unicode_escape_decode");
1402 }
1403 return nullptr;
1404 }
1405 Tuple result(&scope, *result_obj);
1406 Int first_invalid_index(&scope, result.at(2));
1407 word invalid_index = first_invalid_index.asWord();
1408 if (invalid_index > -1) {
1409 *first_invalid_escape = c_str + invalid_index;
1410 }
1411 return ApiHandle::newReference(runtime, result.at(0));
1412}
1413
1414PY_EXPORT PyObject* PyUnicode_EncodeCodePage(int /* e */, PyObject* /* e */,
1415 const char* /* s */) {
1416 UNIMPLEMENTED("PyUnicode_EncodeCodePage");
1417}
1418
1419PY_EXPORT PyObject* PyUnicode_EncodeLocale(PyObject* unicode,
1420 const char* errors) {
1421 _Py_error_handler surrogateescape;
1422 if (errors == nullptr || std::strcmp(errors, "strict") == 0) {
1423 surrogateescape = _Py_ERROR_STRICT;
1424 } else if (std::strcmp(errors, "surrogateescape") == 0) {
1425 surrogateescape = _Py_ERROR_SURROGATEESCAPE;
1426 } else {
1427 Thread::current()->raiseWithFmt(
1428 LayoutId::kValueError,
1429 "only 'strict' and 'surrogateescape' error handlers "
1430 "are supported, not '%s'",
1431 errors);
1432 return nullptr;
1433 }
1434 Py_ssize_t wlen;
1435 wchar_t* wstr = PyUnicode_AsWideCharString(unicode, &wlen);
1436 if (wstr == nullptr) {
1437 return nullptr;
1438 }
1439
1440 if (static_cast<size_t>(wlen) != std::wcslen(wstr)) {
1441 Thread::current()->raiseWithFmt(LayoutId::kValueError,
1442 "embedded null character");
1443 PyMem_Free(wstr);
1444 return nullptr;
1445 }
1446
1447 char* str;
1448 size_t error_pos;
1449 const char* reason;
1450 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
1451 /*current_locale=*/1, surrogateescape);
1452 PyMem_Free(wstr);
1453
1454 if (res != 0) {
1455 if (res == -2) {
1456 PyObject* exc =
1457 PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", "locale",
1458 unicode, error_pos, error_pos + 1, reason);
1459 if (exc != nullptr) {
1460 PyCodec_StrictErrors(exc);
1461 Py_DECREF(exc);
1462 }
1463 } else {
1464 PyErr_NoMemory();
1465 }
1466 return nullptr;
1467 }
1468
1469 PyObject* bytes = PyBytes_FromString(str);
1470 PyMem_RawFree(str);
1471 return bytes;
1472}
1473
1474PY_EXPORT PyObject* _PyUnicode_EncodeUTF16(PyObject* unicode,
1475 const char* errors, int byteorder) {
1476 DCHECK(unicode != nullptr, "unicode cannot be null");
1477 Thread* thread = Thread::current();
1478 HandleScope scope(thread);
1479 Runtime* runtime = thread->runtime();
1480 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
1481 if (!runtime->isInstanceOfStr(*str)) {
1482 thread->raiseBadArgument();
1483 return nullptr;
1484 }
1485 Object errors_obj(&scope, symbolFromError(thread, errors));
1486 Object byteorder_obj(&scope, runtime->newInt(byteorder));
1487 Object tuple_obj(&scope,
1488 thread->invokeFunction3(ID(_codecs), ID(utf_16_encode), str,
1489 errors_obj, byteorder_obj));
1490 if (tuple_obj.isError()) {
1491 return nullptr;
1492 }
1493 Tuple tuple(&scope, *tuple_obj);
1494 return ApiHandle::newReference(runtime, tuple.at(0));
1495}
1496
1497PY_EXPORT PyObject* PyUnicode_EncodeUTF16(const Py_UNICODE* unicode,
1498 Py_ssize_t size, const char* errors,
1499 int byteorder) {
1500 PyObject* str = PyUnicode_FromUnicode(unicode, size);
1501 if (str == nullptr) return nullptr;
1502 PyObject* result = _PyUnicode_EncodeUTF16(str, errors, byteorder);
1503 Py_DECREF(str);
1504 return result;
1505}
1506
1507PY_EXPORT PyObject* _PyUnicode_EncodeUTF32(PyObject* unicode,
1508 const char* errors, int byteorder) {
1509 DCHECK(unicode != nullptr, "unicode cannot be null");
1510 Thread* thread = Thread::current();
1511 HandleScope scope(thread);
1512 Runtime* runtime = thread->runtime();
1513 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
1514 if (!runtime->isInstanceOfStr(*str)) {
1515 thread->raiseBadArgument();
1516 return nullptr;
1517 }
1518 Object errors_obj(&scope, symbolFromError(thread, errors));
1519 Object byteorder_obj(&scope, runtime->newInt(byteorder));
1520 Object tuple_obj(&scope,
1521 thread->invokeFunction3(ID(_codecs), ID(utf_32_encode), str,
1522 errors_obj, byteorder_obj));
1523 if (tuple_obj.isError()) {
1524 return nullptr;
1525 }
1526 Tuple tuple(&scope, *tuple_obj);
1527 return ApiHandle::newReference(runtime, tuple.at(0));
1528}
1529
1530PY_EXPORT PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE* unicode,
1531 Py_ssize_t size, const char* errors,
1532 int byteorder) {
1533 PyObject* str = PyUnicode_FromUnicode(unicode, size);
1534 if (str == nullptr) return nullptr;
1535 PyObject* result = _PyUnicode_EncodeUTF32(str, errors, byteorder);
1536 Py_DECREF(str);
1537 return result;
1538}
1539
1540PY_EXPORT int PyUnicode_FSConverter(PyObject* arg, void* addr) {
1541 if (arg == nullptr) {
1542 Py_DECREF(*reinterpret_cast<PyObject**>(addr));
1543 *reinterpret_cast<PyObject**>(addr) = nullptr;
1544 return 1;
1545 }
1546 Thread* thread = Thread::current();
1547 HandleScope scope(thread);
1548 Object arg_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(arg)));
1549 Object path(&scope, NoneType::object());
1550 Runtime* runtime = thread->runtime();
1551 if (runtime->isInstanceOfStr(*arg_obj) ||
1552 runtime->isInstanceOfBytes(*arg_obj)) {
1553 path = *arg_obj;
1554 } else {
1555 path = thread->invokeFunction1(ID(_io), ID(_fspath), arg_obj);
1556 if (path.isErrorException()) {
1557 return 0;
1558 }
1559 }
1560 Object output(&scope, NoneType::object());
1561 if (runtime->isInstanceOfBytes(*path)) {
1562 output = *path;
1563 } else {
1564 CHECK(std::strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0, "");
1565 CHECK(std::strcmp(Py_FileSystemDefaultEncodeErrors, "surrogatepass") == 0,
1566 "");
1567 // PyOS_FSPath/_io._fspath guarantee their returned value is bytes or str.
1568 // This is an inlined PyUnicode_FSDecoder, which does a UTF-8 decode with
1569 // surrogatepass. Since our strings are UTF-8 with UTF-16 surrogates
1570 // (WTF-8), we can just copy the bytes out.
1571 Str path_str(&scope, strUnderlying(*path));
1572 word path_len = path_str.length();
1573 MutableBytes bytes(&scope, runtime->newMutableBytesUninitialized(path_len));
1574 bytes.replaceFromWithStr(0, *path_str, path_len);
1575 output = bytes.becomeImmutable();
1576 }
1577 Bytes underlying(&scope, bytesUnderlying(*output));
1578 if (underlying.findByte('\0', /*start=*/0, /*length=*/underlying.length()) !=
1579 -1) {
1580 thread->raiseWithFmt(LayoutId::kValueError, "embedded null byte");
1581 return 0;
1582 }
1583 *reinterpret_cast<PyObject**>(addr) =
1584 ApiHandle::newReference(runtime, *output);
1585 return Py_CLEANUP_SUPPORTED;
1586}
1587
1588PY_EXPORT int PyUnicode_FSDecoder(PyObject* arg, void* addr) {
1589 if (arg == nullptr) {
1590 Py_DECREF(*(PyObject**)addr);
1591 *reinterpret_cast<PyObject**>(addr) = nullptr;
1592 return 1;
1593 }
1594
1595 bool is_buffer = PyObject_CheckBuffer(arg);
1596 PyObject* path;
1597 if (!is_buffer) {
1598 path = PyOS_FSPath(arg);
1599 if (path == nullptr) return 0;
1600 } else {
1601 path = arg;
1602 Py_INCREF(arg);
1603 }
1604
1605 PyObject* output;
1606 if (PyUnicode_Check(path)) {
1607 output = path;
1608 } else if (PyBytes_Check(path) || is_buffer) {
1609 if (!PyBytes_Check(path) &&
1610 PyErr_WarnFormat(
1611 PyExc_DeprecationWarning, 1,
1612 "path should be string, bytes, or os.PathLike, not %.200s",
1613 PyObject_TypeName(arg))) {
1614 Py_DECREF(path);
1615 return 0;
1616 }
1617 PyObject* path_bytes = PyBytes_FromObject(path);
1618 Py_DECREF(path);
1619 if (!path_bytes) return 0;
1620 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
1621 PyBytes_GET_SIZE(path_bytes));
1622 Py_DECREF(path_bytes);
1623 if (!output) return 0;
1624 } else {
1625 Thread::current()->raiseWithFmt(
1626 LayoutId::kTypeError,
1627 "path should be string, bytes, or os.PathLike, not %s",
1628 PyObject_TypeName(arg));
1629 Py_DECREF(path);
1630 return 0;
1631 }
1632
1633 Thread* thread = Thread::current();
1634 HandleScope scope(thread);
1635 Str output_str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(output)));
1636 if (strFindAsciiChar(output_str, '\0') >= 0) {
1637 thread->raiseWithFmt(LayoutId::kValueError, "embedded null character");
1638 Py_DECREF(output);
1639 return 0;
1640 }
1641 *reinterpret_cast<PyObject**>(addr) = output;
1642 return Py_CLEANUP_SUPPORTED;
1643}
1644
1645PY_EXPORT Py_ssize_t PyUnicode_Find(PyObject* str, PyObject* substr,
1646 Py_ssize_t start, Py_ssize_t end,
1647 int direction) {
1648 DCHECK(str != nullptr, "str must be non-null");
1649 DCHECK(substr != nullptr, "substr must be non-null");
1650 DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1");
1651 Thread* thread = Thread::current();
1652 HandleScope scope(thread);
1653 Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1654 Object needle_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
1655 Runtime* runtime = thread->runtime();
1656 if (!runtime->isInstanceOfStr(*haystack_obj)) {
1657 thread->raiseWithFmt(LayoutId::kTypeError,
1658 "PyUnicode_Find requires a 'str' instance");
1659 return -2;
1660 }
1661 Str haystack(&scope, strUnderlying(*haystack_obj));
1662 if (!runtime->isInstanceOfStr(*needle_obj)) {
1663 thread->raiseWithFmt(LayoutId::kTypeError,
1664 "PyUnicode_Find requires a 'str' instance");
1665 return -2;
1666 }
1667 Str needle(&scope, strUnderlying(*needle_obj));
1668 if (direction == 1) return strFindWithRange(haystack, needle, start, end);
1669 return strRFind(haystack, needle, start, end);
1670}
1671
1672PY_EXPORT Py_ssize_t PyUnicode_FindChar(PyObject* str, Py_UCS4 ch,
1673 Py_ssize_t start, Py_ssize_t end,
1674 int direction) {
1675 DCHECK(str != nullptr, "str must not be null");
1676 DCHECK(direction == 1 || direction == -1, "direction must be -1 or 1");
1677 Thread* thread = Thread::current();
1678 HandleScope scope(thread);
1679 Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1680 Runtime* runtime = thread->runtime();
1681 DCHECK(runtime->isInstanceOfStr(*haystack_obj),
1682 "PyUnicode_FindChar requires a 'str' instance");
1683 Str haystack(&scope, strUnderlying(*haystack_obj));
1684 Str needle(&scope, SmallStr::fromCodePoint(ch));
1685 if (direction == 1) return strFindWithRange(haystack, needle, start, end);
1686 return strRFind(haystack, needle, start, end);
1687}
1688
1689PY_EXPORT PyObject* PyUnicode_Format(PyObject* format, PyObject* args) {
1690 if (format == nullptr || args == nullptr) {
1691 PyErr_BadInternalCall();
1692 return nullptr;
1693 }
1694 if (!PyUnicode_Check(format)) {
1695 Thread::current()->raiseWithFmt(LayoutId::kTypeError, "must be str, not %s",
1696 _PyType_Name(Py_TYPE(format)));
1697 return nullptr;
1698 }
1699 return PyNumber_Remainder(format, args);
1700}
1701
1702PY_EXPORT PyObject* PyUnicode_FromEncodedObject(PyObject* /* j */,
1703 const char* /* g */,
1704 const char* /* s */) {
1705 UNIMPLEMENTED("PyUnicode_FromEncodedObject");
1706}
1707
1708PY_EXPORT PyObject* PyUnicode_FromFormat(const char* format, ...) {
1709 va_list vargs;
1710
1711 va_start(vargs, format);
1712 PyObject* ret = PyUnicode_FromFormatV(format, vargs);
1713 va_end(vargs);
1714 return ret;
1715}
1716
1717PY_EXPORT PyObject* PyUnicode_FromFormatV(const char* format, va_list vargs) {
1718 va_list vargs2;
1719 _PyUnicodeWriter writer;
1720
1721 _PyUnicodeWriter_Init(&writer);
1722 writer.min_length = std::strlen(format) + 100;
1723 writer.overallocate = 1;
1724
1725 // This copy seems unnecessary but it may have been needed by CPython for
1726 // historical reasons.
1727 va_copy(vargs2, vargs);
1728
1729 for (const char* f = format; *f;) {
1730 if (*f == '%') {
1731 f = writeArg(&writer, f, &vargs2);
1732 if (f == nullptr) goto fail;
1733 } else {
1734 const char* p = f;
1735 do {
1736 if (static_cast<unsigned char>(*p) > 127) {
1737 PyErr_Format(
1738 PyExc_ValueError,
1739 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1740 "string, got a non-ASCII byte: 0x%02x",
1741 static_cast<unsigned char>(*p));
1742 goto fail;
1743 }
1744 p++;
1745 } while (*p != '\0' && *p != '%');
1746 Py_ssize_t len = p - f;
1747
1748 if (*p == '\0') writer.overallocate = 0;
1749
1750 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) goto fail;
1751
1752 f = p;
1753 }
1754 }
1755 va_end(vargs2);
1756 return _PyUnicodeWriter_Finish(&writer);
1757
1758fail:
1759 va_end(vargs2);
1760 _PyUnicodeWriter_Dealloc(&writer);
1761 return nullptr;
1762}
1763
1764PY_EXPORT PyObject* PyUnicode_FromObject(PyObject* /* j */) {
1765 UNIMPLEMENTED("PyUnicode_FromObject");
1766}
1767
1768PY_EXPORT PyObject* PyUnicode_FromOrdinal(int ordinal) {
1769 Thread* thread = Thread::current();
1770 if (ordinal < 0 || ordinal > kMaxUnicode) {
1771 thread->raiseWithFmt(LayoutId::kValueError,
1772 "chr() arg not in range(0x110000)");
1773 return nullptr;
1774 }
1775 return ApiHandle::newReference(thread->runtime(),
1776 SmallStr::fromCodePoint(ordinal));
1777}
1778
1779PY_EXPORT PyObject* PyUnicode_FromWideChar(const wchar_t* buffer,
1780 Py_ssize_t size) {
1781 Thread* thread = Thread::current();
1782 if (buffer == nullptr && size != 0) {
1783 thread->raiseBadInternalCall();
1784 return nullptr;
1785 }
1786
1787 RawObject result = size == -1
1788 ? newStrFromWideChar(thread, buffer)
1789 : newStrFromWideCharWithLength(thread, buffer, size);
1790 return result.isErrorException()
1791 ? nullptr
1792 : ApiHandle::newReference(thread->runtime(), result);
1793}
1794
1795PY_EXPORT Py_ssize_t PyUnicode_GET_LENGTH_Func(PyObject* pyobj) {
1796 RawObject obj = ApiHandle::asObjectNoImmediate(ApiHandle::fromPyObject(pyobj));
1797 DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj),
1798 "non-str argument to PyUnicode_GET_LENGTH");
1799 return strUnderlying(obj).codePointLength();
1800}
1801
1802PY_EXPORT const char* PyUnicode_GetDefaultEncoding() {
1803 return Py_FileSystemDefaultEncoding;
1804}
1805
1806PY_EXPORT Py_ssize_t PyUnicode_GetLength(PyObject* pyobj) {
1807 Thread* thread = Thread::current();
1808 RawObject obj = ApiHandle::asObject(ApiHandle::fromPyObject(pyobj));
1809 if (!thread->runtime()->isInstanceOfStr(obj)) {
1810 thread->raiseBadArgument();
1811 return -1;
1812 }
1813 return strUnderlying(obj).codePointLength();
1814}
1815
1816PY_EXPORT Py_ssize_t PyUnicode_GetSize(PyObject* pyobj) {
1817 // This function returns the number of UTF-16 or UTF-32 code units, depending
1818 // on the size of wchar_t on the operating system. On the machines that we
1819 // currently use for testing, this is the same as the number of Unicode code
1820 // points. This must be modified when we support operating systems with
1821 // different wchar_t (e.g. Windows).
1822 return PyUnicode_GetLength(pyobj);
1823}
1824
1825PY_EXPORT PyObject* PyUnicode_InternFromString(const char* c_str) {
1826 DCHECK(c_str != nullptr, "c_str must not be nullptr");
1827 Thread* thread = Thread::current();
1828 return ApiHandle::newReference(thread->runtime(),
1829 Runtime::internStrFromCStr(thread, c_str));
1830}
1831
1832PY_EXPORT void PyUnicode_InternImmortal(PyObject** /* p */) {
1833 UNIMPLEMENTED("PyUnicode_InternImmortal");
1834}
1835
1836PY_EXPORT void PyUnicode_InternInPlace(PyObject** obj_ptr) {
1837 PyObject* pobj = *obj_ptr;
1838 DCHECK(pobj != nullptr, "pobj should not be null");
1839 if (pobj == nullptr) {
1840 return;
1841 }
1842 Thread* thread = Thread::current();
1843 HandleScope scope(thread);
1844 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(pobj)));
1845 if (!obj.isLargeStr()) {
1846 return;
1847 }
1848 Object result(&scope, Runtime::internStr(thread, obj));
1849 if (result != obj) {
1850 Py_DECREF(pobj);
1851 *obj_ptr = ApiHandle::newReference(thread->runtime(), *result);
1852 }
1853}
1854
1855PY_EXPORT int PyUnicode_IsIdentifier(PyObject* str) {
1856 DCHECK(str != nullptr, "str must not be null");
1857 Thread* thread = Thread::current();
1858 HandleScope scope(thread);
1859 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1860 if (str_obj == Str::empty()) {
1861 return false;
1862 }
1863 Object result(&scope, thread->invokeMethodStatic1(LayoutId::kStr,
1864 ID(isidentifier), str_obj));
1865 DCHECK(!result.isErrorNotFound(), "could not call str.isidentifier");
1866 CHECK(!result.isError(), "this function should not error");
1867 return Bool::cast(*result).value();
1868}
1869
1870PY_EXPORT PyObject* PyUnicode_Join(PyObject* sep, PyObject* seq) {
1871 DCHECK(sep != nullptr, "sep should not be null");
1872 DCHECK(seq != nullptr, "seq should not be null");
1873 Thread* thread = Thread::current();
1874 HandleScope scope(thread);
1875 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1876 // An optimization to rule out non-str values here to use the further
1877 // optimization of `strJoinWithTupleOrList`.
1878 Runtime* runtime = thread->runtime();
1879 if (!runtime->isInstanceOfStr(*sep_obj)) {
1880 thread->raiseWithFmt(LayoutId::kTypeError,
1881 "separator: expected str instance,"
1882 "'%T' found",
1883 &sep_obj);
1884 return nullptr;
1885 }
1886 Str sep_str(&scope, strUnderlying(*sep_obj));
1887 Object seq_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(seq)));
1888 // An ad-hoc optimization for the case `seq_obj` is a `tuple` or `list`,
1889 // that can be removed without changing the correctness of PyUnicode_Join.
1890 Object result(&scope, strJoinWithTupleOrList(thread, sep_str, seq_obj));
1891 if (result.isUnbound()) {
1892 result =
1893 thread->invokeMethodStatic2(LayoutId::kStr, ID(join), sep_str, seq_obj);
1894 }
1895 if (result.isError()) {
1896 if (result.isErrorNotFound()) {
1897 thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.join");
1898 }
1899 return nullptr;
1900 }
1901 return ApiHandle::newReference(runtime, *result);
1902}
1903
1904PY_EXPORT PyObject* PyUnicode_Partition(PyObject* str, PyObject* sep) {
1905 DCHECK(str != nullptr, "str should not be null");
1906 DCHECK(sep != nullptr, "sep should not be null");
1907 Thread* thread = Thread::current();
1908 HandleScope scope(thread);
1909 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1910 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1911 Object result(&scope, thread->invokeMethodStatic2(
1912 LayoutId::kStr, ID(partition), str_obj, sep_obj));
1913 if (result.isError()) {
1914 if (result.isErrorNotFound()) {
1915 thread->raiseWithFmt(LayoutId::kTypeError,
1916 "could not call str.partition");
1917 }
1918 return nullptr;
1919 }
1920 return ApiHandle::newReference(thread->runtime(), *result);
1921}
1922
1923PY_EXPORT PyObject* PyUnicode_RPartition(PyObject* str, PyObject* sep) {
1924 DCHECK(str != nullptr, "str should not be null");
1925 DCHECK(sep != nullptr, "sep should not be null");
1926 Thread* thread = Thread::current();
1927 HandleScope scope(thread);
1928 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1929 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1930 Object result(&scope, thread->invokeMethodStatic2(
1931 LayoutId::kStr, ID(rpartition), str_obj, sep_obj));
1932 if (result.isError()) {
1933 if (result.isErrorNotFound()) {
1934 thread->raiseWithFmt(LayoutId::kTypeError,
1935 "could not call str.rpartition");
1936 }
1937 return nullptr;
1938 }
1939 return ApiHandle::newReference(thread->runtime(), *result);
1940}
1941
1942PY_EXPORT PyObject* PyUnicode_RSplit(PyObject* str, PyObject* sep,
1943 Py_ssize_t maxsplit) {
1944 DCHECK(str != nullptr, "str must not be null");
1945 DCHECK(sep != nullptr, "sep must not be null");
1946 Thread* thread = Thread::current();
1947 HandleScope scope(thread);
1948 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1949 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1950 Runtime* runtime = thread->runtime();
1951 Object maxsplit_obj(&scope, runtime->newInt(maxsplit));
1952 Object result(&scope,
1953 thread->invokeMethodStatic3(LayoutId::kStr, ID(rsplit), str_obj,
1954 sep_obj, maxsplit_obj));
1955 if (result.isError()) {
1956 if (result.isErrorNotFound()) {
1957 thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.rsplit");
1958 }
1959 return nullptr;
1960 }
1961 return ApiHandle::newReference(runtime, *result);
1962}
1963
1964PY_EXPORT Py_UCS4 PyUnicode_ReadChar(PyObject* obj, Py_ssize_t index) {
1965 DCHECK(obj != nullptr, "obj must not be null");
1966 Thread* thread = Thread::current();
1967 HandleScope scope(thread);
1968 Runtime* runtime = thread->runtime();
1969 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
1970 if (!runtime->isInstanceOfStr(*str_obj)) {
1971 thread->raiseBadArgument();
1972 return -1;
1973 }
1974 Str str(&scope, strUnderlying(*str_obj));
1975 word byte_offset;
1976 if (index < 0 ||
1977 (byte_offset = thread->strOffset(str, index)) >= str.length()) {
1978 thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range");
1979 return -1;
1980 }
1981 word num_bytes;
1982 return str.codePointAt(byte_offset, &num_bytes);
1983}
1984
1985PY_EXPORT PyObject* PyUnicode_Replace(PyObject* str, PyObject* substr,
1986 PyObject* replstr, Py_ssize_t maxcount) {
1987 DCHECK(str != nullptr, "str must not be null");
1988 DCHECK(substr != nullptr, "substr must not be null");
1989 DCHECK(replstr != nullptr, "replstr must not be null");
1990 Thread* thread = Thread::current();
1991 HandleScope scope(thread);
1992 Runtime* runtime = thread->runtime();
1993 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1994 if (!runtime->isInstanceOfStr(*str_obj)) {
1995 thread->raiseWithFmt(LayoutId::kTypeError, "str must be str");
1996 return nullptr;
1997 }
1998
1999 Object substr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
2000 if (!runtime->isInstanceOfStr(*substr_obj)) {
2001 thread->raiseWithFmt(LayoutId::kTypeError, "substr must be str");
2002 return nullptr;
2003 }
2004
2005 Object replstr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(replstr)));
2006 if (!runtime->isInstanceOfStr(*replstr_obj)) {
2007 thread->raiseWithFmt(LayoutId::kTypeError, "replstr must be str");
2008 return nullptr;
2009 }
2010
2011 Str str_str(&scope, strUnderlying(*str_obj));
2012 Str substr_str(&scope, strUnderlying(*substr_obj));
2013 Str replstr_str(&scope, strUnderlying(*replstr_obj));
2014 return ApiHandle::newReference(
2015 runtime,
2016 runtime->strReplace(thread, str_str, substr_str, replstr_str, maxcount));
2017}
2018
2019PY_EXPORT int PyUnicode_Resize(PyObject** /* p_unicode */, Py_ssize_t /* h */) {
2020 UNIMPLEMENTED("PyUnicode_Resize");
2021}
2022
2023PY_EXPORT PyObject* PyUnicode_RichCompare(PyObject* /* t */, PyObject* /* t */,
2024 int /* p */) {
2025 UNIMPLEMENTED("PyUnicode_RichCompare");
2026}
2027
2028PY_EXPORT PyObject* PyUnicode_Split(PyObject* str, PyObject* sep,
2029 Py_ssize_t maxsplit) {
2030 DCHECK(str != nullptr, "str must not be null");
2031 DCHECK(sep != nullptr, "sep must not be null");
2032 Thread* thread = Thread::current();
2033 HandleScope scope(thread);
2034 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
2035 Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
2036 Runtime* runtime = thread->runtime();
2037 Object maxsplit_obj(&scope, runtime->newInt(maxsplit));
2038 Object result(&scope,
2039 thread->invokeMethodStatic3(LayoutId::kStr, ID(split), str_obj,
2040 sep_obj, maxsplit_obj));
2041 if (result.isError()) {
2042 if (result.isErrorNotFound()) {
2043 thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.split");
2044 }
2045 return nullptr;
2046 }
2047 return ApiHandle::newReference(runtime, *result);
2048}
2049
2050PY_EXPORT PyObject* PyUnicode_Splitlines(PyObject* str, int keepends) {
2051 Thread* thread = Thread::current();
2052 HandleScope scope(thread);
2053 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
2054 Runtime* runtime = thread->runtime();
2055 if (!runtime->isInstanceOfStr(*str_obj)) {
2056 thread->raiseWithFmt(LayoutId::kTypeError, "must be str, not '%T'",
2057 &str_obj);
2058 return nullptr;
2059 }
2060 Str str_str(&scope, strUnderlying(*str_obj));
2061 return ApiHandle::newReference(runtime,
2062 strSplitlines(thread, str_str, keepends));
2063}
2064
2065PY_EXPORT PyObject* PyUnicode_Substring(PyObject* pyobj, Py_ssize_t start,
2066 Py_ssize_t end) {
2067 DCHECK(pyobj != nullptr, "null argument to PyUnicode_Substring");
2068 Thread* thread = Thread::current();
2069 if (start < 0 || end < 0) {
2070 thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range");
2071 return nullptr;
2072 }
2073 Runtime* runtime = thread->runtime();
2074 if (end <= start) {
2075 return ApiHandle::newReference(runtime, Str::empty());
2076 }
2077 HandleScope scope(thread);
2078 ApiHandle* handle = ApiHandle::fromPyObject(pyobj);
2079 Object obj(&scope, ApiHandle::asObject(handle));
2080 DCHECK(runtime->isInstanceOfStr(*obj),
2081 "PyUnicode_Substring requires a 'str' instance");
2082 Str self(&scope, strUnderlying(*obj));
2083 word len = self.length();
2084 word start_index = thread->strOffset(self, start);
2085 if (start_index == len) {
2086 return ApiHandle::newReference(runtime, Str::empty());
2087 }
2088 word end_index = thread->strOffset(self, end);
2089 if (end_index == len) {
2090 if (start_index == 0) {
2091 ApiHandle::incref(handle);
2092 return pyobj;
2093 }
2094 }
2095 return ApiHandle::newReference(
2096 runtime, strSubstr(thread, self, start_index, end_index - start_index));
2097}
2098
2099PY_EXPORT Py_ssize_t PyUnicode_Tailmatch(PyObject* str, PyObject* substr,
2100 Py_ssize_t start, Py_ssize_t end,
2101 int direction) {
2102 DCHECK(str != nullptr, "str must be non-null");
2103 DCHECK(substr != nullptr, "substr must be non-null");
2104 DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1");
2105 Thread* thread = Thread::current();
2106 HandleScope scope(thread);
2107 Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
2108 Object needle_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
2109 Runtime* runtime = thread->runtime();
2110 if (!runtime->isInstanceOfStr(*haystack_obj) ||
2111 !runtime->isInstanceOfStr(*needle_obj)) {
2112 thread->raiseBadArgument();
2113 return -1;
2114 }
2115 Str haystack(&scope, strUnderlying(*haystack_obj));
2116 Str needle(&scope, strUnderlying(*needle_obj));
2117 word haystack_len = haystack.codePointLength();
2118 Slice::adjustSearchIndices(&start, &end, haystack_len);
2119 word needle_len = needle.codePointLength();
2120 if (start + needle_len > end) {
2121 return 0;
2122 }
2123 word start_offset;
2124 if (direction == 1) {
2125 start_offset = haystack.offsetByCodePoints(0, end - needle_len);
2126 } else {
2127 start_offset = haystack.offsetByCodePoints(0, start);
2128 }
2129 word needle_chars = needle.length();
2130 for (word i = start_offset, j = 0; j < needle_chars; i++, j++) {
2131 if (haystack.byteAt(i) != needle.byteAt(j)) {
2132 return 0;
2133 }
2134 }
2135 return 1;
2136}
2137
2138PY_EXPORT PyObject* PyUnicode_Translate(PyObject* /* r */, PyObject* /* g */,
2139 const char* /* s */) {
2140 UNIMPLEMENTED("PyUnicode_Translate");
2141}
2142
2143PY_EXPORT PyTypeObject* PyUnicode_Type_Ptr() {
2144 Runtime* runtime = Thread::current()->runtime();
2145 return reinterpret_cast<PyTypeObject*>(
2146 ApiHandle::borrowedReference(runtime, runtime->typeAt(LayoutId::kStr)));
2147}
2148
2149PY_EXPORT int PyUnicode_WriteChar(PyObject* /* e */, Py_ssize_t /* x */,
2150 Py_UCS4 /* h */) {
2151 UNIMPLEMENTED("PyUnicode_WriteChar");
2152}
2153
2154PY_EXPORT Py_UNICODE* PyUnicode_AsUnicode(PyObject* /* e */) {
2155 UNIMPLEMENTED("PyUnicode_AsUnicode");
2156}
2157
2158PY_EXPORT Py_UNICODE* PyUnicode_AsUnicodeAndSize(PyObject* /* unicode */,
2159 Py_ssize_t* /* size */) {
2160 UNIMPLEMENTED("PyUnicode_AsUnicodeAndSize");
2161}
2162
2163template <typename T>
2164static PyObject* decodeUnicodeToString(Thread* thread, const void* src,
2165 word size) {
2166 Runtime* runtime = thread->runtime();
2167 DCHECK(src != nullptr, "Must pass in a non-null buffer");
2168 const T* cp = static_cast<const T*>(src);
2169 if (size == 1) {
2170 return ApiHandle::newReference(runtime, SmallStr::fromCodePoint(cp[0]));
2171 }
2172 HandleScope scope(thread);
2173 // TODO(T41785453): Remove the StrArray intermediary
2174 StrArray array(&scope, runtime->newStrArray());
2175 runtime->strArrayEnsureCapacity(thread, array, size);
2176 for (word i = 0; i < size; ++i) {
2177 runtime->strArrayAddCodePoint(thread, array, cp[i]);
2178 }
2179 return ApiHandle::newReference(runtime, runtime->strFromStrArray(array));
2180}
2181
2182PY_EXPORT PyObject* PyUnicode_FromKindAndData(int kind, const void* buffer,
2183 Py_ssize_t size) {
2184 Thread* thread = Thread::current();
2185 if (size < 0) {
2186 thread->raiseWithFmt(LayoutId::kValueError, "size must be positive");
2187 return nullptr;
2188 }
2189 if (size == 0) {
2190 return ApiHandle::newReference(thread->runtime(), Str::empty());
2191 }
2192 switch (kind) {
2193 case PyUnicode_1BYTE_KIND:
2194 return decodeUnicodeToString<Py_UCS1>(thread, buffer, size);
2195 case PyUnicode_2BYTE_KIND:
2196 return decodeUnicodeToString<Py_UCS2>(thread, buffer, size);
2197 case PyUnicode_4BYTE_KIND:
2198 return decodeUnicodeToString<Py_UCS4>(thread, buffer, size);
2199 }
2200 thread->raiseWithFmt(LayoutId::kSystemError, "invalid kind");
2201 return nullptr;
2202}
2203
2204PY_EXPORT PyObject* PyUnicode_FromUnicode(const Py_UNICODE* code_units,
2205 Py_ssize_t size) {
2206 if (code_units == nullptr) {
2207 // TODO(T36562134): Implement _PyUnicode_New
2208 UNIMPLEMENTED("_PyUnicode_New");
2209 }
2210
2211 Thread* thread = Thread::current();
2212 RawObject result = newStrFromWideCharWithLength(thread, code_units, size);
2213 return result.isErrorException()
2214 ? nullptr
2215 : ApiHandle::newReference(thread->runtime(), result);
2216}
2217
2218PY_EXPORT int PyUnicode_KIND_Func(PyObject* obj) {
2219 // TODO(T47682853): Introduce new PyUnicode_VARBYTE_KIND
2220 CHECK(PyUnicode_IS_ASCII_Func(obj), "only ASCII allowed");
2221 return PyUnicode_1BYTE_KIND;
2222}
2223
2224// NOTE: This will return a cached and managed C-string buffer that is a copy
2225// of the Str internal buffer. It is NOT a direct pointer into the string
2226// object, so writing into this buffer will do nothing. This is different
2227// behavior from CPython, where changing the data in the buffer changes the
2228// string object.
2229PY_EXPORT void* PyUnicode_DATA_Func(PyObject* str) {
2230 Thread* thread = Thread::current();
2231 Runtime* runtime = thread->runtime();
2232 ApiHandle* handle = ApiHandle::fromPyObject(str);
2233 if (void* cache = ApiHandle::cache(runtime, handle)) {
2234 return static_cast<char*>(cache);
2235 }
2236 HandleScope scope(thread);
2237 Object obj(&scope, ApiHandle::asObject(handle));
2238 DCHECK(runtime->isInstanceOfStr(*obj), "str should be a str instance");
2239 Str str_obj(&scope, strUnderlying(*obj));
2240 word length = str_obj.length();
2241 byte* result = static_cast<byte*>(std::malloc(length + 1));
2242 str_obj.copyTo(result, length);
2243 result[length] = '\0';
2244 ApiHandle::setCache(runtime, handle, result);
2245 ApiHandle::setBorrowedNoImmediate(handle);
2246 return reinterpret_cast<char*>(result);
2247}
2248
2249PY_EXPORT Py_UCS4 PyUnicode_READ_Func(int kind, void* data, Py_ssize_t index) {
2250 if (kind == PyUnicode_1BYTE_KIND) return static_cast<Py_UCS1*>(data)[index];
2251 if (kind == PyUnicode_2BYTE_KIND) return static_cast<Py_UCS2*>(data)[index];
2252 DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND");
2253 return static_cast<Py_UCS4*>(data)[index];
2254}
2255
2256PY_EXPORT Py_UCS4 PyUnicode_READ_CHAR_Func(PyObject* obj, Py_ssize_t index) {
2257 Thread* thread = Thread::current();
2258 HandleScope scope(thread);
2259 Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
2260 DCHECK(thread->runtime()->isInstanceOfStr(*str_obj),
2261 "PyUnicode_READ_CHAR must receive a unicode object");
2262 Str str(&scope, strUnderlying(*str_obj));
2263 word byte_offset = thread->strOffset(str, index);
2264 if (byte_offset == str.length()) return Py_UCS4{0};
2265 word num_bytes;
2266 return static_cast<Py_UCS4>(str.codePointAt(byte_offset, &num_bytes));
2267}
2268
2269PY_EXPORT int PyUnicode_IS_ASCII_Func(PyObject* obj) {
2270 Thread* thread = Thread::current();
2271 HandleScope scope(thread);
2272 Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
2273 DCHECK(thread->runtime()->isInstanceOfStr(*str),
2274 "strIsASCII must receive a unicode object");
2275 return strUnderlying(*str).isASCII() ? 1 : 0;
2276}
2277
2278PY_EXPORT int Py_UNICODE_ISALPHA_Func(Py_UCS4 code_point) {
2279 if (code_point > kMaxUnicode) {
2280 return 0;
2281 }
2282 return Unicode::isAlpha(static_cast<int32_t>(code_point)) ? 1 : 0;
2283}
2284
2285PY_EXPORT int Py_UNICODE_ISDECIMAL_Func(Py_UCS4 code_point) {
2286 if (code_point > kMaxUnicode) {
2287 return 0;
2288 }
2289 return Unicode::isDecimal(static_cast<int32_t>(code_point)) ? 1 : 0;
2290}
2291
2292PY_EXPORT int Py_UNICODE_ISDIGIT_Func(Py_UCS4 code_point) {
2293 if (code_point > kMaxUnicode) {
2294 return 0;
2295 }
2296 return Unicode::isDigit(static_cast<int32_t>(code_point)) ? 1 : 0;
2297}
2298
2299PY_EXPORT int Py_UNICODE_ISLINEBREAK_Func(Py_UCS4 code_point) {
2300 if (code_point > kMaxUnicode) {
2301 return 0;
2302 }
2303 return Unicode::isLinebreak(static_cast<int32_t>(code_point)) ? 1 : 0;
2304}
2305
2306PY_EXPORT int Py_UNICODE_ISLOWER_Func(Py_UCS4 code_point) {
2307 if (code_point > kMaxUnicode) {
2308 return 0;
2309 }
2310 return Unicode::isLower(static_cast<int32_t>(code_point)) ? 1 : 0;
2311}
2312
2313PY_EXPORT int Py_UNICODE_ISNUMERIC_Func(Py_UCS4 code_point) {
2314 if (code_point > kMaxUnicode) {
2315 return 0;
2316 }
2317 return Unicode::isNumeric(static_cast<int32_t>(code_point)) ? 1 : 0;
2318}
2319
2320PY_EXPORT int Py_UNICODE_ISPRINTABLE_Func(Py_UCS4 code_point) {
2321 if (code_point > kMaxUnicode) {
2322 return 0;
2323 }
2324 return Unicode::isPrintable(static_cast<int32_t>(code_point)) ? 1 : 0;
2325}
2326
2327PY_EXPORT int Py_UNICODE_ISSPACE_Func(Py_UCS4 code_point) {
2328 if (code_point > kMaxUnicode) {
2329 return 0;
2330 }
2331 return Unicode::isSpace(static_cast<int32_t>(code_point)) ? 1 : 0;
2332}
2333
2334PY_EXPORT int Py_UNICODE_ISTITLE_Func(Py_UCS4 code_point) {
2335 if (code_point > kMaxUnicode) {
2336 return 0;
2337 }
2338 return Unicode::isTitle(static_cast<int32_t>(code_point)) ? 1 : 0;
2339}
2340
2341PY_EXPORT int Py_UNICODE_ISUPPER_Func(Py_UCS4 code_point) {
2342 if (code_point > kMaxUnicode) {
2343 return 0;
2344 }
2345 return Unicode::isUpper(static_cast<int32_t>(code_point)) ? 1 : 0;
2346}
2347
2348PY_EXPORT int Py_UNICODE_TODECIMAL_Func(Py_UCS4 code_point) {
2349 if (code_point > kMaxUnicode) {
2350 return -1;
2351 }
2352 return Unicode::toDecimal(static_cast<int32_t>(code_point));
2353}
2354
2355PY_EXPORT int Py_UNICODE_TODIGIT_Func(Py_UCS4 code_point) {
2356 if (code_point > kMaxUnicode) {
2357 return -1;
2358 }
2359 return Unicode::toDigit(static_cast<int32_t>(code_point));
2360}
2361
2362PY_EXPORT Py_UCS4 Py_UNICODE_TOLOWER_Func(Py_UCS4 code_point) {
2363 if (code_point > kMaxUnicode) {
2364 return code_point;
2365 }
2366 FullCasing lower = Unicode::toLower(static_cast<int32_t>(code_point));
2367 return lower.code_points[0];
2368}
2369
2370PY_EXPORT double Py_UNICODE_TONUMERIC_Func(Py_UCS4 code_point) {
2371 if (code_point > kMaxUnicode) {
2372 return -1.0;
2373 }
2374 return Unicode::toNumeric(static_cast<int32_t>(code_point));
2375}
2376
2377PY_EXPORT Py_UCS4 Py_UNICODE_TOTITLE_Func(Py_UCS4 code_point) {
2378 if (code_point > kMaxUnicode) {
2379 return code_point;
2380 }
2381 FullCasing title = Unicode::toTitle(static_cast<int32_t>(code_point));
2382 return title.code_points[0];
2383}
2384
2385PY_EXPORT Py_UCS4 Py_UNICODE_TOUPPER_Func(Py_UCS4 code_point) {
2386 if (code_point > kMaxUnicode) {
2387 return code_point;
2388 }
2389 FullCasing upper = Unicode::toUpper(static_cast<int32_t>(code_point));
2390 return upper.code_points[0];
2391}
2392
2393PY_EXPORT int _Py_normalize_encoding(const char* encoding, char* lower,
2394 size_t lower_len) {
2395 char* buffer = lower;
2396 const char* lower_end = &lower[lower_len - 1];
2397 bool has_punct = false;
2398 for (char ch = *encoding; ch != '\0'; ch = *++encoding) {
2399 if (Py_ISALNUM(ch) || ch == '.') {
2400 if (has_punct && buffer != lower) {
2401 if (buffer == lower_end) {
2402 return 0;
2403 }
2404 *buffer++ = '_';
2405 }
2406 has_punct = false;
2407
2408 if (buffer == lower_end) {
2409 return 0;
2410 }
2411 *buffer++ = Py_TOLOWER(ch);
2412 } else {
2413 has_punct = true;
2414 }
2415 }
2416 *buffer = '\0';
2417 return 1;
2418}
2419
2420PY_EXPORT PyObject* _PyUnicode_AsUTF8String(PyObject* unicode,
2421 const char* errors) {
2422 DCHECK(unicode != nullptr, "unicode cannot be null");
2423 Thread* thread = Thread::current();
2424 HandleScope scope(thread);
2425 Runtime* runtime = thread->runtime();
2426 Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
2427 if (!runtime->isInstanceOfStr(*obj)) {
2428 thread->raiseBadArgument();
2429 return nullptr;
2430 }
2431 Str str(&scope, strUnderlying(*obj));
2432 if (!strHasSurrogate(str)) {
2433 word length = str.length();
2434 MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length));
2435 result.replaceFromWithStr(0, *str, length);
2436 return ApiHandle::newReference(runtime, result.becomeImmutable());
2437 }
2438 Object errors_obj(&scope, symbolFromError(thread, errors));
2439 Object tuple_obj(&scope, thread->invokeFunction2(
2440 ID(_codecs), ID(utf_8_encode), str, errors_obj));
2441 if (tuple_obj.isError()) {
2442 return nullptr;
2443 }
2444 Tuple tuple(&scope, *tuple_obj);
2445 return ApiHandle::newReference(runtime, tuple.at(0));
2446}
2447
2448PY_EXPORT wchar_t* _Py_DecodeUTF8_surrogateescape(const char* c_str,
2449 Py_ssize_t size,
2450 size_t* wlen) {
2451 DCHECK(c_str != nullptr, "c_str cannot be null");
2452 wchar_t* wc_str =
2453 static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(wchar_t)));
2454 for (Py_ssize_t i = 0; i < size; i++) {
2455 char ch = c_str[i];
2456 // TODO(T57811636): Support UTF-8 arguments on macOS.
2457 // We don't have UTF-8 decoding machinery that is decoupled from the
2458 // runtime
2459 if (ch & 0x80) {
2460 UNIMPLEMENTED("UTF-8 argument support unimplemented");
2461 }
2462 wc_str[i] = static_cast<wchar_t>(ch);
2463 }
2464 wc_str[size] = '\0';
2465 if (wlen != nullptr) {
2466 *wlen = size;
2467 }
2468 return wc_str;
2469}
2470
2471PY_EXPORT int _Py_DecodeUTF8Ex(const char* c_str, Py_ssize_t size,
2472 wchar_t** result, size_t* wlen,
2473 const char** /* reason */,
2474 _Py_error_handler /* surrogateescape */) {
2475 wchar_t* wc_str =
2476 static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(*wc_str)));
2477 if (wc_str == nullptr) {
2478 return -1;
2479 }
2480 for (Py_ssize_t i = 0; i < size; i++) {
2481 byte ch = c_str[i];
2482 // TODO(T57811636): Support UTF-8 decoding decoupled from the runtime.
2483 // We don't have UTF-8 decoding machinery that is decoupled from the
2484 // runtime
2485 if (ch > kMaxASCII) {
2486 UNIMPLEMENTED("UTF-8 argument support unimplemented");
2487 }
2488 wc_str[i] = ch;
2489 }
2490 wc_str[size] = '\0';
2491 *result = wc_str;
2492 if (wlen) {
2493 *wlen = size;
2494 }
2495 return 0;
2496}
2497
2498// UTF-8 encoder using the surrogateescape error handler .
2499//
2500// On success, return 0 and write the newly allocated character string (use
2501// PyMem_Free() to free the memory) into *str.
2502//
2503// On encoding failure, return -2 and write the position of the invalid
2504// surrogate character into *error_pos (if error_pos is set) and the decoding
2505// error message into *reason (if reason is set).
2506//
2507// On memory allocation failure, return -1.
2508PY_EXPORT int _Py_EncodeUTF8Ex(const wchar_t* text, char** str,
2509 size_t* error_pos, const char** reason,
2510 int raw_malloc, _Py_error_handler errors) {
2511 const Py_ssize_t max_char_size = 4;
2512 Py_ssize_t len = std::wcslen(text);
2513 DCHECK(len >= 0, "len must be non-negative");
2514
2515 bool surrogateescape = false;
2516 bool surrogatepass = false;
2517 switch (errors) {
2518 case _Py_ERROR_STRICT:
2519 break;
2520 case _Py_ERROR_SURROGATEESCAPE:
2521 surrogateescape = true;
2522 break;
2523 case _Py_ERROR_SURROGATEPASS:
2524 surrogatepass = true;
2525 break;
2526 default:
2527 return -3;
2528 }
2529
2530 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
2531 return -1;
2532 }
2533 char* bytes;
2534 if (raw_malloc) {
2535 bytes = reinterpret_cast<char*>(PyMem_RawMalloc((len + 1) * max_char_size));
2536 } else {
2537 bytes = reinterpret_cast<char*>(PyMem_Malloc((len + 1) * max_char_size));
2538 }
2539 if (bytes == nullptr) {
2540 return -1;
2541 }
2542
2543 char* p = bytes;
2544 for (Py_ssize_t i = 0; i < len; i++) {
2545 Py_UCS4 ch = text[i];
2546
2547 if (ch < 0x80) {
2548 // Encode ASCII
2549 *p++ = (char)ch;
2550
2551 } else if (ch < 0x0800) {
2552 // Encode Latin-1
2553 *p++ = (char)(0xc0 | (ch >> 6));
2554 *p++ = (char)(0x80 | (ch & 0x3f));
2555 } else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
2556 // surrogateescape error handler
2557 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
2558 if (error_pos != nullptr) {
2559 *error_pos = (size_t)i;
2560 }
2561 if (reason != nullptr) {
2562 *reason = "encoding error";
2563 }
2564 if (raw_malloc) {
2565 PyMem_RawFree(bytes);
2566 } else {
2567 PyMem_Free(bytes);
2568 }
2569 return -2;
2570 }
2571 *p++ = (char)(ch & 0xff);
2572 } else if (ch < 0x10000) {
2573 *p++ = (char)(0xe0 | (ch >> 12));
2574 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2575 *p++ = (char)(0x80 | (ch & 0x3f));
2576 } else {
2577 // ch >= 0x10000
2578 DCHECK(ch <= kMaxUnicode, "ch must be a valid unicode code point");
2579 // Encode UCS4 Unicode ordinals
2580 *p++ = (char)(0xf0 | (ch >> 18));
2581 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2582 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2583 *p++ = (char)(0x80 | (ch & 0x3f));
2584 }
2585 }
2586 *p++ = '\0';
2587
2588 size_t final_size = (p - bytes);
2589 char* bytes2;
2590 if (raw_malloc) {
2591 bytes2 = reinterpret_cast<char*>(PyMem_RawRealloc(bytes, final_size));
2592 } else {
2593 bytes2 = reinterpret_cast<char*>(PyMem_Realloc(bytes, final_size));
2594 }
2595 if (bytes2 == nullptr) {
2596 if (error_pos != nullptr) {
2597 *error_pos = (size_t)-1;
2598 }
2599 if (raw_malloc) {
2600 PyMem_RawFree(bytes);
2601 } else {
2602 PyMem_Free(bytes);
2603 }
2604 return -1;
2605 }
2606 *str = bytes2;
2607 return 0;
2608}
2609
2610} // namespace py