this repo has no description
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
2#include "builtins.h"
3#include "handles-decl.h"
4#include "layout.h"
5#include "module-builtins.h"
6#include "modules.h"
7#include "objects.h"
8#include "runtime.h"
9#include "symbols.h"
10#include "thread.h"
11#include "type-builtins.h"
12#include "unicode-db.h"
13#include "unicode.h"
14
15namespace py {
16
17void FUNC(unicodedata, __init_module__)(Thread* thread, const Module& module,
18 View<byte> bytecode) {
19 executeFrozenModule(thread, module, bytecode);
20
21 HandleScope scope(thread);
22 Runtime* runtime = thread->runtime();
23 Type ucd_type(&scope, moduleAtById(thread, module, ID(UCD)));
24 Layout ucd_layout(&scope, ucd_type.instanceLayout());
25 Object old_ucd(&scope, runtime->newInstance(ucd_layout));
26 moduleAtPutById(thread, module, ID(ucd_3_2_0), old_ucd);
27}
28
29static int32_t getCodePoint(const Str& src) {
30 word length = src.length();
31 if (length == 0) {
32 return -1;
33 }
34 word char_length;
35 int32_t result = src.codePointAt(0, &char_length);
36 return (length == char_length) ? result : -1;
37}
38
39RawObject FUNC(unicodedata, bidirectional)(Thread* thread, Arguments args) {
40 HandleScope scope(thread);
41 Runtime* runtime = thread->runtime();
42 Object obj(&scope, args.get(0));
43 if (!runtime->isInstanceOfStr(*obj)) {
44 return thread->raiseRequiresType(obj, ID(str));
45 }
46 Str src(&scope, strUnderlying(*obj));
47 int32_t code_point = getCodePoint(src);
48 if (code_point == -1) {
49 return thread->raiseWithFmt(
50 LayoutId::kTypeError,
51 "bidirectional() argument must be a unicode character");
52 }
53 return kBidirectionalNames[databaseRecord(code_point)->bidirectional];
54}
55
56RawObject FUNC(unicodedata, category)(Thread* thread, Arguments args) {
57 HandleScope scope(thread);
58 Runtime* runtime = thread->runtime();
59 Object obj(&scope, args.get(0));
60 if (!runtime->isInstanceOfStr(*obj)) {
61 return thread->raiseRequiresType(obj, ID(str));
62 }
63 Str src(&scope, strUnderlying(*obj));
64 int32_t code_point = getCodePoint(src);
65 if (code_point == -1) {
66 return thread->raiseWithFmt(
67 LayoutId::kTypeError,
68 "category() argument must be a unicode character");
69 }
70 return kCategoryNames[databaseRecord(code_point)->category];
71}
72
73RawObject FUNC(unicodedata, decimal)(Thread* thread, Arguments args) {
74 HandleScope scope(thread);
75 Runtime* runtime = thread->runtime();
76 Object obj(&scope, args.get(0));
77 if (!runtime->isInstanceOfStr(*obj)) {
78 return thread->raiseRequiresType(obj, ID(str));
79 }
80 Str src(&scope, strUnderlying(*obj));
81 int32_t code_point = getCodePoint(src);
82 if (code_point == -1) {
83 return thread->raiseWithFmt(
84 LayoutId::kTypeError, "decimal() argument must be a unicode character");
85 }
86
87 int8_t decimal = Unicode::toDecimal(code_point);
88 if (decimal != -1) {
89 return SmallInt::fromWord(decimal);
90 }
91
92 Object default_value(&scope, args.get(1));
93 if (default_value.isUnbound()) {
94 return thread->raiseWithFmt(LayoutId::kValueError, "not a decimal");
95 }
96 return *default_value;
97}
98
99static void writeDecomposition(UnicodeDecomposition decomp,
100 const MutableBytes& out) {
101 word prefix_length = std::strlen(decomp.prefix);
102 char* dst = reinterpret_cast<char*>(out.address());
103 std::memcpy(dst, decomp.prefix, prefix_length);
104
105 word i = prefix_length;
106 for (word j = 0; j < decomp.count; j++) {
107 if (i > 0) {
108 dst[i++] = ' ';
109 }
110 std::sprintf(&dst[i], "%04X", decomp.code_points[j]);
111 i += 4;
112 }
113 DCHECK(i == out.length(), "expected %d bytes, wrote %d", out.length(), i);
114}
115
116RawObject FUNC(unicodedata, decomposition)(Thread* thread, Arguments args) {
117 HandleScope scope(thread);
118 Runtime* runtime = thread->runtime();
119 Object obj(&scope, args.get(0));
120 if (!runtime->isInstanceOfStr(*obj)) {
121 return thread->raiseRequiresType(obj, ID(str));
122 }
123 Str src(&scope, strUnderlying(*obj));
124 int32_t code_point = getCodePoint(src);
125 if (code_point == -1) {
126 return thread->raiseWithFmt(
127 LayoutId::kTypeError,
128 "decomposition() argument must be a unicode character");
129 }
130
131 UnicodeDecomposition decomp = decomposeCodePoint(code_point);
132 if (decomp.count == 0) {
133 return Str::empty();
134 }
135
136 word prefix_length = std::strlen(decomp.prefix);
137 word result_length = prefix_length + 5 * decomp.count;
138 MutableBytes result(&scope,
139 runtime->newMutableBytesUninitialized(result_length));
140 writeDecomposition(decomp, result);
141 return result.becomeStr();
142}
143
144RawObject FUNC(unicodedata, digit)(Thread* thread, Arguments args) {
145 HandleScope scope(thread);
146 Runtime* runtime = thread->runtime();
147 Object obj(&scope, args.get(0));
148 if (!runtime->isInstanceOfStr(*obj)) {
149 return thread->raiseRequiresType(obj, ID(str));
150 }
151 Str src(&scope, strUnderlying(*obj));
152 int32_t code_point = getCodePoint(src);
153 if (code_point == -1) {
154 return thread->raiseWithFmt(LayoutId::kTypeError,
155 "digit() argument must be a unicode character");
156 }
157
158 int8_t digit = Unicode::toDigit(code_point);
159 if (digit != -1) {
160 return SmallInt::fromWord(digit);
161 }
162
163 Object default_value(&scope, args.get(1));
164 if (default_value.isUnbound()) {
165 return thread->raiseWithFmt(LayoutId::kValueError, "not a digit");
166 }
167 return *default_value;
168}
169
170static RawObject copyName(Thread* thread, const Object& name_obj, byte* buffer,
171 word size) {
172 HandleScope scope(thread);
173 Runtime* runtime = thread->runtime();
174 if (runtime->isInstanceOfStr(*name_obj)) {
175 Str name(&scope, strUnderlying(*name_obj));
176 word length = name.length();
177 if (length > size) {
178 return thread->raiseWithFmt(LayoutId::kKeyError, "name too long");
179 }
180 name.copyTo(buffer, length);
181 return SmallInt::fromWord(length);
182 }
183 if (runtime->isInstanceOfBytes(*name_obj)) {
184 Bytes name(&scope, bytesUnderlying(*name_obj));
185 word length = name.length();
186 if (length > size) {
187 return thread->raiseWithFmt(LayoutId::kKeyError, "name too long");
188 }
189 name.copyTo(buffer, length);
190 return SmallInt::fromWord(length);
191 }
192 if (runtime->isByteslike(*name_obj)) {
193 UNIMPLEMENTED("bytes-like other than bytes");
194 }
195 return thread->raiseWithFmt(LayoutId::kTypeError,
196 "a bytes-like object is required, not '%T'",
197 &name_obj);
198}
199
200RawObject FUNC(unicodedata, lookup)(Thread* thread, Arguments args) {
201 HandleScope scope(thread);
202 Object name(&scope, args.get(0));
203 Runtime* runtime = thread->runtime();
204
205 byte buffer[kMaxNameLength + 1];
206 Object copy_result(&scope, copyName(thread, name, buffer, kMaxNameLength));
207 if (copy_result.isErrorException()) {
208 return *copy_result;
209 }
210 word length = SmallInt::cast(*copy_result).value();
211
212 int32_t code_point = codePointFromNameOrNamedSequence(buffer, length);
213 if (code_point < 0) {
214 buffer[length] = '\0';
215 return thread->raiseWithFmt(LayoutId::kKeyError,
216 "undefined character name '%s'", buffer);
217 }
218 if (Unicode::isNamedSequence(code_point)) {
219 const UnicodeNamedSequence* seq = namedSequence(code_point);
220 return runtime->newStrFromUTF32({seq->code_points, seq->length});
221 }
222 DCHECK_BOUND(code_point, kMaxUnicode);
223 return SmallStr::fromCodePoint(code_point);
224}
225
226static NormalizationForm getForm(const Str& str) {
227 if (str.equalsCStr("NFC")) {
228 return NormalizationForm::kNFC;
229 }
230 if (str.equalsCStr("NFKC")) {
231 return NormalizationForm::kNFKC;
232 }
233 if (str.equalsCStr("NFD")) {
234 return NormalizationForm::kNFD;
235 }
236 if (str.equalsCStr("NFKD")) {
237 return NormalizationForm::kNFKD;
238 }
239 return NormalizationForm::kInvalid;
240}
241
242static bool isNormalized(const Str& str, NormalizationForm form) {
243 byte prev_combining = 0;
244 for (word i = 0, length = str.length(), char_length; i < length;
245 i += char_length) {
246 int32_t code_point = str.codePointAt(i, &char_length);
247 const UnicodeDatabaseRecord* record = databaseRecord(code_point);
248 if ((record->quick_check & form) != 0) {
249 return false;
250 }
251 byte combining = record->combining;
252 if (combining != 0 && combining < prev_combining) {
253 return false;
254 }
255 prev_combining = combining;
256 }
257 return true;
258}
259
260static void decomposeHangul(Thread* thread, const StrArray& buffer,
261 int32_t code_point) {
262 int32_t offset = code_point - Unicode::kHangulSyllableStart;
263 int32_t lead = Unicode::kHangulLeadStart + offset / Unicode::kHangulCodaCount;
264 int32_t vowel =
265 Unicode::kHangulVowelStart +
266 (offset % Unicode::kHangulCodaCount) / Unicode::kHangulTrailCount;
267 int32_t trail =
268 Unicode::kHangulTrailStart + offset % Unicode::kHangulTrailCount;
269
270 Runtime* runtime = thread->runtime();
271 runtime->strArrayAddCodePoint(thread, buffer, lead);
272 runtime->strArrayAddCodePoint(thread, buffer, vowel);
273 if (trail != Unicode::kHangulTrailStart) {
274 runtime->strArrayAddCodePoint(thread, buffer, trail);
275 }
276}
277
278static void sortCanonical(const StrArray& buffer) {
279 word char_length;
280 int32_t code_point = buffer.codePointAt(0, &char_length);
281 byte prev_combining = databaseRecord(code_point)->combining;
282 word result_length = buffer.numItems();
283 for (word i = char_length; i < result_length; i += char_length) {
284 code_point = buffer.codePointAt(i, &char_length);
285 byte combining = databaseRecord(code_point)->combining;
286 if (combining == 0 || prev_combining <= combining) {
287 prev_combining = combining;
288 continue;
289 }
290
291 // Non-canonical order. Insert the code point in order.
292 word first = 0;
293 for (word j = buffer.offsetByCodePoints(i, -2); j >= 0;
294 j = buffer.offsetByCodePoints(j, -1)) {
295 word other_len;
296 int32_t other = buffer.codePointAt(j, &other_len);
297 byte other_combining = databaseRecord(other)->combining;
298 if (other_combining == 0 || other_combining <= combining) {
299 first = j + other_len;
300 break;
301 }
302 }
303 buffer.rotateCodePoint(first, i);
304 }
305}
306
307static word skipIndex(word index, int32_t* skipped, word num_skipped) {
308 for (word i = 0; i < num_skipped; i++) {
309 if (skipped[i] == index) {
310 skipped[i] = skipped[num_skipped - 1];
311 return true;
312 }
313 }
314 return false;
315}
316
317static RawObject compose(Thread* thread, const StrArray& decomposition) {
318 HandleScope scope(thread);
319 Runtime* runtime = thread->runtime();
320 StrArray result(&scope, runtime->newStrArray());
321 word decomp_length = decomposition.numItems();
322
323 int32_t skipped[kMaxDecomposition];
324 for (word char_length, i = 0, num_skipped = 0; i < decomp_length;
325 i += char_length) {
326 int32_t code_point = decomposition.codePointAt(i, &char_length);
327 if (skipIndex(i, skipped, num_skipped)) {
328 num_skipped--;
329 continue;
330 }
331
332 // Hangul Composition
333 if (Unicode::isHangulLead(code_point) && i + char_length < decomp_length) {
334 word vowel_length;
335 int32_t vowel = decomposition.codePointAt(i + char_length, &vowel_length);
336 if (Unicode::isHangulVowel(vowel)) {
337 int32_t lead = code_point - Unicode::kHangulLeadStart;
338 vowel -= Unicode::kHangulVowelStart;
339 code_point = Unicode::kHangulSyllableStart +
340 (lead * Unicode::kHangulVowelCount + vowel) *
341 Unicode::kHangulTrailCount;
342 char_length += vowel_length;
343
344 if (i + char_length < decomp_length) {
345 word trail_length;
346 int32_t trail =
347 decomposition.codePointAt(i + char_length, &trail_length);
348 if (Unicode::isHangulTrail(trail)) {
349 code_point += trail - Unicode::kHangulTrailStart;
350 char_length += trail_length;
351 }
352 }
353 runtime->strArrayAddCodePoint(thread, result, code_point);
354 continue;
355 }
356 }
357
358 int32_t first = findNFCFirst(code_point);
359 if (first == -1) {
360 runtime->strArrayAddCodePoint(thread, result, code_point);
361 continue;
362 }
363
364 // Find next unblocked character.
365 byte combining = 0;
366 for (word j = i + char_length, next_len; j < decomp_length; j += next_len) {
367 int32_t next = decomposition.codePointAt(j, &next_len);
368 byte next_combining = databaseRecord(next)->combining;
369 if (combining != 0) {
370 if (next_combining == 0) {
371 break;
372 }
373 if (next_combining <= combining) {
374 continue;
375 }
376 }
377
378 int32_t last = findNFCLast(next);
379 next = (last == -1) ? 0 : composeCodePoint(first, last);
380 if (next == 0) {
381 if (next_combining == 0) {
382 break;
383 }
384 combining = next_combining;
385 continue;
386 }
387
388 // Replace the original character
389 code_point = next;
390 DCHECK_INDEX(num_skipped, kMaxDecomposition);
391 skipped[num_skipped++] = j;
392 first = findNFCFirst(code_point);
393 if (first == -1) {
394 break;
395 }
396 }
397
398 // Write the output character
399 runtime->strArrayAddCodePoint(thread, result, code_point);
400 }
401
402 return runtime->strFromStrArray(result);
403}
404
405RawObject FUNC(unicodedata, normalize)(Thread* thread, Arguments args) {
406 HandleScope scope(thread);
407 Runtime* runtime = thread->runtime();
408 Object form_obj(&scope, args.get(0));
409 if (!runtime->isInstanceOfStr(*form_obj)) {
410 return thread->raiseRequiresType(form_obj, ID(str));
411 }
412 Object src_obj(&scope, args.get(1));
413 if (!runtime->isInstanceOfStr(*src_obj)) {
414 return thread->raiseRequiresType(src_obj, ID(str));
415 }
416
417 Str src(&scope, strUnderlying(*src_obj));
418 if (src.length() == 0) {
419 return *src_obj;
420 }
421
422 Str form_str(&scope, strUnderlying(*form_obj));
423 NormalizationForm form = getForm(form_str);
424 if (form == NormalizationForm::kInvalid) {
425 return thread->raiseWithFmt(LayoutId::kValueError,
426 "invalid normalization form");
427 }
428
429 if (isNormalized(src, form)) {
430 return *src_obj;
431 }
432
433 // Decomposition
434 StrArray buffer(&scope, runtime->newStrArray());
435 word src_length = src.length();
436 runtime->strArrayEnsureCapacity(thread, buffer, src_length);
437 bool canonical =
438 form == NormalizationForm::kNFC || form == NormalizationForm::kNFD;
439 for (word i = 0, char_length; i < src_length; i += char_length) {
440 int32_t stack[kMaxDecomposition];
441 stack[0] = src.codePointAt(i, &char_length);
442 for (word depth = 1; depth > 0;) {
443 int32_t code_point = stack[--depth];
444 if (Unicode::isHangulSyllable(code_point)) {
445 decomposeHangul(thread, buffer, code_point);
446 continue;
447 }
448
449 UnicodeDecomposition decomp = decomposeCodePoint(code_point);
450 if (decomp.count == 0 || (std::strlen(decomp.prefix) > 0 && canonical)) {
451 runtime->strArrayAddCodePoint(thread, buffer, code_point);
452 continue;
453 }
454
455 for (word j = decomp.count - 1; j >= 0; j--) {
456 stack[depth++] = decomp.code_points[j];
457 }
458 }
459 }
460
461 sortCanonical(buffer);
462 if (form == NormalizationForm::kNFD || form == NormalizationForm::kNFKD) {
463 return runtime->strFromStrArray(buffer);
464 }
465
466 return compose(thread, buffer);
467}
468
469RawObject FUNC(unicodedata, numeric)(Thread* thread, Arguments args) {
470 HandleScope scope(thread);
471 Runtime* runtime = thread->runtime();
472 Object obj(&scope, args.get(0));
473 if (!runtime->isInstanceOfStr(*obj)) {
474 return thread->raiseRequiresType(obj, ID(str));
475 }
476 Str src(&scope, strUnderlying(*obj));
477 int32_t code_point = getCodePoint(src);
478 if (code_point == -1) {
479 return thread->raiseWithFmt(
480 LayoutId::kTypeError, "numeric() argument must be a unicode character");
481 }
482
483 double value = Unicode::toNumeric(code_point);
484 if (value != -1.0) {
485 return runtime->newFloat(value);
486 }
487
488 Object default_value(&scope, args.get(1));
489 if (default_value.isUnbound()) {
490 return thread->raiseWithFmt(LayoutId::kValueError,
491 "not a numeric character");
492 }
493 return *default_value;
494}
495
496RawObject METH(UCD, bidirectional)(Thread* thread, Arguments args) {
497 HandleScope scope(thread);
498 Runtime* runtime = thread->runtime();
499 Object self(&scope, args.get(0));
500 if (!typeIsSubclass(
501 runtime->typeOf(*self),
502 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) {
503 return thread->raiseRequiresType(self, ID(UCD));
504 }
505 Object obj(&scope, args.get(1));
506 if (!runtime->isInstanceOfStr(*obj)) {
507 return thread->raiseRequiresType(obj, ID(str));
508 }
509 Str src(&scope, strUnderlying(*obj));
510 int32_t code_point = getCodePoint(src);
511 if (code_point == -1) {
512 return thread->raiseWithFmt(
513 LayoutId::kTypeError,
514 "bidirectional() argument must be a unicode character");
515 }
516 const UnicodeChangeRecord* record = changeRecord(code_point);
517 if (record->category == 0) {
518 return kBidirectionalNames[0];
519 }
520 if (record->bidirectional != 0xff) {
521 return kBidirectionalNames[record->bidirectional];
522 }
523 return kBidirectionalNames[databaseRecord(code_point)->bidirectional];
524}
525
526RawObject METH(UCD, category)(Thread* thread, Arguments args) {
527 HandleScope scope(thread);
528 Runtime* runtime = thread->runtime();
529 Object self(&scope, args.get(0));
530 if (!typeIsSubclass(
531 runtime->typeOf(*self),
532 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) {
533 return thread->raiseRequiresType(self, ID(UCD));
534 }
535 Object obj(&scope, args.get(1));
536 if (!runtime->isInstanceOfStr(*obj)) {
537 return thread->raiseRequiresType(obj, ID(str));
538 }
539 Str src(&scope, strUnderlying(*obj));
540 int32_t code_point = getCodePoint(src);
541 if (code_point == -1) {
542 return thread->raiseWithFmt(
543 LayoutId::kTypeError,
544 "category() argument must be a unicode character");
545 }
546 byte category = changeRecord(code_point)->category;
547 if (category != 0xff) {
548 return kCategoryNames[category];
549 }
550 return kCategoryNames[databaseRecord(code_point)->category];
551}
552
553RawObject METH(UCD, decomposition)(Thread* thread, Arguments args) {
554 HandleScope scope(thread);
555 Runtime* runtime = thread->runtime();
556 Object self(&scope, args.get(0));
557 if (!typeIsSubclass(
558 runtime->typeOf(*self),
559 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) {
560 return thread->raiseRequiresType(self, ID(UCD));
561 }
562 Object obj(&scope, args.get(1));
563 if (!runtime->isInstanceOfStr(*obj)) {
564 return thread->raiseRequiresType(obj, ID(str));
565 }
566 Str src(&scope, strUnderlying(*obj));
567 int32_t code_point = getCodePoint(src);
568 if (code_point == -1) {
569 return thread->raiseWithFmt(
570 LayoutId::kTypeError,
571 "decomposition() argument must be a unicode character");
572 }
573
574 if (changeRecord(code_point)->category == 0) {
575 return Str::empty();
576 }
577
578 UnicodeDecomposition decomp = decomposeCodePoint(code_point);
579 if (decomp.count == 0) {
580 return Str::empty();
581 }
582
583 word prefix_length = std::strlen(decomp.prefix);
584 word result_length = prefix_length + 5 * decomp.count;
585 MutableBytes result(&scope,
586 runtime->newMutableBytesUninitialized(result_length));
587 writeDecomposition(decomp, result);
588 return result.becomeStr();
589}
590
591RawObject METH(UCD, decimal)(Thread* thread, Arguments args) {
592 HandleScope scope(thread);
593 Runtime* runtime = thread->runtime();
594 Object self(&scope, args.get(0));
595 if (!typeIsSubclass(
596 runtime->typeOf(*self),
597 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) {
598 return thread->raiseRequiresType(self, ID(UCD));
599 }
600 Object obj(&scope, args.get(1));
601 if (!runtime->isInstanceOfStr(*obj)) {
602 return thread->raiseRequiresType(obj, ID(str));
603 }
604 Str src(&scope, strUnderlying(*obj));
605 int32_t code_point = getCodePoint(src);
606 if (code_point == -1) {
607 return thread->raiseWithFmt(
608 LayoutId::kTypeError, "decimal() argument must be a unicode character");
609 }
610
611 word decimal;
612 const UnicodeChangeRecord* record = changeRecord(code_point);
613 if (record->category == 0) {
614 decimal = -1;
615 } else if (record->decimal != kMaxByte) {
616 decimal = record->decimal;
617 } else {
618 decimal = Unicode::toDecimal(code_point);
619 }
620
621 if (decimal != -1) {
622 return SmallInt::fromWord(decimal);
623 }
624
625 Object default_value(&scope, args.get(2));
626 if (default_value.isUnbound()) {
627 return thread->raiseWithFmt(LayoutId::kValueError, "not a decimal");
628 }
629 return *default_value;
630}
631
632RawObject METH(UCD, digit)(Thread* thread, Arguments args) {
633 HandleScope scope(thread);
634 Runtime* runtime = thread->runtime();
635 Object self(&scope, args.get(0));
636 if (!typeIsSubclass(
637 runtime->typeOf(*self),
638 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) {
639 return thread->raiseRequiresType(self, ID(UCD));
640 }
641 Object obj(&scope, args.get(1));
642 if (!runtime->isInstanceOfStr(*obj)) {
643 return thread->raiseRequiresType(obj, ID(str));
644 }
645 Str src(&scope, strUnderlying(*obj));
646 int32_t code_point = getCodePoint(src);
647 if (code_point == -1) {
648 return thread->raiseWithFmt(LayoutId::kTypeError,
649 "digit() argument must be a unicode character");
650 }
651
652 int8_t digit = Unicode::toDigit(code_point);
653 if (digit != -1) {
654 return SmallInt::fromWord(digit);
655 }
656
657 Object default_value(&scope, args.get(2));
658 if (default_value.isUnbound()) {
659 return thread->raiseWithFmt(LayoutId::kValueError, "not a digit");
660 }
661 return *default_value;
662}
663
664RawObject METH(UCD, normalize)(Thread* thread, Arguments args) {
665 HandleScope scope(thread);
666 Runtime* runtime = thread->runtime();
667
668 Object self(&scope, args.get(0));
669 if (!typeIsSubclass(
670 runtime->typeOf(*self),
671 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) {
672 return thread->raiseRequiresType(self, ID(UCD));
673 }
674 Object form_obj(&scope, args.get(1));
675 if (!runtime->isInstanceOfStr(*form_obj)) {
676 return thread->raiseRequiresType(form_obj, ID(str));
677 }
678 Object src_obj(&scope, args.get(2));
679 if (!runtime->isInstanceOfStr(*src_obj)) {
680 return thread->raiseRequiresType(src_obj, ID(str));
681 }
682
683 Str src(&scope, strUnderlying(*src_obj));
684 if (src.length() == 0) {
685 return *src_obj;
686 }
687
688 Str form_str(&scope, strUnderlying(*form_obj));
689 NormalizationForm form = getForm(form_str);
690 if (form == NormalizationForm::kInvalid) {
691 return thread->raiseWithFmt(LayoutId::kValueError,
692 "invalid normalization form");
693 }
694
695 // Decomposition
696 StrArray buffer(&scope, runtime->newStrArray());
697 word src_length = src.length();
698 runtime->strArrayEnsureCapacity(thread, buffer, src_length);
699 bool canonical =
700 form == NormalizationForm::kNFC || form == NormalizationForm::kNFD;
701 for (word i = 0, char_length; i < src_length; i += char_length) {
702 // longest decomposition in Unicode 3.2: U+FDFA
703 int32_t stack[kMaxDecomposition];
704 stack[0] = src.codePointAt(i, &char_length);
705 for (word depth = 1; depth > 0;) {
706 int32_t code_point = stack[--depth];
707 if (Unicode::isHangulSyllable(code_point)) {
708 decomposeHangul(thread, buffer, code_point);
709 continue;
710 }
711
712 int32_t normalization = normalizeOld(code_point);
713 if (normalization >= 0) {
714 stack[depth++] = normalization;
715 continue;
716 }
717
718 if (changeRecord(code_point)->category == 0) {
719 runtime->strArrayAddCodePoint(thread, buffer, code_point);
720 continue;
721 }
722
723 UnicodeDecomposition decomp = decomposeCodePoint(code_point);
724 if (decomp.count == 0 || (std::strlen(decomp.prefix) > 0 && canonical)) {
725 runtime->strArrayAddCodePoint(thread, buffer, code_point);
726 continue;
727 }
728
729 for (word j = decomp.count - 1; j >= 0; j--) {
730 stack[depth++] = decomp.code_points[j];
731 }
732 }
733 }
734
735 sortCanonical(buffer);
736 if (form == NormalizationForm::kNFD || form == NormalizationForm::kNFKD) {
737 return runtime->strFromStrArray(buffer);
738 }
739
740 return compose(thread, buffer);
741}
742
743} // namespace py