this repo has no description
at trunk 743 lines 24 kB view raw
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) 2#include "builtins.h" 3#include "handles-decl.h" 4#include "layout.h" 5#include "module-builtins.h" 6#include "modules.h" 7#include "objects.h" 8#include "runtime.h" 9#include "symbols.h" 10#include "thread.h" 11#include "type-builtins.h" 12#include "unicode-db.h" 13#include "unicode.h" 14 15namespace py { 16 17void FUNC(unicodedata, __init_module__)(Thread* thread, const Module& module, 18 View<byte> bytecode) { 19 executeFrozenModule(thread, module, bytecode); 20 21 HandleScope scope(thread); 22 Runtime* runtime = thread->runtime(); 23 Type ucd_type(&scope, moduleAtById(thread, module, ID(UCD))); 24 Layout ucd_layout(&scope, ucd_type.instanceLayout()); 25 Object old_ucd(&scope, runtime->newInstance(ucd_layout)); 26 moduleAtPutById(thread, module, ID(ucd_3_2_0), old_ucd); 27} 28 29static int32_t getCodePoint(const Str& src) { 30 word length = src.length(); 31 if (length == 0) { 32 return -1; 33 } 34 word char_length; 35 int32_t result = src.codePointAt(0, &char_length); 36 return (length == char_length) ? result : -1; 37} 38 39RawObject FUNC(unicodedata, bidirectional)(Thread* thread, Arguments args) { 40 HandleScope scope(thread); 41 Runtime* runtime = thread->runtime(); 42 Object obj(&scope, args.get(0)); 43 if (!runtime->isInstanceOfStr(*obj)) { 44 return thread->raiseRequiresType(obj, ID(str)); 45 } 46 Str src(&scope, strUnderlying(*obj)); 47 int32_t code_point = getCodePoint(src); 48 if (code_point == -1) { 49 return thread->raiseWithFmt( 50 LayoutId::kTypeError, 51 "bidirectional() argument must be a unicode character"); 52 } 53 return kBidirectionalNames[databaseRecord(code_point)->bidirectional]; 54} 55 56RawObject FUNC(unicodedata, category)(Thread* thread, Arguments args) { 57 HandleScope scope(thread); 58 Runtime* runtime = thread->runtime(); 59 Object obj(&scope, args.get(0)); 60 if (!runtime->isInstanceOfStr(*obj)) { 61 return thread->raiseRequiresType(obj, ID(str)); 62 } 63 Str src(&scope, strUnderlying(*obj)); 64 int32_t code_point = getCodePoint(src); 65 if (code_point == -1) { 66 return thread->raiseWithFmt( 67 LayoutId::kTypeError, 68 "category() argument must be a unicode character"); 69 } 70 return kCategoryNames[databaseRecord(code_point)->category]; 71} 72 73RawObject FUNC(unicodedata, decimal)(Thread* thread, Arguments args) { 74 HandleScope scope(thread); 75 Runtime* runtime = thread->runtime(); 76 Object obj(&scope, args.get(0)); 77 if (!runtime->isInstanceOfStr(*obj)) { 78 return thread->raiseRequiresType(obj, ID(str)); 79 } 80 Str src(&scope, strUnderlying(*obj)); 81 int32_t code_point = getCodePoint(src); 82 if (code_point == -1) { 83 return thread->raiseWithFmt( 84 LayoutId::kTypeError, "decimal() argument must be a unicode character"); 85 } 86 87 int8_t decimal = Unicode::toDecimal(code_point); 88 if (decimal != -1) { 89 return SmallInt::fromWord(decimal); 90 } 91 92 Object default_value(&scope, args.get(1)); 93 if (default_value.isUnbound()) { 94 return thread->raiseWithFmt(LayoutId::kValueError, "not a decimal"); 95 } 96 return *default_value; 97} 98 99static void writeDecomposition(UnicodeDecomposition decomp, 100 const MutableBytes& out) { 101 word prefix_length = std::strlen(decomp.prefix); 102 char* dst = reinterpret_cast<char*>(out.address()); 103 std::memcpy(dst, decomp.prefix, prefix_length); 104 105 word i = prefix_length; 106 for (word j = 0; j < decomp.count; j++) { 107 if (i > 0) { 108 dst[i++] = ' '; 109 } 110 std::sprintf(&dst[i], "%04X", decomp.code_points[j]); 111 i += 4; 112 } 113 DCHECK(i == out.length(), "expected %d bytes, wrote %d", out.length(), i); 114} 115 116RawObject FUNC(unicodedata, decomposition)(Thread* thread, Arguments args) { 117 HandleScope scope(thread); 118 Runtime* runtime = thread->runtime(); 119 Object obj(&scope, args.get(0)); 120 if (!runtime->isInstanceOfStr(*obj)) { 121 return thread->raiseRequiresType(obj, ID(str)); 122 } 123 Str src(&scope, strUnderlying(*obj)); 124 int32_t code_point = getCodePoint(src); 125 if (code_point == -1) { 126 return thread->raiseWithFmt( 127 LayoutId::kTypeError, 128 "decomposition() argument must be a unicode character"); 129 } 130 131 UnicodeDecomposition decomp = decomposeCodePoint(code_point); 132 if (decomp.count == 0) { 133 return Str::empty(); 134 } 135 136 word prefix_length = std::strlen(decomp.prefix); 137 word result_length = prefix_length + 5 * decomp.count; 138 MutableBytes result(&scope, 139 runtime->newMutableBytesUninitialized(result_length)); 140 writeDecomposition(decomp, result); 141 return result.becomeStr(); 142} 143 144RawObject FUNC(unicodedata, digit)(Thread* thread, Arguments args) { 145 HandleScope scope(thread); 146 Runtime* runtime = thread->runtime(); 147 Object obj(&scope, args.get(0)); 148 if (!runtime->isInstanceOfStr(*obj)) { 149 return thread->raiseRequiresType(obj, ID(str)); 150 } 151 Str src(&scope, strUnderlying(*obj)); 152 int32_t code_point = getCodePoint(src); 153 if (code_point == -1) { 154 return thread->raiseWithFmt(LayoutId::kTypeError, 155 "digit() argument must be a unicode character"); 156 } 157 158 int8_t digit = Unicode::toDigit(code_point); 159 if (digit != -1) { 160 return SmallInt::fromWord(digit); 161 } 162 163 Object default_value(&scope, args.get(1)); 164 if (default_value.isUnbound()) { 165 return thread->raiseWithFmt(LayoutId::kValueError, "not a digit"); 166 } 167 return *default_value; 168} 169 170static RawObject copyName(Thread* thread, const Object& name_obj, byte* buffer, 171 word size) { 172 HandleScope scope(thread); 173 Runtime* runtime = thread->runtime(); 174 if (runtime->isInstanceOfStr(*name_obj)) { 175 Str name(&scope, strUnderlying(*name_obj)); 176 word length = name.length(); 177 if (length > size) { 178 return thread->raiseWithFmt(LayoutId::kKeyError, "name too long"); 179 } 180 name.copyTo(buffer, length); 181 return SmallInt::fromWord(length); 182 } 183 if (runtime->isInstanceOfBytes(*name_obj)) { 184 Bytes name(&scope, bytesUnderlying(*name_obj)); 185 word length = name.length(); 186 if (length > size) { 187 return thread->raiseWithFmt(LayoutId::kKeyError, "name too long"); 188 } 189 name.copyTo(buffer, length); 190 return SmallInt::fromWord(length); 191 } 192 if (runtime->isByteslike(*name_obj)) { 193 UNIMPLEMENTED("bytes-like other than bytes"); 194 } 195 return thread->raiseWithFmt(LayoutId::kTypeError, 196 "a bytes-like object is required, not '%T'", 197 &name_obj); 198} 199 200RawObject FUNC(unicodedata, lookup)(Thread* thread, Arguments args) { 201 HandleScope scope(thread); 202 Object name(&scope, args.get(0)); 203 Runtime* runtime = thread->runtime(); 204 205 byte buffer[kMaxNameLength + 1]; 206 Object copy_result(&scope, copyName(thread, name, buffer, kMaxNameLength)); 207 if (copy_result.isErrorException()) { 208 return *copy_result; 209 } 210 word length = SmallInt::cast(*copy_result).value(); 211 212 int32_t code_point = codePointFromNameOrNamedSequence(buffer, length); 213 if (code_point < 0) { 214 buffer[length] = '\0'; 215 return thread->raiseWithFmt(LayoutId::kKeyError, 216 "undefined character name '%s'", buffer); 217 } 218 if (Unicode::isNamedSequence(code_point)) { 219 const UnicodeNamedSequence* seq = namedSequence(code_point); 220 return runtime->newStrFromUTF32({seq->code_points, seq->length}); 221 } 222 DCHECK_BOUND(code_point, kMaxUnicode); 223 return SmallStr::fromCodePoint(code_point); 224} 225 226static NormalizationForm getForm(const Str& str) { 227 if (str.equalsCStr("NFC")) { 228 return NormalizationForm::kNFC; 229 } 230 if (str.equalsCStr("NFKC")) { 231 return NormalizationForm::kNFKC; 232 } 233 if (str.equalsCStr("NFD")) { 234 return NormalizationForm::kNFD; 235 } 236 if (str.equalsCStr("NFKD")) { 237 return NormalizationForm::kNFKD; 238 } 239 return NormalizationForm::kInvalid; 240} 241 242static bool isNormalized(const Str& str, NormalizationForm form) { 243 byte prev_combining = 0; 244 for (word i = 0, length = str.length(), char_length; i < length; 245 i += char_length) { 246 int32_t code_point = str.codePointAt(i, &char_length); 247 const UnicodeDatabaseRecord* record = databaseRecord(code_point); 248 if ((record->quick_check & form) != 0) { 249 return false; 250 } 251 byte combining = record->combining; 252 if (combining != 0 && combining < prev_combining) { 253 return false; 254 } 255 prev_combining = combining; 256 } 257 return true; 258} 259 260static void decomposeHangul(Thread* thread, const StrArray& buffer, 261 int32_t code_point) { 262 int32_t offset = code_point - Unicode::kHangulSyllableStart; 263 int32_t lead = Unicode::kHangulLeadStart + offset / Unicode::kHangulCodaCount; 264 int32_t vowel = 265 Unicode::kHangulVowelStart + 266 (offset % Unicode::kHangulCodaCount) / Unicode::kHangulTrailCount; 267 int32_t trail = 268 Unicode::kHangulTrailStart + offset % Unicode::kHangulTrailCount; 269 270 Runtime* runtime = thread->runtime(); 271 runtime->strArrayAddCodePoint(thread, buffer, lead); 272 runtime->strArrayAddCodePoint(thread, buffer, vowel); 273 if (trail != Unicode::kHangulTrailStart) { 274 runtime->strArrayAddCodePoint(thread, buffer, trail); 275 } 276} 277 278static void sortCanonical(const StrArray& buffer) { 279 word char_length; 280 int32_t code_point = buffer.codePointAt(0, &char_length); 281 byte prev_combining = databaseRecord(code_point)->combining; 282 word result_length = buffer.numItems(); 283 for (word i = char_length; i < result_length; i += char_length) { 284 code_point = buffer.codePointAt(i, &char_length); 285 byte combining = databaseRecord(code_point)->combining; 286 if (combining == 0 || prev_combining <= combining) { 287 prev_combining = combining; 288 continue; 289 } 290 291 // Non-canonical order. Insert the code point in order. 292 word first = 0; 293 for (word j = buffer.offsetByCodePoints(i, -2); j >= 0; 294 j = buffer.offsetByCodePoints(j, -1)) { 295 word other_len; 296 int32_t other = buffer.codePointAt(j, &other_len); 297 byte other_combining = databaseRecord(other)->combining; 298 if (other_combining == 0 || other_combining <= combining) { 299 first = j + other_len; 300 break; 301 } 302 } 303 buffer.rotateCodePoint(first, i); 304 } 305} 306 307static word skipIndex(word index, int32_t* skipped, word num_skipped) { 308 for (word i = 0; i < num_skipped; i++) { 309 if (skipped[i] == index) { 310 skipped[i] = skipped[num_skipped - 1]; 311 return true; 312 } 313 } 314 return false; 315} 316 317static RawObject compose(Thread* thread, const StrArray& decomposition) { 318 HandleScope scope(thread); 319 Runtime* runtime = thread->runtime(); 320 StrArray result(&scope, runtime->newStrArray()); 321 word decomp_length = decomposition.numItems(); 322 323 int32_t skipped[kMaxDecomposition]; 324 for (word char_length, i = 0, num_skipped = 0; i < decomp_length; 325 i += char_length) { 326 int32_t code_point = decomposition.codePointAt(i, &char_length); 327 if (skipIndex(i, skipped, num_skipped)) { 328 num_skipped--; 329 continue; 330 } 331 332 // Hangul Composition 333 if (Unicode::isHangulLead(code_point) && i + char_length < decomp_length) { 334 word vowel_length; 335 int32_t vowel = decomposition.codePointAt(i + char_length, &vowel_length); 336 if (Unicode::isHangulVowel(vowel)) { 337 int32_t lead = code_point - Unicode::kHangulLeadStart; 338 vowel -= Unicode::kHangulVowelStart; 339 code_point = Unicode::kHangulSyllableStart + 340 (lead * Unicode::kHangulVowelCount + vowel) * 341 Unicode::kHangulTrailCount; 342 char_length += vowel_length; 343 344 if (i + char_length < decomp_length) { 345 word trail_length; 346 int32_t trail = 347 decomposition.codePointAt(i + char_length, &trail_length); 348 if (Unicode::isHangulTrail(trail)) { 349 code_point += trail - Unicode::kHangulTrailStart; 350 char_length += trail_length; 351 } 352 } 353 runtime->strArrayAddCodePoint(thread, result, code_point); 354 continue; 355 } 356 } 357 358 int32_t first = findNFCFirst(code_point); 359 if (first == -1) { 360 runtime->strArrayAddCodePoint(thread, result, code_point); 361 continue; 362 } 363 364 // Find next unblocked character. 365 byte combining = 0; 366 for (word j = i + char_length, next_len; j < decomp_length; j += next_len) { 367 int32_t next = decomposition.codePointAt(j, &next_len); 368 byte next_combining = databaseRecord(next)->combining; 369 if (combining != 0) { 370 if (next_combining == 0) { 371 break; 372 } 373 if (next_combining <= combining) { 374 continue; 375 } 376 } 377 378 int32_t last = findNFCLast(next); 379 next = (last == -1) ? 0 : composeCodePoint(first, last); 380 if (next == 0) { 381 if (next_combining == 0) { 382 break; 383 } 384 combining = next_combining; 385 continue; 386 } 387 388 // Replace the original character 389 code_point = next; 390 DCHECK_INDEX(num_skipped, kMaxDecomposition); 391 skipped[num_skipped++] = j; 392 first = findNFCFirst(code_point); 393 if (first == -1) { 394 break; 395 } 396 } 397 398 // Write the output character 399 runtime->strArrayAddCodePoint(thread, result, code_point); 400 } 401 402 return runtime->strFromStrArray(result); 403} 404 405RawObject FUNC(unicodedata, normalize)(Thread* thread, Arguments args) { 406 HandleScope scope(thread); 407 Runtime* runtime = thread->runtime(); 408 Object form_obj(&scope, args.get(0)); 409 if (!runtime->isInstanceOfStr(*form_obj)) { 410 return thread->raiseRequiresType(form_obj, ID(str)); 411 } 412 Object src_obj(&scope, args.get(1)); 413 if (!runtime->isInstanceOfStr(*src_obj)) { 414 return thread->raiseRequiresType(src_obj, ID(str)); 415 } 416 417 Str src(&scope, strUnderlying(*src_obj)); 418 if (src.length() == 0) { 419 return *src_obj; 420 } 421 422 Str form_str(&scope, strUnderlying(*form_obj)); 423 NormalizationForm form = getForm(form_str); 424 if (form == NormalizationForm::kInvalid) { 425 return thread->raiseWithFmt(LayoutId::kValueError, 426 "invalid normalization form"); 427 } 428 429 if (isNormalized(src, form)) { 430 return *src_obj; 431 } 432 433 // Decomposition 434 StrArray buffer(&scope, runtime->newStrArray()); 435 word src_length = src.length(); 436 runtime->strArrayEnsureCapacity(thread, buffer, src_length); 437 bool canonical = 438 form == NormalizationForm::kNFC || form == NormalizationForm::kNFD; 439 for (word i = 0, char_length; i < src_length; i += char_length) { 440 int32_t stack[kMaxDecomposition]; 441 stack[0] = src.codePointAt(i, &char_length); 442 for (word depth = 1; depth > 0;) { 443 int32_t code_point = stack[--depth]; 444 if (Unicode::isHangulSyllable(code_point)) { 445 decomposeHangul(thread, buffer, code_point); 446 continue; 447 } 448 449 UnicodeDecomposition decomp = decomposeCodePoint(code_point); 450 if (decomp.count == 0 || (std::strlen(decomp.prefix) > 0 && canonical)) { 451 runtime->strArrayAddCodePoint(thread, buffer, code_point); 452 continue; 453 } 454 455 for (word j = decomp.count - 1; j >= 0; j--) { 456 stack[depth++] = decomp.code_points[j]; 457 } 458 } 459 } 460 461 sortCanonical(buffer); 462 if (form == NormalizationForm::kNFD || form == NormalizationForm::kNFKD) { 463 return runtime->strFromStrArray(buffer); 464 } 465 466 return compose(thread, buffer); 467} 468 469RawObject FUNC(unicodedata, numeric)(Thread* thread, Arguments args) { 470 HandleScope scope(thread); 471 Runtime* runtime = thread->runtime(); 472 Object obj(&scope, args.get(0)); 473 if (!runtime->isInstanceOfStr(*obj)) { 474 return thread->raiseRequiresType(obj, ID(str)); 475 } 476 Str src(&scope, strUnderlying(*obj)); 477 int32_t code_point = getCodePoint(src); 478 if (code_point == -1) { 479 return thread->raiseWithFmt( 480 LayoutId::kTypeError, "numeric() argument must be a unicode character"); 481 } 482 483 double value = Unicode::toNumeric(code_point); 484 if (value != -1.0) { 485 return runtime->newFloat(value); 486 } 487 488 Object default_value(&scope, args.get(1)); 489 if (default_value.isUnbound()) { 490 return thread->raiseWithFmt(LayoutId::kValueError, 491 "not a numeric character"); 492 } 493 return *default_value; 494} 495 496RawObject METH(UCD, bidirectional)(Thread* thread, Arguments args) { 497 HandleScope scope(thread); 498 Runtime* runtime = thread->runtime(); 499 Object self(&scope, args.get(0)); 500 if (!typeIsSubclass( 501 runtime->typeOf(*self), 502 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) { 503 return thread->raiseRequiresType(self, ID(UCD)); 504 } 505 Object obj(&scope, args.get(1)); 506 if (!runtime->isInstanceOfStr(*obj)) { 507 return thread->raiseRequiresType(obj, ID(str)); 508 } 509 Str src(&scope, strUnderlying(*obj)); 510 int32_t code_point = getCodePoint(src); 511 if (code_point == -1) { 512 return thread->raiseWithFmt( 513 LayoutId::kTypeError, 514 "bidirectional() argument must be a unicode character"); 515 } 516 const UnicodeChangeRecord* record = changeRecord(code_point); 517 if (record->category == 0) { 518 return kBidirectionalNames[0]; 519 } 520 if (record->bidirectional != 0xff) { 521 return kBidirectionalNames[record->bidirectional]; 522 } 523 return kBidirectionalNames[databaseRecord(code_point)->bidirectional]; 524} 525 526RawObject METH(UCD, category)(Thread* thread, Arguments args) { 527 HandleScope scope(thread); 528 Runtime* runtime = thread->runtime(); 529 Object self(&scope, args.get(0)); 530 if (!typeIsSubclass( 531 runtime->typeOf(*self), 532 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) { 533 return thread->raiseRequiresType(self, ID(UCD)); 534 } 535 Object obj(&scope, args.get(1)); 536 if (!runtime->isInstanceOfStr(*obj)) { 537 return thread->raiseRequiresType(obj, ID(str)); 538 } 539 Str src(&scope, strUnderlying(*obj)); 540 int32_t code_point = getCodePoint(src); 541 if (code_point == -1) { 542 return thread->raiseWithFmt( 543 LayoutId::kTypeError, 544 "category() argument must be a unicode character"); 545 } 546 byte category = changeRecord(code_point)->category; 547 if (category != 0xff) { 548 return kCategoryNames[category]; 549 } 550 return kCategoryNames[databaseRecord(code_point)->category]; 551} 552 553RawObject METH(UCD, decomposition)(Thread* thread, Arguments args) { 554 HandleScope scope(thread); 555 Runtime* runtime = thread->runtime(); 556 Object self(&scope, args.get(0)); 557 if (!typeIsSubclass( 558 runtime->typeOf(*self), 559 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) { 560 return thread->raiseRequiresType(self, ID(UCD)); 561 } 562 Object obj(&scope, args.get(1)); 563 if (!runtime->isInstanceOfStr(*obj)) { 564 return thread->raiseRequiresType(obj, ID(str)); 565 } 566 Str src(&scope, strUnderlying(*obj)); 567 int32_t code_point = getCodePoint(src); 568 if (code_point == -1) { 569 return thread->raiseWithFmt( 570 LayoutId::kTypeError, 571 "decomposition() argument must be a unicode character"); 572 } 573 574 if (changeRecord(code_point)->category == 0) { 575 return Str::empty(); 576 } 577 578 UnicodeDecomposition decomp = decomposeCodePoint(code_point); 579 if (decomp.count == 0) { 580 return Str::empty(); 581 } 582 583 word prefix_length = std::strlen(decomp.prefix); 584 word result_length = prefix_length + 5 * decomp.count; 585 MutableBytes result(&scope, 586 runtime->newMutableBytesUninitialized(result_length)); 587 writeDecomposition(decomp, result); 588 return result.becomeStr(); 589} 590 591RawObject METH(UCD, decimal)(Thread* thread, Arguments args) { 592 HandleScope scope(thread); 593 Runtime* runtime = thread->runtime(); 594 Object self(&scope, args.get(0)); 595 if (!typeIsSubclass( 596 runtime->typeOf(*self), 597 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) { 598 return thread->raiseRequiresType(self, ID(UCD)); 599 } 600 Object obj(&scope, args.get(1)); 601 if (!runtime->isInstanceOfStr(*obj)) { 602 return thread->raiseRequiresType(obj, ID(str)); 603 } 604 Str src(&scope, strUnderlying(*obj)); 605 int32_t code_point = getCodePoint(src); 606 if (code_point == -1) { 607 return thread->raiseWithFmt( 608 LayoutId::kTypeError, "decimal() argument must be a unicode character"); 609 } 610 611 word decimal; 612 const UnicodeChangeRecord* record = changeRecord(code_point); 613 if (record->category == 0) { 614 decimal = -1; 615 } else if (record->decimal != kMaxByte) { 616 decimal = record->decimal; 617 } else { 618 decimal = Unicode::toDecimal(code_point); 619 } 620 621 if (decimal != -1) { 622 return SmallInt::fromWord(decimal); 623 } 624 625 Object default_value(&scope, args.get(2)); 626 if (default_value.isUnbound()) { 627 return thread->raiseWithFmt(LayoutId::kValueError, "not a decimal"); 628 } 629 return *default_value; 630} 631 632RawObject METH(UCD, digit)(Thread* thread, Arguments args) { 633 HandleScope scope(thread); 634 Runtime* runtime = thread->runtime(); 635 Object self(&scope, args.get(0)); 636 if (!typeIsSubclass( 637 runtime->typeOf(*self), 638 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) { 639 return thread->raiseRequiresType(self, ID(UCD)); 640 } 641 Object obj(&scope, args.get(1)); 642 if (!runtime->isInstanceOfStr(*obj)) { 643 return thread->raiseRequiresType(obj, ID(str)); 644 } 645 Str src(&scope, strUnderlying(*obj)); 646 int32_t code_point = getCodePoint(src); 647 if (code_point == -1) { 648 return thread->raiseWithFmt(LayoutId::kTypeError, 649 "digit() argument must be a unicode character"); 650 } 651 652 int8_t digit = Unicode::toDigit(code_point); 653 if (digit != -1) { 654 return SmallInt::fromWord(digit); 655 } 656 657 Object default_value(&scope, args.get(2)); 658 if (default_value.isUnbound()) { 659 return thread->raiseWithFmt(LayoutId::kValueError, "not a digit"); 660 } 661 return *default_value; 662} 663 664RawObject METH(UCD, normalize)(Thread* thread, Arguments args) { 665 HandleScope scope(thread); 666 Runtime* runtime = thread->runtime(); 667 668 Object self(&scope, args.get(0)); 669 if (!typeIsSubclass( 670 runtime->typeOf(*self), 671 runtime->lookupNameInModule(thread, ID(unicodedata), ID(UCD)))) { 672 return thread->raiseRequiresType(self, ID(UCD)); 673 } 674 Object form_obj(&scope, args.get(1)); 675 if (!runtime->isInstanceOfStr(*form_obj)) { 676 return thread->raiseRequiresType(form_obj, ID(str)); 677 } 678 Object src_obj(&scope, args.get(2)); 679 if (!runtime->isInstanceOfStr(*src_obj)) { 680 return thread->raiseRequiresType(src_obj, ID(str)); 681 } 682 683 Str src(&scope, strUnderlying(*src_obj)); 684 if (src.length() == 0) { 685 return *src_obj; 686 } 687 688 Str form_str(&scope, strUnderlying(*form_obj)); 689 NormalizationForm form = getForm(form_str); 690 if (form == NormalizationForm::kInvalid) { 691 return thread->raiseWithFmt(LayoutId::kValueError, 692 "invalid normalization form"); 693 } 694 695 // Decomposition 696 StrArray buffer(&scope, runtime->newStrArray()); 697 word src_length = src.length(); 698 runtime->strArrayEnsureCapacity(thread, buffer, src_length); 699 bool canonical = 700 form == NormalizationForm::kNFC || form == NormalizationForm::kNFD; 701 for (word i = 0, char_length; i < src_length; i += char_length) { 702 // longest decomposition in Unicode 3.2: U+FDFA 703 int32_t stack[kMaxDecomposition]; 704 stack[0] = src.codePointAt(i, &char_length); 705 for (word depth = 1; depth > 0;) { 706 int32_t code_point = stack[--depth]; 707 if (Unicode::isHangulSyllable(code_point)) { 708 decomposeHangul(thread, buffer, code_point); 709 continue; 710 } 711 712 int32_t normalization = normalizeOld(code_point); 713 if (normalization >= 0) { 714 stack[depth++] = normalization; 715 continue; 716 } 717 718 if (changeRecord(code_point)->category == 0) { 719 runtime->strArrayAddCodePoint(thread, buffer, code_point); 720 continue; 721 } 722 723 UnicodeDecomposition decomp = decomposeCodePoint(code_point); 724 if (decomp.count == 0 || (std::strlen(decomp.prefix) > 0 && canonical)) { 725 runtime->strArrayAddCodePoint(thread, buffer, code_point); 726 continue; 727 } 728 729 for (word j = decomp.count - 1; j >= 0; j--) { 730 stack[depth++] = decomp.code_points[j]; 731 } 732 } 733 } 734 735 sortCanonical(buffer); 736 if (form == NormalizationForm::kNFD || form == NormalizationForm::kNFKD) { 737 return runtime->strFromStrArray(buffer); 738 } 739 740 return compose(thread, buffer); 741} 742 743} // namespace py