/* Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) */ // @generated by generate_unicode_database.py #pragma once #include #include "globals.h" #include "objects.h" #include "unicode.h" namespace py { static const int kMaxNameLength = 256; // Longest decomposition in Unicode 11.0.0: U+FDFA static const int kMaxDecomposition = 18; static_assert(Unicode::kAliasStart == 0xf0000, "Unicode aliases start at unexpected code point"); static_assert(Unicode::kAliasCount == 468, "Unexpected number of Unicode aliases"); static_assert(Unicode::kNamedSequenceStart == 0xf0200, "Unicode named sequences start at unexpected code point"); static_assert(Unicode::kNamedSequenceCount == 442, "Unexpected number of Unicode named sequences"); enum NormalizationForm : byte { kInvalid = 0, kNFD = 0x3, kNFKD = 0xc, kNFC = 0x30, kNFKC = 0xc0, }; enum : int32_t { kAlphaMask = 0x1, kDecimalMask = 0x2, kDigitMask = 0x4, kLowerMask = 0x8, kLinebreakMask = 0x10, kSpaceMask = 0x20, kTitleMask = 0x40, kUpperMask = 0x80, kXidStartMask = 0x100, kXidContinueMask = 0x200, kPrintableMask = 0x400, kNumericMask = 0x800, kCaseIgnorableMask = 0x1000, kCasedMask = 0x2000, kExtendedCaseMask = 0x4000, }; struct UnicodeChangeRecord { const byte bidirectional; const byte category; const byte decimal; const byte east_asian_width; const byte mirrored; const double numeric; }; struct UnicodeDatabaseRecord { const byte bidirectional; const byte category; const byte combining; // canonical combining class const byte east_asian_width; const bool mirrored; const byte quick_check; }; struct UnicodeDecomposition { const char* prefix; const int count; const int32_t* code_points; }; struct UnicodeNamedSequence { const byte length; const int32_t code_points[4]; }; struct UnicodeTypeRecord { // Note: if more flag space is needed, decimal and digit could be unified const int8_t decimal; const int8_t digit; const int16_t flags; // Deltas to the character or offsets in kExtendedCase const int32_t lower; const int32_t title; const int32_t upper; }; extern const RawSmallStr kBidirectionalNames[]; extern const RawSmallStr kCategoryNames[]; extern const RawSmallStr kEastAsianWidthNames[]; // Get a code point from its Unicode name. // Returns the code point if the lookup succeeds, -1 if it fails. int32_t codePointFromName(const byte* name, word size); int32_t codePointFromNameOrNamedSequence(const byte* name, word size); // Returns the NFC composition given the NFC first and last indices. int32_t composeCodePoint(int32_t first, int32_t last); // Returns the decomposition mapping of the code point. UnicodeDecomposition decomposeCodePoint(int32_t code_point); // Returns the case mapping for code points where offset is insufficient int32_t extendedCaseMapping(int32_t index); // Finds the first/last character of an NFC sequence containing the code point. int32_t findNFCFirst(int32_t code_point); int32_t findNFCLast(int32_t code_point); // Write the Unicode name for the given code point into the buffer. // Returns true if the name was written successfully, false otherwise. bool nameFromCodePoint(int32_t code_point, byte* buffer, word size); // Returns the normalization of the code point in Unicode 3.2.0, if it differs // from the current version. If the normalization is unchanged, returns -1. int32_t normalizeOld(int32_t code_point); // Returns the numeric value of the code point, or -1.0 if not numeric. double numericValue(int32_t code_point); // Returns true if the code point has one of the line break properties "BK", // "CR", "LR", or "NL" or the bidirectional type "B". Returns false otherwise. bool unicodeIsLinebreak(int32_t code_point); // Returns true if the code point has the bidirectional type "WS", "B", or "S" // or the category "Zs". Returns false otherwise. bool unicodeIsWhitespace(int32_t code_point); const UnicodeChangeRecord* changeRecord(int32_t code_point); const UnicodeDatabaseRecord* databaseRecord(int32_t code_point); const UnicodeNamedSequence* namedSequence(int32_t code_point); const UnicodeTypeRecord* typeRecord(int32_t code_point); } // namespace py