this repo has no description
at trunk 139 lines 4.3 kB view raw
1/* Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) */ 2// @generated by generate_unicode_database.py 3#pragma once 4 5#include <cstdint> 6 7#include "globals.h" 8#include "objects.h" 9#include "unicode.h" 10 11namespace py { 12 13static const int kMaxNameLength = 256; 14 15// Longest decomposition in Unicode 11.0.0: U+FDFA 16static const int kMaxDecomposition = 18; 17 18static_assert(Unicode::kAliasStart == 0xf0000, 19 "Unicode aliases start at unexpected code point"); 20static_assert(Unicode::kAliasCount == 468, 21 "Unexpected number of Unicode aliases"); 22static_assert(Unicode::kNamedSequenceStart == 0xf0200, 23 "Unicode named sequences start at unexpected code point"); 24static_assert(Unicode::kNamedSequenceCount == 442, 25 "Unexpected number of Unicode named sequences"); 26 27enum NormalizationForm : byte { 28 kInvalid = 0, 29 kNFD = 0x3, 30 kNFKD = 0xc, 31 kNFC = 0x30, 32 kNFKC = 0xc0, 33}; 34 35enum : int32_t { 36 kAlphaMask = 0x1, 37 kDecimalMask = 0x2, 38 kDigitMask = 0x4, 39 kLowerMask = 0x8, 40 kLinebreakMask = 0x10, 41 kSpaceMask = 0x20, 42 kTitleMask = 0x40, 43 kUpperMask = 0x80, 44 kXidStartMask = 0x100, 45 kXidContinueMask = 0x200, 46 kPrintableMask = 0x400, 47 kNumericMask = 0x800, 48 kCaseIgnorableMask = 0x1000, 49 kCasedMask = 0x2000, 50 kExtendedCaseMask = 0x4000, 51}; 52 53struct UnicodeChangeRecord { 54 const byte bidirectional; 55 const byte category; 56 const byte decimal; 57 const byte east_asian_width; 58 const byte mirrored; 59 const double numeric; 60}; 61 62struct UnicodeDatabaseRecord { 63 const byte bidirectional; 64 const byte category; 65 const byte combining; // canonical combining class 66 const byte east_asian_width; 67 const bool mirrored; 68 const byte quick_check; 69}; 70 71struct UnicodeDecomposition { 72 const char* prefix; 73 const int count; 74 const int32_t* code_points; 75}; 76 77struct UnicodeNamedSequence { 78 const byte length; 79 const int32_t code_points[4]; 80}; 81 82struct UnicodeTypeRecord { 83 // Note: if more flag space is needed, decimal and digit could be unified 84 const int8_t decimal; 85 const int8_t digit; 86 const int16_t flags; 87 // Deltas to the character or offsets in kExtendedCase 88 const int32_t lower; 89 const int32_t title; 90 const int32_t upper; 91}; 92 93extern const RawSmallStr kBidirectionalNames[]; 94extern const RawSmallStr kCategoryNames[]; 95extern const RawSmallStr kEastAsianWidthNames[]; 96 97// Get a code point from its Unicode name. 98// Returns the code point if the lookup succeeds, -1 if it fails. 99int32_t codePointFromName(const byte* name, word size); 100int32_t codePointFromNameOrNamedSequence(const byte* name, word size); 101 102// Returns the NFC composition given the NFC first and last indices. 103int32_t composeCodePoint(int32_t first, int32_t last); 104 105// Returns the decomposition mapping of the code point. 106UnicodeDecomposition decomposeCodePoint(int32_t code_point); 107 108// Returns the case mapping for code points where offset is insufficient 109int32_t extendedCaseMapping(int32_t index); 110 111// Finds the first/last character of an NFC sequence containing the code point. 112int32_t findNFCFirst(int32_t code_point); 113int32_t findNFCLast(int32_t code_point); 114 115// Write the Unicode name for the given code point into the buffer. 116// Returns true if the name was written successfully, false otherwise. 117bool nameFromCodePoint(int32_t code_point, byte* buffer, word size); 118 119// Returns the normalization of the code point in Unicode 3.2.0, if it differs 120// from the current version. If the normalization is unchanged, returns -1. 121int32_t normalizeOld(int32_t code_point); 122 123// Returns the numeric value of the code point, or -1.0 if not numeric. 124double numericValue(int32_t code_point); 125 126// Returns true if the code point has one of the line break properties "BK", 127// "CR", "LR", or "NL" or the bidirectional type "B". Returns false otherwise. 128bool unicodeIsLinebreak(int32_t code_point); 129 130// Returns true if the code point has the bidirectional type "WS", "B", or "S" 131// or the category "Zs". Returns false otherwise. 132bool unicodeIsWhitespace(int32_t code_point); 133 134const UnicodeChangeRecord* changeRecord(int32_t code_point); 135const UnicodeDatabaseRecord* databaseRecord(int32_t code_point); 136const UnicodeNamedSequence* namedSequence(int32_t code_point); 137const UnicodeTypeRecord* typeRecord(int32_t code_point); 138 139} // namespace py