this repo has no description
/* Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) */
// @generated by generate_unicode_database.py
3#pragma once
4
5#include <cstdint>
6
7#include "globals.h"
8#include "objects.h"
9#include "unicode.h"
10
11namespace py {
12
// Size bound used for Unicode character names.
// NOTE(review): presumably the buffer capacity callers are expected to pass to
// nameFromCodePoint() -- confirm against its callers.
static constexpr int kMaxNameLength = 256;

// Upper bound on the number of code points in one decomposition.
// Longest decomposition in Unicode 11.0.0: U+FDFA
static constexpr int kMaxDecomposition = 18;
17
18static_assert(Unicode::kAliasStart == 0xf0000,
19 "Unicode aliases start at unexpected code point");
20static_assert(Unicode::kAliasCount == 468,
21 "Unexpected number of Unicode aliases");
22static_assert(Unicode::kNamedSequenceStart == 0xf0200,
23 "Unicode named sequences start at unexpected code point");
24static_assert(Unicode::kNamedSequenceCount == 442,
25 "Unexpected number of Unicode named sequences");
26
27enum NormalizationForm : byte {
28 kInvalid = 0,
29 kNFD = 0x3,
30 kNFKD = 0xc,
31 kNFC = 0x30,
32 kNFKC = 0xc0,
33};
34
// Single-bit property masks. The fifteen masks occupy bits 0 through 14, so
// their union is 0x7fff.
// NOTE(review): presumably tested against UnicodeTypeRecord::flags, which is
// an int16_t -- confirm before adding a sixteenth mask (0x8000).
enum : int32_t {
  kAlphaMask = 0x1,
  kDecimalMask = 0x2,
  kDigitMask = 0x4,
  kLowerMask = 0x8,
  kLinebreakMask = 0x10,
  kSpaceMask = 0x20,
  kTitleMask = 0x40,
  kUpperMask = 0x80,
  kXidStartMask = 0x100,
  kXidContinueMask = 0x200,
  kPrintableMask = 0x400,
  kNumericMask = 0x800,
  kCaseIgnorableMask = 0x1000,
  kCasedMask = 0x2000,
  kExtendedCaseMask = 0x4000,
};
52
53struct UnicodeChangeRecord {
54 const byte bidirectional;
55 const byte category;
56 const byte decimal;
57 const byte east_asian_width;
58 const byte mirrored;
59 const double numeric;
60};
61
62struct UnicodeDatabaseRecord {
63 const byte bidirectional;
64 const byte category;
65 const byte combining; // canonical combining class
66 const byte east_asian_width;
67 const bool mirrored;
68 const byte quick_check;
69};
70
// Result type of decomposeCodePoint(). Holds pointers only; this header does
// not show who owns the pointed-to storage.
// NOTE(review): count is presumably the number of entries readable through
// code_points, and prefix presumably the decomposition tag text -- confirm
// against decomposeCodePoint()'s definition.
struct UnicodeDecomposition {
  const char* prefix;
  const int count;
  const int32_t* code_points;
};
76
77struct UnicodeNamedSequence {
78 const byte length;
79 const int32_t code_points[4];
80};
81
// Immutable per-code-point record of the kind returned by typeRecord().
struct UnicodeTypeRecord {
  // Note: if more flag space is needed, decimal and digit could be unified
  const int8_t decimal;
  const int8_t digit;
  // NOTE(review): presumably a combination of the k*Mask bits declared in this
  // header -- confirm.
  const int16_t flags;
  // Deltas to the character or offsets in kExtendedCase
  const int32_t lower;
  const int32_t title;
  const int32_t upper;
};
92
93extern const RawSmallStr kBidirectionalNames[];
94extern const RawSmallStr kCategoryNames[];
95extern const RawSmallStr kEastAsianWidthNames[];
96
97// Get a code point from its Unicode name.
98// Returns the code point if the lookup succeeds, -1 if it fails.
99int32_t codePointFromName(const byte* name, word size);
100int32_t codePointFromNameOrNamedSequence(const byte* name, word size);
101
// Returns the NFC composition given the NFC first and last indices.
// NOTE(review): first and last are presumably the values produced by
// findNFCFirst() and findNFCLast() -- confirm. The result for a pair with no
// composition is not specified in this header.
int32_t composeCodePoint(int32_t first, int32_t last);
104
105// Returns the decomposition mapping of the code point.
106UnicodeDecomposition decomposeCodePoint(int32_t code_point);
107
// Returns the case mapping for code points where offset is insufficient
// NOTE(review): index is presumably an offset into kExtendedCase, as stored in
// UnicodeTypeRecord::lower/title/upper -- confirm against the definition.
int32_t extendedCaseMapping(int32_t index);
110
// Finds the first/last character of an NFC sequence containing the code point.
// NOTE(review): the value returned when the code point takes part in no NFC
// sequence is not specified in this header -- confirm against the definitions.
int32_t findNFCFirst(int32_t code_point);
int32_t findNFCLast(int32_t code_point);
114
115// Write the Unicode name for the given code point into the buffer.
116// Returns true if the name was written successfully, false otherwise.
117bool nameFromCodePoint(int32_t code_point, byte* buffer, word size);
118
// Returns the normalization of the code point in Unicode 3.2.0, if it differs
// from the current version. If the normalization is unchanged, returns -1.
int32_t normalizeOld(int32_t code_point);
122
// Returns the numeric value of the code point, or -1.0 if not numeric.
double numericValue(int32_t code_point);
125
// Returns true if the code point has one of the line break properties "BK",
// "CR", "LF", or "NL" or the bidirectional type "B". Returns false otherwise.
bool unicodeIsLinebreak(int32_t code_point);
129
// Returns true if the code point has the bidirectional type "WS", "B", or "S"
// or the category "Zs". Returns false otherwise.
bool unicodeIsWhitespace(int32_t code_point);
133
134const UnicodeChangeRecord* changeRecord(int32_t code_point);
135const UnicodeDatabaseRecord* databaseRecord(int32_t code_point);
136const UnicodeNamedSequence* namedSequence(int32_t code_point);
137const UnicodeTypeRecord* typeRecord(int32_t code_point);
138
139} // namespace py