this repo has no description
1/* Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) */
2#pragma once
3
4#include <cstdint>
5
6#include "globals.h"
7#include "utils.h"
8
9namespace py {
10
11// Functions for ASCII code points. These should only be used for bytes-like
12// objects or when a code point is guaranteed to be valid ASCII.
13class ASCII {
14 public:
15 // Predicates
16 static bool isAlnum(byte b);
17 static bool isAlpha(byte b);
18 static bool isControlCharacter(byte b);
19 static bool isDecimal(byte b);
20 static bool isDigit(byte b);
21 static bool isLinebreak(byte b);
22 static bool isLower(byte b);
23 static bool isNumeric(byte b);
24 static bool isPrintable(byte b);
25 static bool isUpper(byte b);
26 static bool isSpace(byte b);
27 static bool isXidContinue(byte b);
28 static bool isXidStart(byte b);
29
30 // Conversion
31 static int8_t toDecimal(byte b);
32 static int8_t toDigit(byte b);
33 static byte toLower(byte b);
34 static double toNumeric(byte b);
35 static byte toUpper(byte b);
36
37 private:
38 DISALLOW_IMPLICIT_CONSTRUCTORS(ASCII);
39};
40
41// Functions corresponding to "C type" functions in CPython,
42// e.g. Py_ISLOWER, Py_TOLOWER, etc.
43class Byte {
44 public:
45 // Predicates
46 static bool isAlnum(byte b);
47 static bool isAlpha(byte b);
48 static bool isDigit(byte b);
49 static bool isHexDigit(byte b);
50 static bool isLower(byte b);
51 static bool isSpace(byte b);
52 static bool isUpper(byte b);
53
54 // Conversion
55 static int8_t toDigit(byte b);
56 static int8_t toHexDigit(byte b);
57 static byte toLower(byte b);
58 static byte toUpper(byte b);
59
60 private:
61 enum Flag : byte {
62 kLower = 1 << 0,
63 kUpper = 1 << 1,
64 kAlpha = kLower | kUpper,
65 kDigit = 1 << 2,
66 kAlnum = kAlpha | kDigit,
67 kSpace = 1 << 4,
68 kHexDigit = 1 << 5,
69 };
70
71 static constexpr byte kTable[256] = {
72 0, // 0x0 '\x00'
73 0, // 0x1 '\x01'
74 0, // 0x2 '\x02'
75 0, // 0x3 '\x03'
76 0, // 0x4 '\x04'
77 0, // 0x5 '\x05'
78 0, // 0x6 '\x06'
79 0, // 0x7 '\x07'
80 0, // 0x8 '\x08'
81 kSpace, // 0x9 '\t'
82 kSpace, // 0xa '\n'
83 kSpace, // 0xb '\v'
84 kSpace, // 0xc '\f'
85 kSpace, // 0xd '\r'
86 0, // 0xe '\x0e'
87 0, // 0xf '\x0f'
88 0, // 0x10 '\x10'
89 0, // 0x11 '\x11'
90 0, // 0x12 '\x12'
91 0, // 0x13 '\x13'
92 0, // 0x14 '\x14'
93 0, // 0x15 '\x15'
94 0, // 0x16 '\x16'
95 0, // 0x17 '\x17'
96 0, // 0x18 '\x18'
97 0, // 0x19 '\x19'
98 0, // 0x1a '\x1a'
99 0, // 0x1b '\x1b'
100 0, // 0x1c '\x1c'
101 0, // 0x1d '\x1d'
102 0, // 0x1e '\x1e'
103 0, // 0x1f '\x1f'
104 kSpace, // ' '
105 0, // 0x21 '!'
106 0, // 0x22 '"'
107 0, // 0x23 '#'
108 0, // 0x24 '$'
109 0, // 0x25 '%'
110 0, // 0x26 '&'
111 0, // 0x27 "'"
112 0, // 0x28 '('
113 0, // 0x29 ')'
114 0, // 0x2a '*'
115 0, // 0x2b '+'
116 0, // 0x2c ','
117 0, // 0x2d '-'
118 0, // 0x2e '.'
119 0, // 0x2f '/'
120 kDigit | kHexDigit, // 0x30 '0'
121 kDigit | kHexDigit, // 0x31 '1'
122 kDigit | kHexDigit, // 0x32 '2'
123 kDigit | kHexDigit, // 0x33 '3'
124 kDigit | kHexDigit, // 0x34 '4'
125 kDigit | kHexDigit, // 0x35 '5'
126 kDigit | kHexDigit, // 0x36 '6'
127 kDigit | kHexDigit, // 0x37 '7'
128 kDigit | kHexDigit, // 0x38 '8'
129 kDigit | kHexDigit, // 0x39 '9'
130 0, // 0x3a ':'
131 0, // 0x3b ';'
132 0, // 0x3c '<'
133 0, // 0x3d '='
134 0, // 0x3e '>'
135 0, // 0x3f '?'
136 0, // 0x40 '@'
137 kUpper | kHexDigit, // 0x41 'A'
138 kUpper | kHexDigit, // 0x42 'B'
139 kUpper | kHexDigit, // 0x43 'C'
140 kUpper | kHexDigit, // 0x44 'D'
141 kUpper | kHexDigit, // 0x45 'E'
142 kUpper | kHexDigit, // 0x46 'F'
143 kUpper, // 0x47 'G'
144 kUpper, // 0x48 'H'
145 kUpper, // 0x49 'I'
146 kUpper, // 0x4a 'J'
147 kUpper, // 0x4b 'K'
148 kUpper, // 0x4c 'L'
149 kUpper, // 0x4d 'M'
150 kUpper, // 0x4e 'N'
151 kUpper, // 0x4f 'O'
152 kUpper, // 0x50 'P'
153 kUpper, // 0x51 'Q'
154 kUpper, // 0x52 'R'
155 kUpper, // 0x53 'S'
156 kUpper, // 0x54 'T'
157 kUpper, // 0x55 'U'
158 kUpper, // 0x56 'V'
159 kUpper, // 0x57 'W'
160 kUpper, // 0x58 'X'
161 kUpper, // 0x59 'Y'
162 kUpper, // 0x5a 'Z'
163 0, // 0x5b '['
164 0, // 0x5c '\\'
165 0, // 0x5d ']'
166 0, // 0x5e '^'
167 0, // 0x5f '_'
168 0, // 0x60 '`'
169 kLower | kHexDigit, // 0x61 'a'
170 kLower | kHexDigit, // 0x62 'b'
171 kLower | kHexDigit, // 0x63 'c'
172 kLower | kHexDigit, // 0x64 'd'
173 kLower | kHexDigit, // 0x65 'e'
174 kLower | kHexDigit, // 0x66 'f'
175 kLower, // 0x67 'g'
176 kLower, // 0x68 'h'
177 kLower, // 0x69 'i'
178 kLower, // 0x6a 'j'
179 kLower, // 0x6b 'k'
180 kLower, // 0x6c 'l'
181 kLower, // 0x6d 'm'
182 kLower, // 0x6e 'n'
183 kLower, // 0x6f 'o'
184 kLower, // 0x70 'p'
185 kLower, // 0x71 'q'
186 kLower, // 0x72 'r'
187 kLower, // 0x73 's'
188 kLower, // 0x74 't'
189 kLower, // 0x75 'u'
190 kLower, // 0x76 'v'
191 kLower, // 0x77 'w'
192 kLower, // 0x78 'x'
193 kLower, // 0x79 'y'
194 kLower, // 0x7a 'z'
195 };
196
197 static constexpr byte kToLower[256] = {
198 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
199 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
200 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
201 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
202 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
203 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
204 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
205 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
206 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
207 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
208 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
209 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
210 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
211 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
212 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3,
213 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
214 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
215 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
216 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
217 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
218 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
219 0xfc, 0xfd, 0xfe, 0xff,
220 };
221
222 static constexpr byte kToUpper[256] = {
223 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
224 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
225 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
226 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
227 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
228 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
229 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
230 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
231 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b,
232 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
233 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
234 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
235 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
236 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
237 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3,
238 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
239 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
240 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
241 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
242 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
243 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
244 0xfc, 0xfd, 0xfe, 0xff,
245 };
246};
247
// Result of lower-, upper-, title-casing or folding a single code point.
// Casing can map one code point to several, so the result is a small
// fixed-size array of code points rather than a single value.
struct FullCasing {
  int32_t code_points[3];
};
254
255class UTF8 {
256 public:
257 static const word kMaxLength = 4;
258 static const byte kSurrogateLeadByte = 0xED;
259 static constexpr byte kBOM[] = {0xef, 0xbb, 0xbf};
260
261 // Predicates
262 static bool isLeadByte(byte b);
263 static bool isTrailByte(byte b);
264
265 // Given the lead byte of a UTF-8 code point, return its length.
266 static word numChars(byte lead_byte);
267
268 private:
269 DISALLOW_IMPLICIT_CONSTRUCTORS(UTF8);
270};
271
272class UTF16 {
273 public:
274 static constexpr byte kBOMLittleEndian[] = {0xff, 0xfe};
275 static constexpr byte kBOMBigEndian[] = {0xfe, 0xff};
276};
277
278class UTF32 {
279 public:
280 static constexpr byte kBOMLittleEndian[] = {0xff, 0xfe, 0, 0};
281 static constexpr byte kBOMBigEndian[] = {0, 0, 0xfe, 0xff};
282};
283
284// Functions for Unicode code points.
285class Unicode {
286 public:
287 // Constants
288 static const int32_t kAliasStart = 0xf0000;
289 static const int32_t kHighSurrogateStart = 0xd800;
290 static const int32_t kHighSurrogateEnd = 0xdbff;
291 static const int32_t kHangulSyllableStart = 0xac00;
292 static const int32_t kHangulLeadStart = 0x1100;
293 static const int32_t kHangulVowelStart = 0x1161;
294 static const int32_t kHangulTrailStart = 0x11a7;
295 static const int32_t kLowSurrogateStart = 0xdc00;
296 static const int32_t kLowSurrogateEnd = 0xdfff;
297 static const int32_t kNamedSequenceStart = 0xf0200;
298 static const int32_t kSurrogateMask = 0x03ff;
299
300 static const int kAliasCount = 468;
301 static const int kHangulLeadCount = 19;
302 static const int kHangulVowelCount = 21;
303 static const int kHangulTrailCount = 28;
304 static const int kHangulCodaCount = kHangulVowelCount * kHangulTrailCount;
305 static const int kHangulSyllableCount = kHangulLeadCount * kHangulCodaCount;
306 static const int kNamedSequenceCount = 442;
307
308 // Predicates
309 static bool isASCII(int32_t code_point);
310 static bool isAlias(int32_t code_point);
311 static bool isAlpha(int32_t code_point);
312 static bool isAlnum(int32_t code_point);
313 static bool isCaseIgnorable(int32_t code_point);
314 static bool isCased(int32_t code_point);
315 static bool isDecimal(int32_t code_point);
316 static bool isDigit(int32_t code_point);
317 static bool isHangulLead(int32_t code_point);
318 static bool isHangulSyllable(int32_t code_point);
319 static bool isHangulTrail(int32_t code_point);
320 static bool isHangulVowel(int32_t code_point);
321 static bool isHighSurrogate(int32_t code_point);
322 static bool isLinebreak(int32_t code_point);
323 static bool isLowSurrogate(int32_t code_point);
324 static bool isLower(int32_t code_point);
325 static bool isNamedSequence(int32_t code_point);
326 static bool isNumeric(int32_t code_point);
327 static bool isPrintable(int32_t code_point);
328 static bool isSpace(int32_t code_point);
329 static bool isSurrogate(int32_t code_point);
330 static bool isTitle(int32_t code_point);
331 static bool isUnfolded(int32_t code_point);
332 static bool isUpper(int32_t code_point);
333 static bool isXidContinue(int32_t code_point);
334 static bool isXidStart(int32_t code_point);
335
336 // Conversion
337 static int32_t combineSurrogates(int32_t high_code_point,
338 int32_t low_code_point);
339 static int32_t highSurrogateFor(int32_t code_point);
340 static int32_t lowSurrogateFor(int32_t code_point);
341 static int8_t toDecimal(int32_t code_point);
342 static int8_t toDigit(int32_t code_point);
343 static FullCasing toFolded(int32_t code_point);
344 static FullCasing toLower(int32_t code_point);
345 static double toNumeric(int32_t code_point);
346 static FullCasing toTitle(int32_t code_point);
347 static FullCasing toUpper(int32_t code_point);
348
349 private:
350 // Slow paths that use the Unicode database.
351 static bool isAlphaDB(int32_t code_point);
352 static bool isCaseIgnorableDB(int32_t code_point);
353 static bool isCasedDB(int32_t code_point);
354 static bool isDecimalDB(int32_t code_point);
355 static bool isDigitDB(int32_t code_point);
356 static bool isLinebreakDB(int32_t code_point);
357 static bool isLowerDB(int32_t code_point);
358 static bool isNumericDB(int32_t code_point);
359 static bool isPrintableDB(int32_t code_point);
360 static bool isSpaceDB(int32_t code_point);
361 static bool isTitleDB(int32_t code_point);
362 static bool isUnfoldedDB(int32_t code_point);
363 static bool isUpperDB(int32_t code_point);
364 static bool isXidContinueDB(int32_t code_point);
365 static bool isXidStartDB(int32_t code_point);
366 static int8_t toDecimalDB(int32_t code_point);
367 static int8_t toDigitDB(int32_t code_point);
368 static FullCasing toFoldedDB(int32_t code_point);
369 static FullCasing toLowerDB(int32_t code_point);
370 static double toNumericDB(int32_t code_point);
371 static FullCasing toTitleDB(int32_t code_point);
372 static FullCasing toUpperDB(int32_t code_point);
373
374 DISALLOW_IMPLICIT_CONSTRUCTORS(Unicode);
375};
376
377// ASCII
378
379inline bool ASCII::isAlnum(byte b) { return isDigit(b) || isAlpha(b); }
380
381inline bool ASCII::isAlpha(byte b) { return isUpper(b) || isLower(b); }
382
383inline bool ASCII::isControlCharacter(byte b) { return b <= 0x1f; }
384
385inline bool ASCII::isDecimal(byte b) { return isDigit(b); }
386
387inline bool ASCII::isDigit(byte b) { return '0' <= b && b <= '9'; }
388
389inline bool ASCII::isLinebreak(byte b) {
390 switch (b) {
391 case '\n':
392 case '\x0B':
393 case '\x0C':
394 case '\r':
395 case '\x1C':
396 case '\x1D':
397 case '\x1E':
398 return true;
399 default:
400 return false;
401 }
402}
403
404inline bool ASCII::isLower(byte b) { return 'a' <= b && b <= 'z'; }
405
406inline bool ASCII::isNumeric(byte b) { return isDigit(b); }
407
408inline bool ASCII::isPrintable(byte b) { return ' ' <= b && b < kMaxASCII; }
409
410inline bool ASCII::isSpace(byte b) {
411 switch (b) {
412 case '\t':
413 case '\n':
414 case '\x0B':
415 case '\x0C':
416 case '\r':
417 case '\x1C':
418 case '\x1D':
419 case '\x1E':
420 case '\x1F':
421 case ' ':
422 return true;
423 default:
424 return false;
425 }
426}
427
428inline bool ASCII::isUpper(byte b) { return 'A' <= b && b <= 'Z'; }
429
430inline bool ASCII::isXidContinue(byte b) { return isXidStart(b) || isDigit(b); }
431
432inline bool ASCII::isXidStart(byte b) { return isAlpha(b) || b == '_'; }
433
434inline int8_t ASCII::toDecimal(byte b) { return toDigit(b); }
435
436inline int8_t ASCII::toDigit(byte b) { return isDigit(b) ? b - '0' : -1; }
437
438inline byte ASCII::toLower(byte b) { return isUpper(b) ? b + ('a' - 'A') : b; }
439
440inline double ASCII::toNumeric(byte b) {
441 return isNumeric(b) ? static_cast<double>(b - '0') : -1.0;
442}
443
444inline byte ASCII::toUpper(byte b) { return isLower(b) ? b - ('a' - 'A') : b; }
445
446// Byte
447
448inline bool Byte::isAlnum(byte b) { return (kTable[b] & kAlnum) != 0; }
449
450inline bool Byte::isAlpha(byte b) { return (kTable[b] & kAlpha) != 0; }
451
452inline bool Byte::isDigit(byte b) { return (kTable[b] & kDigit) != 0; }
453
454inline bool Byte::isLower(byte b) { return (kTable[b] & kLower) != 0; }
455
456inline bool Byte::isSpace(byte b) { return (kTable[b] & kSpace) != 0; }
457
458inline bool Byte::isUpper(byte b) { return (kTable[b] & kUpper) != 0; }
459
460inline bool Byte::isHexDigit(byte b) { return (kTable[b] & kHexDigit) != 0; }
461
462inline int8_t Byte::toDigit(byte b) { return Byte::isDigit(b) ? b - '0' : -1; }
463
464inline int8_t Byte::toHexDigit(byte b) {
465 if (Byte::isDigit(b)) {
466 return b - '0';
467 }
468 if ('a' <= b && b <= 'f') {
469 return b - 'a' + 10;
470 }
471 if ('A' <= b && b <= 'F') {
472 return b - 'A' + 10;
473 }
474 return -1;
475}
476
477inline byte Byte::toLower(byte b) { return kToLower[b]; }
478
479inline byte Byte::toUpper(byte b) { return kToUpper[b]; }
480
481// UTF-8
482
483inline bool UTF8::isLeadByte(byte b) {
484 DCHECK(b < 0xF8, "invalid UTF-8 byte");
485 return (b & 0xC0) != 0x80;
486}
487
488inline bool UTF8::isTrailByte(byte b) { return (b & 0xC0) == 0x80; }
489
490inline word UTF8::numChars(byte lead_byte) {
491 if (lead_byte <= kMaxASCII) {
492 return 1;
493 }
494 if (lead_byte < 0xE0) {
495 DCHECK(lead_byte >= 0xC0, "invalid lead byte");
496 return 2;
497 }
498 if (lead_byte < 0xF0) {
499 return 3;
500 }
501 DCHECK(lead_byte < 0xF8, "invalid lead byte");
502 return 4;
503}
504
505// Unicode
506
507inline bool Unicode::isASCII(int32_t code_point) {
508 return code_point <= kMaxASCII;
509}
510
511inline bool Unicode::isAlias(int32_t code_point) {
512 return (kAliasStart <= code_point) &&
513 (code_point < kAliasStart + kAliasCount);
514}
515
516inline bool Unicode::isAlnum(int32_t code_point) {
517 if (isASCII(code_point)) {
518 return ASCII::isAlnum(code_point);
519 }
520 return Unicode::isAlphaDB(code_point) || Unicode::isDecimalDB(code_point) ||
521 Unicode::isDigitDB(code_point) || Unicode::isNumericDB(code_point);
522}
523
524inline bool Unicode::isAlpha(int32_t code_point) {
525 if (isASCII(code_point)) {
526 return ASCII::isAlpha(code_point);
527 }
528 return Unicode::isAlphaDB(code_point);
529}
530
531inline bool Unicode::isCaseIgnorable(int32_t code_point) {
532 if (isASCII(code_point)) {
533 return !ASCII::isAlpha(code_point);
534 }
535 return isCaseIgnorableDB(code_point);
536}
537
538inline bool Unicode::isCased(int32_t code_point) {
539 if (isASCII(code_point)) {
540 return ASCII::isAlpha(code_point);
541 }
542 return isCasedDB(code_point);
543}
544
545inline bool Unicode::isDecimal(int32_t code_point) {
546 if (isASCII(code_point)) {
547 return ASCII::isDecimal(code_point);
548 }
549 return isDecimalDB(code_point);
550}
551
552inline bool Unicode::isDigit(int32_t code_point) {
553 if (isASCII(code_point)) {
554 return ASCII::isDigit(code_point);
555 }
556 return isDigitDB(code_point);
557}
558
559inline bool Unicode::isHangulLead(int32_t code_point) {
560 return (kHangulLeadStart <= code_point) &&
561 (code_point < kHangulLeadStart + kHangulLeadCount);
562}
563
564inline bool Unicode::isHangulSyllable(int32_t code_point) {
565 return (kHangulSyllableStart <= code_point) &&
566 (code_point < kHangulSyllableStart + kHangulSyllableCount);
567}
568
569inline bool Unicode::isHangulTrail(int32_t code_point) {
570 return (kHangulTrailStart <= code_point) &&
571 (code_point < kHangulTrailStart + kHangulTrailCount);
572}
573
574inline bool Unicode::isHangulVowel(int32_t code_point) {
575 return (kHangulVowelStart <= code_point) &&
576 (code_point < kHangulVowelStart + kHangulVowelCount);
577}
578
579inline bool Unicode::isHighSurrogate(int32_t code_point) {
580 return (kHighSurrogateStart <= code_point) &&
581 (code_point <= kHighSurrogateEnd);
582}
583
584inline bool Unicode::isLinebreak(int32_t code_point) {
585 if (isASCII(code_point)) {
586 return ASCII::isLinebreak(code_point);
587 }
588 return isLinebreakDB(code_point);
589}
590
591inline bool Unicode::isLowSurrogate(int32_t code_point) {
592 return (kLowSurrogateStart <= code_point) && (code_point <= kLowSurrogateEnd);
593}
594
595inline bool Unicode::isLower(int32_t code_point) {
596 if (isASCII(code_point)) {
597 return ASCII::isLower(code_point);
598 }
599 return isLowerDB(code_point);
600}
601
602inline bool Unicode::isNamedSequence(int32_t code_point) {
603 return (kNamedSequenceStart <= code_point) &&
604 (code_point < kNamedSequenceStart + kNamedSequenceCount);
605}
606
607inline bool Unicode::isNumeric(int32_t code_point) {
608 if (isASCII(code_point)) {
609 return ASCII::isNumeric(code_point);
610 }
611 return Unicode::isNumericDB(code_point);
612}
613
614inline bool Unicode::isPrintable(int32_t code_point) {
615 if (isASCII(code_point)) {
616 return ASCII::isPrintable(code_point);
617 }
618 return Unicode::isPrintableDB(code_point);
619}
620
621inline bool Unicode::isSpace(int32_t code_point) {
622 if (isASCII(code_point)) {
623 return ASCII::isSpace(code_point);
624 }
625 return isSpaceDB(code_point);
626}
627
628inline bool Unicode::isSurrogate(int32_t code_point) {
629 return kHighSurrogateStart <= code_point && code_point <= kLowSurrogateEnd;
630}
631
632inline bool Unicode::isTitle(int32_t code_point) {
633 if (isASCII(code_point)) {
634 return false;
635 }
636 return isTitleDB(code_point);
637}
638
639inline bool Unicode::isUnfolded(int32_t code_point) {
640 if (isASCII(code_point)) {
641 return false;
642 }
643 return isUnfoldedDB(code_point);
644}
645
646inline bool Unicode::isUpper(int32_t code_point) {
647 if (isASCII(code_point)) {
648 return ASCII::isUpper(code_point);
649 }
650 return isUpperDB(code_point);
651}
652
653inline bool Unicode::isXidContinue(int32_t code_point) {
654 if (isASCII(code_point)) {
655 return ASCII::isXidContinue(code_point);
656 }
657 return isXidContinueDB(code_point);
658}
659
660inline bool Unicode::isXidStart(int32_t code_point) {
661 if (isASCII(code_point)) {
662 return ASCII::isXidStart(code_point);
663 }
664 return isXidStartDB(code_point);
665}
666
667inline int32_t Unicode::combineSurrogates(int32_t high_code_point,
668 int32_t low_code_point) {
669 DCHECK(Unicode::isHighSurrogate(high_code_point), "expected high surrogate");
670 DCHECK(Unicode::isLowSurrogate(low_code_point), "expected low surrogate");
671 int32_t result = (((high_code_point & kSurrogateMask)) << 10 |
672 (low_code_point & kSurrogateMask)) +
673 0x10000;
674 DCHECK(result <= kMaxUnicode, "result must be valid code point");
675 return result;
676}
677
678inline int32_t Unicode::highSurrogateFor(int32_t code_point) {
679 DCHECK(0x10000 <= code_point && code_point <= kMaxUnicode,
680 "Codepoint must be valid unicode and require more than 16 bits");
681 return kHighSurrogateStart - (0x10000 >> 10) + (code_point >> 10);
682}
683
684inline int32_t Unicode::lowSurrogateFor(int32_t code_point) {
685 DCHECK(0x10000 <= code_point && code_point <= kMaxUnicode,
686 "Codepoint must be valid unicode and require more than 16 bits");
687 return kLowSurrogateStart + (code_point & kSurrogateMask);
688}
689
690inline int8_t Unicode::toDecimal(int32_t code_point) {
691 if (isASCII(code_point)) {
692 return ASCII::toDecimal(code_point);
693 }
694 return toDecimalDB(code_point);
695}
696
697inline int8_t Unicode::toDigit(int32_t code_point) {
698 if (isASCII(code_point)) {
699 return ASCII::toDigit(code_point);
700 }
701 return toDigitDB(code_point);
702}
703
704inline FullCasing Unicode::toFolded(int32_t code_point) {
705 if (isASCII(code_point)) {
706 return {ASCII::toLower(code_point), -1};
707 }
708 return toFoldedDB(code_point);
709}
710
711inline FullCasing Unicode::toLower(int32_t code_point) {
712 if (isASCII(code_point)) {
713 return {ASCII::toLower(code_point), -1};
714 }
715 return toLowerDB(code_point);
716}
717
718inline double Unicode::toNumeric(int32_t code_point) {
719 if (isASCII(code_point)) {
720 return ASCII::toNumeric(code_point);
721 }
722 return toNumericDB(code_point);
723}
724
725inline FullCasing Unicode::toTitle(int32_t code_point) {
726 if (isASCII(code_point)) {
727 return {ASCII::toUpper(code_point), -1};
728 }
729 return toTitleDB(code_point);
730}
731
732inline FullCasing Unicode::toUpper(int32_t code_point) {
733 if (isASCII(code_point)) {
734 return {ASCII::toUpper(code_point), -1};
735 }
736 return toUpperDB(code_point);
737}
738
739} // namespace py