// Scraped page header (not part of the original source file):
//   "this repo has no description" / "at trunk, 739 lines, 25 kB, view raw"
1/* Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) */ 2#pragma once 3 4#include <cstdint> 5 6#include "globals.h" 7#include "utils.h" 8 9namespace py { 10 11// Functions for ASCII code points. These should only be used for bytes-like 12// objects or when a code point is guaranteed to be valid ASCII. 13class ASCII { 14 public: 15 // Predicates 16 static bool isAlnum(byte b); 17 static bool isAlpha(byte b); 18 static bool isControlCharacter(byte b); 19 static bool isDecimal(byte b); 20 static bool isDigit(byte b); 21 static bool isLinebreak(byte b); 22 static bool isLower(byte b); 23 static bool isNumeric(byte b); 24 static bool isPrintable(byte b); 25 static bool isUpper(byte b); 26 static bool isSpace(byte b); 27 static bool isXidContinue(byte b); 28 static bool isXidStart(byte b); 29 30 // Conversion 31 static int8_t toDecimal(byte b); 32 static int8_t toDigit(byte b); 33 static byte toLower(byte b); 34 static double toNumeric(byte b); 35 static byte toUpper(byte b); 36 37 private: 38 DISALLOW_IMPLICIT_CONSTRUCTORS(ASCII); 39}; 40 41// Functions corresponding to "C type" functions in CPython, 42// e.g. Py_ISLOWER, Py_TOLOWER, etc. 
43class Byte { 44 public: 45 // Predicates 46 static bool isAlnum(byte b); 47 static bool isAlpha(byte b); 48 static bool isDigit(byte b); 49 static bool isHexDigit(byte b); 50 static bool isLower(byte b); 51 static bool isSpace(byte b); 52 static bool isUpper(byte b); 53 54 // Conversion 55 static int8_t toDigit(byte b); 56 static int8_t toHexDigit(byte b); 57 static byte toLower(byte b); 58 static byte toUpper(byte b); 59 60 private: 61 enum Flag : byte { 62 kLower = 1 << 0, 63 kUpper = 1 << 1, 64 kAlpha = kLower | kUpper, 65 kDigit = 1 << 2, 66 kAlnum = kAlpha | kDigit, 67 kSpace = 1 << 4, 68 kHexDigit = 1 << 5, 69 }; 70 71 static constexpr byte kTable[256] = { 72 0, // 0x0 '\x00' 73 0, // 0x1 '\x01' 74 0, // 0x2 '\x02' 75 0, // 0x3 '\x03' 76 0, // 0x4 '\x04' 77 0, // 0x5 '\x05' 78 0, // 0x6 '\x06' 79 0, // 0x7 '\x07' 80 0, // 0x8 '\x08' 81 kSpace, // 0x9 '\t' 82 kSpace, // 0xa '\n' 83 kSpace, // 0xb '\v' 84 kSpace, // 0xc '\f' 85 kSpace, // 0xd '\r' 86 0, // 0xe '\x0e' 87 0, // 0xf '\x0f' 88 0, // 0x10 '\x10' 89 0, // 0x11 '\x11' 90 0, // 0x12 '\x12' 91 0, // 0x13 '\x13' 92 0, // 0x14 '\x14' 93 0, // 0x15 '\x15' 94 0, // 0x16 '\x16' 95 0, // 0x17 '\x17' 96 0, // 0x18 '\x18' 97 0, // 0x19 '\x19' 98 0, // 0x1a '\x1a' 99 0, // 0x1b '\x1b' 100 0, // 0x1c '\x1c' 101 0, // 0x1d '\x1d' 102 0, // 0x1e '\x1e' 103 0, // 0x1f '\x1f' 104 kSpace, // ' ' 105 0, // 0x21 '!' 106 0, // 0x22 '"' 107 0, // 0x23 '#' 108 0, // 0x24 '$' 109 0, // 0x25 '%' 110 0, // 0x26 '&' 111 0, // 0x27 "'" 112 0, // 0x28 '(' 113 0, // 0x29 ')' 114 0, // 0x2a '*' 115 0, // 0x2b '+' 116 0, // 0x2c ',' 117 0, // 0x2d '-' 118 0, // 0x2e '.' 
119 0, // 0x2f '/' 120 kDigit | kHexDigit, // 0x30 '0' 121 kDigit | kHexDigit, // 0x31 '1' 122 kDigit | kHexDigit, // 0x32 '2' 123 kDigit | kHexDigit, // 0x33 '3' 124 kDigit | kHexDigit, // 0x34 '4' 125 kDigit | kHexDigit, // 0x35 '5' 126 kDigit | kHexDigit, // 0x36 '6' 127 kDigit | kHexDigit, // 0x37 '7' 128 kDigit | kHexDigit, // 0x38 '8' 129 kDigit | kHexDigit, // 0x39 '9' 130 0, // 0x3a ':' 131 0, // 0x3b ';' 132 0, // 0x3c '<' 133 0, // 0x3d '=' 134 0, // 0x3e '>' 135 0, // 0x3f '?' 136 0, // 0x40 '@' 137 kUpper | kHexDigit, // 0x41 'A' 138 kUpper | kHexDigit, // 0x42 'B' 139 kUpper | kHexDigit, // 0x43 'C' 140 kUpper | kHexDigit, // 0x44 'D' 141 kUpper | kHexDigit, // 0x45 'E' 142 kUpper | kHexDigit, // 0x46 'F' 143 kUpper, // 0x47 'G' 144 kUpper, // 0x48 'H' 145 kUpper, // 0x49 'I' 146 kUpper, // 0x4a 'J' 147 kUpper, // 0x4b 'K' 148 kUpper, // 0x4c 'L' 149 kUpper, // 0x4d 'M' 150 kUpper, // 0x4e 'N' 151 kUpper, // 0x4f 'O' 152 kUpper, // 0x50 'P' 153 kUpper, // 0x51 'Q' 154 kUpper, // 0x52 'R' 155 kUpper, // 0x53 'S' 156 kUpper, // 0x54 'T' 157 kUpper, // 0x55 'U' 158 kUpper, // 0x56 'V' 159 kUpper, // 0x57 'W' 160 kUpper, // 0x58 'X' 161 kUpper, // 0x59 'Y' 162 kUpper, // 0x5a 'Z' 163 0, // 0x5b '[' 164 0, // 0x5c '\\' 165 0, // 0x5d ']' 166 0, // 0x5e '^' 167 0, // 0x5f '_' 168 0, // 0x60 '`' 169 kLower | kHexDigit, // 0x61 'a' 170 kLower | kHexDigit, // 0x62 'b' 171 kLower | kHexDigit, // 0x63 'c' 172 kLower | kHexDigit, // 0x64 'd' 173 kLower | kHexDigit, // 0x65 'e' 174 kLower | kHexDigit, // 0x66 'f' 175 kLower, // 0x67 'g' 176 kLower, // 0x68 'h' 177 kLower, // 0x69 'i' 178 kLower, // 0x6a 'j' 179 kLower, // 0x6b 'k' 180 kLower, // 0x6c 'l' 181 kLower, // 0x6d 'm' 182 kLower, // 0x6e 'n' 183 kLower, // 0x6f 'o' 184 kLower, // 0x70 'p' 185 kLower, // 0x71 'q' 186 kLower, // 0x72 'r' 187 kLower, // 0x73 's' 188 kLower, // 0x74 't' 189 kLower, // 0x75 'u' 190 kLower, // 0x76 'v' 191 kLower, // 0x77 'w' 192 kLower, // 0x78 'x' 193 kLower, // 0x79 'y' 194 
kLower, // 0x7a 'z' 195 }; 196 197 static constexpr byte kToLower[256] = { 198 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 199 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 200 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 201 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 202 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 203 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 204 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 205 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 206 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 207 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 208 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 209 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 210 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 211 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 212 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 213 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 214 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 215 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 216 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 217 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 218 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 219 0xfc, 0xfd, 0xfe, 0xff, 220 }; 221 222 static constexpr byte kToUpper[256] = { 223 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 224 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 225 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 226 0x24, 0x25, 
0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 227 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 228 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 229 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 230 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 231 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 232 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 233 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 234 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 235 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 236 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 237 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 238 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 239 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 240 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 241 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 242 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 243 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 244 0xfc, 0xfd, 0xfe, 0xff, 245 }; 246}; 247 248// Represents the possible result of casing a codepoint. Since lower-, upper-, 249// and title-casing a codepoint can be a one-to-many mapping, this cannot be 250// represented as a single value. 251struct FullCasing { 252 int32_t code_points[3]; 253}; 254 255class UTF8 { 256 public: 257 static const word kMaxLength = 4; 258 static const byte kSurrogateLeadByte = 0xED; 259 static constexpr byte kBOM[] = {0xef, 0xbb, 0xbf}; 260 261 // Predicates 262 static bool isLeadByte(byte b); 263 static bool isTrailByte(byte b); 264 265 // Given the lead byte of a UTF-8 code point, return its length. 
266 static word numChars(byte lead_byte); 267 268 private: 269 DISALLOW_IMPLICIT_CONSTRUCTORS(UTF8); 270}; 271 272class UTF16 { 273 public: 274 static constexpr byte kBOMLittleEndian[] = {0xff, 0xfe}; 275 static constexpr byte kBOMBigEndian[] = {0xfe, 0xff}; 276}; 277 278class UTF32 { 279 public: 280 static constexpr byte kBOMLittleEndian[] = {0xff, 0xfe, 0, 0}; 281 static constexpr byte kBOMBigEndian[] = {0, 0, 0xfe, 0xff}; 282}; 283 284// Functions for Unicode code points. 285class Unicode { 286 public: 287 // Constants 288 static const int32_t kAliasStart = 0xf0000; 289 static const int32_t kHighSurrogateStart = 0xd800; 290 static const int32_t kHighSurrogateEnd = 0xdbff; 291 static const int32_t kHangulSyllableStart = 0xac00; 292 static const int32_t kHangulLeadStart = 0x1100; 293 static const int32_t kHangulVowelStart = 0x1161; 294 static const int32_t kHangulTrailStart = 0x11a7; 295 static const int32_t kLowSurrogateStart = 0xdc00; 296 static const int32_t kLowSurrogateEnd = 0xdfff; 297 static const int32_t kNamedSequenceStart = 0xf0200; 298 static const int32_t kSurrogateMask = 0x03ff; 299 300 static const int kAliasCount = 468; 301 static const int kHangulLeadCount = 19; 302 static const int kHangulVowelCount = 21; 303 static const int kHangulTrailCount = 28; 304 static const int kHangulCodaCount = kHangulVowelCount * kHangulTrailCount; 305 static const int kHangulSyllableCount = kHangulLeadCount * kHangulCodaCount; 306 static const int kNamedSequenceCount = 442; 307 308 // Predicates 309 static bool isASCII(int32_t code_point); 310 static bool isAlias(int32_t code_point); 311 static bool isAlpha(int32_t code_point); 312 static bool isAlnum(int32_t code_point); 313 static bool isCaseIgnorable(int32_t code_point); 314 static bool isCased(int32_t code_point); 315 static bool isDecimal(int32_t code_point); 316 static bool isDigit(int32_t code_point); 317 static bool isHangulLead(int32_t code_point); 318 static bool isHangulSyllable(int32_t code_point); 319 
static bool isHangulTrail(int32_t code_point); 320 static bool isHangulVowel(int32_t code_point); 321 static bool isHighSurrogate(int32_t code_point); 322 static bool isLinebreak(int32_t code_point); 323 static bool isLowSurrogate(int32_t code_point); 324 static bool isLower(int32_t code_point); 325 static bool isNamedSequence(int32_t code_point); 326 static bool isNumeric(int32_t code_point); 327 static bool isPrintable(int32_t code_point); 328 static bool isSpace(int32_t code_point); 329 static bool isSurrogate(int32_t code_point); 330 static bool isTitle(int32_t code_point); 331 static bool isUnfolded(int32_t code_point); 332 static bool isUpper(int32_t code_point); 333 static bool isXidContinue(int32_t code_point); 334 static bool isXidStart(int32_t code_point); 335 336 // Conversion 337 static int32_t combineSurrogates(int32_t high_code_point, 338 int32_t low_code_point); 339 static int32_t highSurrogateFor(int32_t code_point); 340 static int32_t lowSurrogateFor(int32_t code_point); 341 static int8_t toDecimal(int32_t code_point); 342 static int8_t toDigit(int32_t code_point); 343 static FullCasing toFolded(int32_t code_point); 344 static FullCasing toLower(int32_t code_point); 345 static double toNumeric(int32_t code_point); 346 static FullCasing toTitle(int32_t code_point); 347 static FullCasing toUpper(int32_t code_point); 348 349 private: 350 // Slow paths that use the Unicode database. 
351 static bool isAlphaDB(int32_t code_point); 352 static bool isCaseIgnorableDB(int32_t code_point); 353 static bool isCasedDB(int32_t code_point); 354 static bool isDecimalDB(int32_t code_point); 355 static bool isDigitDB(int32_t code_point); 356 static bool isLinebreakDB(int32_t code_point); 357 static bool isLowerDB(int32_t code_point); 358 static bool isNumericDB(int32_t code_point); 359 static bool isPrintableDB(int32_t code_point); 360 static bool isSpaceDB(int32_t code_point); 361 static bool isTitleDB(int32_t code_point); 362 static bool isUnfoldedDB(int32_t code_point); 363 static bool isUpperDB(int32_t code_point); 364 static bool isXidContinueDB(int32_t code_point); 365 static bool isXidStartDB(int32_t code_point); 366 static int8_t toDecimalDB(int32_t code_point); 367 static int8_t toDigitDB(int32_t code_point); 368 static FullCasing toFoldedDB(int32_t code_point); 369 static FullCasing toLowerDB(int32_t code_point); 370 static double toNumericDB(int32_t code_point); 371 static FullCasing toTitleDB(int32_t code_point); 372 static FullCasing toUpperDB(int32_t code_point); 373 374 DISALLOW_IMPLICIT_CONSTRUCTORS(Unicode); 375}; 376 377// ASCII 378 379inline bool ASCII::isAlnum(byte b) { return isDigit(b) || isAlpha(b); } 380 381inline bool ASCII::isAlpha(byte b) { return isUpper(b) || isLower(b); } 382 383inline bool ASCII::isControlCharacter(byte b) { return b <= 0x1f; } 384 385inline bool ASCII::isDecimal(byte b) { return isDigit(b); } 386 387inline bool ASCII::isDigit(byte b) { return '0' <= b && b <= '9'; } 388 389inline bool ASCII::isLinebreak(byte b) { 390 switch (b) { 391 case '\n': 392 case '\x0B': 393 case '\x0C': 394 case '\r': 395 case '\x1C': 396 case '\x1D': 397 case '\x1E': 398 return true; 399 default: 400 return false; 401 } 402} 403 404inline bool ASCII::isLower(byte b) { return 'a' <= b && b <= 'z'; } 405 406inline bool ASCII::isNumeric(byte b) { return isDigit(b); } 407 408inline bool ASCII::isPrintable(byte b) { return ' ' <= b && b < 
kMaxASCII; } 409 410inline bool ASCII::isSpace(byte b) { 411 switch (b) { 412 case '\t': 413 case '\n': 414 case '\x0B': 415 case '\x0C': 416 case '\r': 417 case '\x1C': 418 case '\x1D': 419 case '\x1E': 420 case '\x1F': 421 case ' ': 422 return true; 423 default: 424 return false; 425 } 426} 427 428inline bool ASCII::isUpper(byte b) { return 'A' <= b && b <= 'Z'; } 429 430inline bool ASCII::isXidContinue(byte b) { return isXidStart(b) || isDigit(b); } 431 432inline bool ASCII::isXidStart(byte b) { return isAlpha(b) || b == '_'; } 433 434inline int8_t ASCII::toDecimal(byte b) { return toDigit(b); } 435 436inline int8_t ASCII::toDigit(byte b) { return isDigit(b) ? b - '0' : -1; } 437 438inline byte ASCII::toLower(byte b) { return isUpper(b) ? b + ('a' - 'A') : b; } 439 440inline double ASCII::toNumeric(byte b) { 441 return isNumeric(b) ? static_cast<double>(b - '0') : -1.0; 442} 443 444inline byte ASCII::toUpper(byte b) { return isLower(b) ? b - ('a' - 'A') : b; } 445 446// Byte 447 448inline bool Byte::isAlnum(byte b) { return (kTable[b] & kAlnum) != 0; } 449 450inline bool Byte::isAlpha(byte b) { return (kTable[b] & kAlpha) != 0; } 451 452inline bool Byte::isDigit(byte b) { return (kTable[b] & kDigit) != 0; } 453 454inline bool Byte::isLower(byte b) { return (kTable[b] & kLower) != 0; } 455 456inline bool Byte::isSpace(byte b) { return (kTable[b] & kSpace) != 0; } 457 458inline bool Byte::isUpper(byte b) { return (kTable[b] & kUpper) != 0; } 459 460inline bool Byte::isHexDigit(byte b) { return (kTable[b] & kHexDigit) != 0; } 461 462inline int8_t Byte::toDigit(byte b) { return Byte::isDigit(b) ? 
b - '0' : -1; } 463 464inline int8_t Byte::toHexDigit(byte b) { 465 if (Byte::isDigit(b)) { 466 return b - '0'; 467 } 468 if ('a' <= b && b <= 'f') { 469 return b - 'a' + 10; 470 } 471 if ('A' <= b && b <= 'F') { 472 return b - 'A' + 10; 473 } 474 return -1; 475} 476 477inline byte Byte::toLower(byte b) { return kToLower[b]; } 478 479inline byte Byte::toUpper(byte b) { return kToUpper[b]; } 480 481// UTF-8 482 483inline bool UTF8::isLeadByte(byte b) { 484 DCHECK(b < 0xF8, "invalid UTF-8 byte"); 485 return (b & 0xC0) != 0x80; 486} 487 488inline bool UTF8::isTrailByte(byte b) { return (b & 0xC0) == 0x80; } 489 490inline word UTF8::numChars(byte lead_byte) { 491 if (lead_byte <= kMaxASCII) { 492 return 1; 493 } 494 if (lead_byte < 0xE0) { 495 DCHECK(lead_byte >= 0xC0, "invalid lead byte"); 496 return 2; 497 } 498 if (lead_byte < 0xF0) { 499 return 3; 500 } 501 DCHECK(lead_byte < 0xF8, "invalid lead byte"); 502 return 4; 503} 504 505// Unicode 506 507inline bool Unicode::isASCII(int32_t code_point) { 508 return code_point <= kMaxASCII; 509} 510 511inline bool Unicode::isAlias(int32_t code_point) { 512 return (kAliasStart <= code_point) && 513 (code_point < kAliasStart + kAliasCount); 514} 515 516inline bool Unicode::isAlnum(int32_t code_point) { 517 if (isASCII(code_point)) { 518 return ASCII::isAlnum(code_point); 519 } 520 return Unicode::isAlphaDB(code_point) || Unicode::isDecimalDB(code_point) || 521 Unicode::isDigitDB(code_point) || Unicode::isNumericDB(code_point); 522} 523 524inline bool Unicode::isAlpha(int32_t code_point) { 525 if (isASCII(code_point)) { 526 return ASCII::isAlpha(code_point); 527 } 528 return Unicode::isAlphaDB(code_point); 529} 530 531inline bool Unicode::isCaseIgnorable(int32_t code_point) { 532 if (isASCII(code_point)) { 533 return !ASCII::isAlpha(code_point); 534 } 535 return isCaseIgnorableDB(code_point); 536} 537 538inline bool Unicode::isCased(int32_t code_point) { 539 if (isASCII(code_point)) { 540 return ASCII::isAlpha(code_point); 541 
} 542 return isCasedDB(code_point); 543} 544 545inline bool Unicode::isDecimal(int32_t code_point) { 546 if (isASCII(code_point)) { 547 return ASCII::isDecimal(code_point); 548 } 549 return isDecimalDB(code_point); 550} 551 552inline bool Unicode::isDigit(int32_t code_point) { 553 if (isASCII(code_point)) { 554 return ASCII::isDigit(code_point); 555 } 556 return isDigitDB(code_point); 557} 558 559inline bool Unicode::isHangulLead(int32_t code_point) { 560 return (kHangulLeadStart <= code_point) && 561 (code_point < kHangulLeadStart + kHangulLeadCount); 562} 563 564inline bool Unicode::isHangulSyllable(int32_t code_point) { 565 return (kHangulSyllableStart <= code_point) && 566 (code_point < kHangulSyllableStart + kHangulSyllableCount); 567} 568 569inline bool Unicode::isHangulTrail(int32_t code_point) { 570 return (kHangulTrailStart <= code_point) && 571 (code_point < kHangulTrailStart + kHangulTrailCount); 572} 573 574inline bool Unicode::isHangulVowel(int32_t code_point) { 575 return (kHangulVowelStart <= code_point) && 576 (code_point < kHangulVowelStart + kHangulVowelCount); 577} 578 579inline bool Unicode::isHighSurrogate(int32_t code_point) { 580 return (kHighSurrogateStart <= code_point) && 581 (code_point <= kHighSurrogateEnd); 582} 583 584inline bool Unicode::isLinebreak(int32_t code_point) { 585 if (isASCII(code_point)) { 586 return ASCII::isLinebreak(code_point); 587 } 588 return isLinebreakDB(code_point); 589} 590 591inline bool Unicode::isLowSurrogate(int32_t code_point) { 592 return (kLowSurrogateStart <= code_point) && (code_point <= kLowSurrogateEnd); 593} 594 595inline bool Unicode::isLower(int32_t code_point) { 596 if (isASCII(code_point)) { 597 return ASCII::isLower(code_point); 598 } 599 return isLowerDB(code_point); 600} 601 602inline bool Unicode::isNamedSequence(int32_t code_point) { 603 return (kNamedSequenceStart <= code_point) && 604 (code_point < kNamedSequenceStart + kNamedSequenceCount); 605} 606 607inline bool 
Unicode::isNumeric(int32_t code_point) { 608 if (isASCII(code_point)) { 609 return ASCII::isNumeric(code_point); 610 } 611 return Unicode::isNumericDB(code_point); 612} 613 614inline bool Unicode::isPrintable(int32_t code_point) { 615 if (isASCII(code_point)) { 616 return ASCII::isPrintable(code_point); 617 } 618 return Unicode::isPrintableDB(code_point); 619} 620 621inline bool Unicode::isSpace(int32_t code_point) { 622 if (isASCII(code_point)) { 623 return ASCII::isSpace(code_point); 624 } 625 return isSpaceDB(code_point); 626} 627 628inline bool Unicode::isSurrogate(int32_t code_point) { 629 return kHighSurrogateStart <= code_point && code_point <= kLowSurrogateEnd; 630} 631 632inline bool Unicode::isTitle(int32_t code_point) { 633 if (isASCII(code_point)) { 634 return false; 635 } 636 return isTitleDB(code_point); 637} 638 639inline bool Unicode::isUnfolded(int32_t code_point) { 640 if (isASCII(code_point)) { 641 return false; 642 } 643 return isUnfoldedDB(code_point); 644} 645 646inline bool Unicode::isUpper(int32_t code_point) { 647 if (isASCII(code_point)) { 648 return ASCII::isUpper(code_point); 649 } 650 return isUpperDB(code_point); 651} 652 653inline bool Unicode::isXidContinue(int32_t code_point) { 654 if (isASCII(code_point)) { 655 return ASCII::isXidContinue(code_point); 656 } 657 return isXidContinueDB(code_point); 658} 659 660inline bool Unicode::isXidStart(int32_t code_point) { 661 if (isASCII(code_point)) { 662 return ASCII::isXidStart(code_point); 663 } 664 return isXidStartDB(code_point); 665} 666 667inline int32_t Unicode::combineSurrogates(int32_t high_code_point, 668 int32_t low_code_point) { 669 DCHECK(Unicode::isHighSurrogate(high_code_point), "expected high surrogate"); 670 DCHECK(Unicode::isLowSurrogate(low_code_point), "expected low surrogate"); 671 int32_t result = (((high_code_point & kSurrogateMask)) << 10 | 672 (low_code_point & kSurrogateMask)) + 673 0x10000; 674 DCHECK(result <= kMaxUnicode, "result must be valid code point"); 675 
return result; 676} 677 678inline int32_t Unicode::highSurrogateFor(int32_t code_point) { 679 DCHECK(0x10000 <= code_point && code_point <= kMaxUnicode, 680 "Codepoint must be valid unicode and require more than 16 bits"); 681 return kHighSurrogateStart - (0x10000 >> 10) + (code_point >> 10); 682} 683 684inline int32_t Unicode::lowSurrogateFor(int32_t code_point) { 685 DCHECK(0x10000 <= code_point && code_point <= kMaxUnicode, 686 "Codepoint must be valid unicode and require more than 16 bits"); 687 return kLowSurrogateStart + (code_point & kSurrogateMask); 688} 689 690inline int8_t Unicode::toDecimal(int32_t code_point) { 691 if (isASCII(code_point)) { 692 return ASCII::toDecimal(code_point); 693 } 694 return toDecimalDB(code_point); 695} 696 697inline int8_t Unicode::toDigit(int32_t code_point) { 698 if (isASCII(code_point)) { 699 return ASCII::toDigit(code_point); 700 } 701 return toDigitDB(code_point); 702} 703 704inline FullCasing Unicode::toFolded(int32_t code_point) { 705 if (isASCII(code_point)) { 706 return {ASCII::toLower(code_point), -1}; 707 } 708 return toFoldedDB(code_point); 709} 710 711inline FullCasing Unicode::toLower(int32_t code_point) { 712 if (isASCII(code_point)) { 713 return {ASCII::toLower(code_point), -1}; 714 } 715 return toLowerDB(code_point); 716} 717 718inline double Unicode::toNumeric(int32_t code_point) { 719 if (isASCII(code_point)) { 720 return ASCII::toNumeric(code_point); 721 } 722 return toNumericDB(code_point); 723} 724 725inline FullCasing Unicode::toTitle(int32_t code_point) { 726 if (isASCII(code_point)) { 727 return {ASCII::toUpper(code_point), -1}; 728 } 729 return toTitleDB(code_point); 730} 731 732inline FullCasing Unicode::toUpper(int32_t code_point) { 733 if (isASCII(code_point)) { 734 return {ASCII::toUpper(code_point), -1}; 735 } 736 return toUpperDB(code_point); 737} 738 739} // namespace py