runtime/unicode-db.h at trunk · bernsteinbear.com/skybison

bernsteinbear.com / skybison
fork atom
this repo has no description
fork atom
skybison / runtime / unicode-db.h
at trunk 139 lines 4.3 kB view raw
wrap content
Max Bernstein Add license headers 4y ago
29d072a3
  1/* Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) */
  2// @generated by generate_unicode_database.py
  3#pragma once
  4
  5#include <cstdint>
  6
  7#include "globals.h"
  8#include "objects.h"
  9#include "unicode.h"
 10
 11namespace py {
 12
 13static const int kMaxNameLength = 256;
 14
 15// Longest decomposition in Unicode 11.0.0: U+FDFA
 16static const int kMaxDecomposition = 18;
 17
 18static_assert(Unicode::kAliasStart == 0xf0000,
 19              "Unicode aliases start at unexpected code point");
 20static_assert(Unicode::kAliasCount == 468,
 21              "Unexpected number of Unicode aliases");
 22static_assert(Unicode::kNamedSequenceStart == 0xf0200,
 23              "Unicode named sequences start at unexpected code point");
 24static_assert(Unicode::kNamedSequenceCount == 442,
 25              "Unexpected number of Unicode named sequences");
 26
 27enum NormalizationForm : byte {
 28  kInvalid = 0,
 29  kNFD = 0x3,
 30  kNFKD = 0xc,
 31  kNFC = 0x30,
 32  kNFKC = 0xc0,
 33};
 34
 35enum : int32_t {
 36  kAlphaMask = 0x1,
 37  kDecimalMask = 0x2,
 38  kDigitMask = 0x4,
 39  kLowerMask = 0x8,
 40  kLinebreakMask = 0x10,
 41  kSpaceMask = 0x20,
 42  kTitleMask = 0x40,
 43  kUpperMask = 0x80,
 44  kXidStartMask = 0x100,
 45  kXidContinueMask = 0x200,
 46  kPrintableMask = 0x400,
 47  kNumericMask = 0x800,
 48  kCaseIgnorableMask = 0x1000,
 49  kCasedMask = 0x2000,
 50  kExtendedCaseMask = 0x4000,
 51};
 52
 53struct UnicodeChangeRecord {
 54  const byte bidirectional;
 55  const byte category;
 56  const byte decimal;
 57  const byte east_asian_width;
 58  const byte mirrored;
 59  const double numeric;
 60};
 61
 62struct UnicodeDatabaseRecord {
 63  const byte bidirectional;
 64  const byte category;
 65  const byte combining;  // canonical combining class
 66  const byte east_asian_width;
 67  const bool mirrored;
 68  const byte quick_check;
 69};
 70
 71struct UnicodeDecomposition {
 72  const char* prefix;
 73  const int count;
 74  const int32_t* code_points;
 75};
 76
 77struct UnicodeNamedSequence {
 78  const byte length;
 79  const int32_t code_points[4];
 80};
 81
 82struct UnicodeTypeRecord {
 83  // Note: if more flag space is needed, decimal and digit could be unified
 84  const int8_t decimal;
 85  const int8_t digit;
 86  const int16_t flags;
 87  // Deltas to the character or offsets in kExtendedCase
 88  const int32_t lower;
 89  const int32_t title;
 90  const int32_t upper;
 91};
 92
 93extern const RawSmallStr kBidirectionalNames[];
 94extern const RawSmallStr kCategoryNames[];
 95extern const RawSmallStr kEastAsianWidthNames[];
 96
 97// Get a code point from its Unicode name.
 98// Returns the code point if the lookup succeeds, -1 if it fails.
 99int32_t codePointFromName(const byte* name, word size);
100int32_t codePointFromNameOrNamedSequence(const byte* name, word size);
101
// Returns the NFC composition given the NFC first and last indices.
// NOTE(review): the indices are presumably the values produced by
// findNFCFirst() and findNFCLast() -- confirm.
int32_t composeCodePoint(int32_t first, int32_t last);
104
105// Returns the decomposition mapping of the code point.
106UnicodeDecomposition decomposeCodePoint(int32_t code_point);
107
// Returns the case mapping for code points where offset is insufficient.
// NOTE(review): `index` is presumably an offset stored in a
// UnicodeTypeRecord lower/title/upper field -- confirm.
int32_t extendedCaseMapping(int32_t index);
110
// Finds the first/last character of an NFC sequence containing the code point.
// NOTE(review): the value returned when no such sequence exists is not
// documented here -- confirm against the implementation.
int32_t findNFCFirst(int32_t code_point);
int32_t findNFCLast(int32_t code_point);
114
115// Write the Unicode name for the given code point into the buffer.
116// Returns true if the name was written successfully, false otherwise.
117bool nameFromCodePoint(int32_t code_point, byte* buffer, word size);
118
// Returns the normalization of the code point in Unicode 3.2.0, if it differs
// from the current version. If the normalization is unchanged, returns -1.
int32_t normalizeOld(int32_t code_point);
122
// Returns the numeric value of the code point, or -1.0 if not numeric.
double numericValue(int32_t code_point);
125
// Returns true if the code point has one of the line break properties "BK",
// "CR", "LF", or "NL" or the bidirectional type "B". Returns false otherwise.
bool unicodeIsLinebreak(int32_t code_point);
129
// Returns true if the code point has the bidirectional type "WS", "B", or "S"
// or the category "Zs". Returns false otherwise.
bool unicodeIsWhitespace(int32_t code_point);
133
134const UnicodeChangeRecord* changeRecord(int32_t code_point);
135const UnicodeDatabaseRecord* databaseRecord(int32_t code_point);
136const UnicodeNamedSequence* namedSequence(int32_t code_point);
137const UnicodeTypeRecord* typeRecord(int32_t code_point);
138
139}  // namespace py