util/generate_unicode_database.py at trunk · bernsteinbear.com/skybison

bernsteinbear.com / skybison
fork atom
this repo has no description
fork atom
skybison / util / generate_unicode_database.py
at trunk 2057 lines 64 kB view raw
wrap content
Max Bernstein Add license headers 4y ago
29d072a3
   1#!/usr/bin/env python3
   2# Portions copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
   3"""Generate Unicode name, property, and type databases.
   4
   5This script converts Unicode 3.2 and 11.0 database files into a Pyro-specific format.
   6Run the script using Python version >= 3.7, from this directory, without args.
   7
   8The algorithm was taken from Fredrik Lundh's script for CPython.
   9"""
  10
  11import os
  12import sys
  13import zipfile
  14from dataclasses import dataclass
  15
  16
  17SCRIPT = os.path.relpath(__file__)
  18PYRO_DIR = os.path.normpath(os.path.join(SCRIPT, "..", ".."))
  19RUNTIME_DIR = os.path.join(PYRO_DIR, "runtime")
  20HEADER = "unicode-db.h"
  21HEADER_PATH = os.path.join(RUNTIME_DIR, HEADER)
  22DB_PATH = os.path.join(RUNTIME_DIR, "unicode-db.cpp")
  23LINE_WIDTH = 80
  24
  25NUM_CODE_POINTS = sys.maxunicode + 1
  26CODE_POINTS = range(NUM_CODE_POINTS)
  27
  28UNIDATA_VERSION = "11.0.0"
  29OLD_VERSION = "3.2.0"
  30
  31# UCD files
  32CASE_FOLDING = "CaseFolding.txt"
  33COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
  34DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
  35DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
  36EASTASIAN_WIDTH = "EastAsianWidth.txt"
  37LINE_BREAK = "LineBreak.txt"
  38NAME_ALIASES = "NameAliases.txt"
  39NAMED_SEQUENCES = "NamedSequences.txt"
  40SPECIAL_CASING = "SpecialCasing.txt"
  41UNICODE_DATA = "UnicodeData.txt"
  42UNIHAN = "Unihan.zip"
  43
  44# Private Use Areas -- in planes 1, 15, 16
  45PUA_1 = range(0xE000, 0xF900)
  46PUA_15 = range(0xF0000, 0xFFFFE)
  47PUA_16 = range(0x100000, 0x10FFFE)
  48
  49# we use this ranges of PUA_15 to store name aliases and named sequences
  50NAME_ALIASES_START = 0xF0000
  51NAMED_SEQUENCES_START = 0xF0200
  52
  53# Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA
  54MAX_DECOMPOSITION = 18
  55
  56CATEGORY_NAMES = [
  57    "Cn",
  58    "Lu",
  59    "Ll",
  60    "Lt",
  61    "Mn",
  62    "Mc",
  63    "Me",
  64    "Nd",
  65    "Nl",
  66    "No",
  67    "Zs",
  68    "Zl",
  69    "Zp",
  70    "Cc",
  71    "Cf",
  72    "Cs",
  73    "Co",
  74    "Cn",
  75    "Lm",
  76    "Lo",
  77    "Pc",
  78    "Pd",
  79    "Ps",
  80    "Pe",
  81    "Pi",
  82    "Pf",
  83    "Po",
  84    "Sm",
  85    "Sc",
  86    "Sk",
  87    "So",
  88]
  89
  90BIDIRECTIONAL_NAMES = [
  91    "",
  92    "L",
  93    "LRE",
  94    "LRO",
  95    "R",
  96    "AL",
  97    "RLE",
  98    "RLO",
  99    "PDF",
 100    "EN",
 101    "ES",
 102    "ET",
 103    "AN",
 104    "CS",
 105    "NSM",
 106    "BN",
 107    "B",
 108    "S",
 109    "WS",
 110    "ON",
 111    "LRI",
 112    "RLI",
 113    "FSI",
 114    "PDI",
 115]
 116
 117EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]
 118
 119MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]
 120
 121ALPHA_MASK = 0x01
 122DECIMAL_MASK = 0x02
 123DIGIT_MASK = 0x04
 124LOWER_MASK = 0x08
 125LINEBREAK_MASK = 0x10
 126SPACE_MASK = 0x20
 127TITLE_MASK = 0x40
 128UPPER_MASK = 0x80
 129XID_START_MASK = 0x100
 130XID_CONTINUE_MASK = 0x200
 131PRINTABLE_MASK = 0x400
 132NUMERIC_MASK = 0x800
 133CASE_IGNORABLE_MASK = 0x1000
 134CASED_MASK = 0x2000
 135EXTENDED_CASE_MASK = 0x4000
 136
 137
 138def open_data(filename, version):
 139    ucd_dir = os.path.join(PYRO_DIR, "third-party", f"ucd-{version}")
 140    if not os.path.exists(ucd_dir):
 141        os.mkdir(ucd_dir)
 142
 143    path = os.path.join(ucd_dir, filename)
 144    if not os.path.exists(path):
 145        if version == OLD_VERSION:
 146            # irregular url structure
 147            root, path = os.path.splitext(filename)
 148            url = f"http://www.unicode.org/Public/3.2-Update/{root}-{version}{path}"
 149        else:
 150            url = f"http://www.unicode.org/Public/{version}/ucd/{filename}"
 151        raise OSError(
 152            f"""Cannot open {path}: file does not exist.
 153Find it online at {url}"""
 154        )
 155
 156    if path.endswith(".txt"):
 157        return open(path, encoding="utf-8")
 158    else:
 159        # Unihan.zip
 160        return open(path, "rb")
 161
 162
 163########################################################################################
 164# CJK ranges
 165
 166
 167@dataclass(frozen=True)
 168class CJKRange:
 169    start: int
 170    end: int
 171
 172
 173CJK_RANGES = (
 174    CJKRange(0x3400, 0x4DB5),
 175    CJKRange(0x4E00, 0x9FEF),
 176    CJKRange(0x20000, 0x2A6D6),
 177    CJKRange(0x2A700, 0x2B734),
 178    CJKRange(0x2B740, 0x2B81D),
 179    CJKRange(0x2B820, 0x2CEA1),
 180    CJKRange(0x2CEB0, 0x2EBE0),
 181)
 182
 183
 184########################################################################################
 185# the following support code is taken from the unidb utilities
 186# Copyright (c) 1999-2000 by Secret Labs AB
 187
 188# load a unicode-data file from disk
 189
 190
 191class UnicodeData:
 192    """Loads a Unicode data file, checking disk first and then internet.
 193
 194    Taken from the unidb utilities.
 195    Copyright (c) 1999-2000 by Secret Labs AB
 196    """
 197
 198    # Record structure:
 199    # [ID, name, category, combining, bidi, decomp,  (6)
 200    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
 201    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
 202    #  derived-props] (17)
 203
 204    def __init__(self, version, check_cjk=True):  # noqa: C901
 205        self.version = version
 206
 207        self.aliases = []
 208        self.case_folding = {}
 209        self.change_records = []
 210        self.exclusions = set()
 211        self.named_sequences = []
 212        self.normalization_changes = []
 213        self.special_casing = {}
 214        self.table = [None] * NUM_CODE_POINTS
 215
 216        with open_data(UNICODE_DATA, version) as file:
 217            while True:
 218                s = file.readline()
 219                if not s:
 220                    break
 221                s = s.strip().split(";")
 222                char = int(s[0], 16)
 223                self.table[char] = s
 224
 225        # expand first-last ranges
 226        field = None
 227        cjk_ranges = []
 228        for char in CODE_POINTS:
 229            s = self.table[char]
 230            if s is not None:
 231                if s[1][-6:] == "First>":
 232                    s[1] = ""
 233                    field = s
 234                elif s[1][-5:] == "Last>":
 235                    if check_cjk and s[1].startswith("<CJK Ideograph"):
 236                        start = int(field[0], 16)
 237                        end = int(s[0], 16)
 238                        cjk_ranges.append(CJKRange(start, end))
 239                    s[1] = ""
 240                    field = None
 241            elif field:
 242                f2 = field[:]
 243                f2[0] = f"{char:X}"
 244                self.table[char] = f2
 245
 246        if check_cjk and set(cjk_ranges) != set(CJK_RANGES):
 247            raise ValueError(f"CJK ranges deviate: found {cjk_ranges!r}")
 248
 249        # check for name aliases and named sequences, see #12753
 250        # aliases and named sequences are not in 3.2.0
 251        if version != OLD_VERSION:
 252            self.aliases = []
 253            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
 254            # in order to take advantage of the compression and lookup
 255            # algorithms used for the other characters
 256            pua_index = NAME_ALIASES_START
 257            with open_data(NAME_ALIASES, version) as file:
 258                for s in file:
 259                    s = s.strip()
 260                    if not s or s.startswith("#"):
 261                        continue
 262                    char, name, abbrev = s.split(";")
 263                    char = int(char, 16)
 264                    self.aliases.append((name, char))
 265                    # also store the name in the PUA 1
 266                    self.table[pua_index][1] = name
 267                    pua_index += 1
 268            assert pua_index - NAME_ALIASES_START == len(self.aliases)
 269
 270            self.named_sequences = []
 271            # store named sequences in the PUA 1, in range U+F0100..,
 272            # in order to take advantage of the compression and lookup
 273            # algorithms used for the other characters.
 274
 275            assert pua_index < NAMED_SEQUENCES_START
 276            pua_index = NAMED_SEQUENCES_START
 277            with open_data(NAMED_SEQUENCES, version) as file:
 278                for s in file:
 279                    s = s.strip()
 280                    if not s or s.startswith("#"):
 281                        continue
 282                    name, chars = s.split(";")
 283                    chars = tuple(int(char, 16) for char in chars.split())
 284                    # check that the structure defined in write_header is OK
 285                    assert len(chars) <= 4, "change the NamedSequence array size"
 286                    assert all(c < NUM_CODE_POINTS for c in chars)
 287                    self.named_sequences.append(NamedSequence(name, chars))
 288                    # also store these in the PUA 1
 289                    self.table[pua_index][1] = name
 290                    pua_index += 1
 291            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
 292
 293        self.exclusions = set()
 294        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
 295            for s in file:
 296                s = s.strip()
 297                if s and not s.startswith("#"):
 298                    char = int(s.split()[0], 16)
 299                    self.exclusions.add(char)
 300
 301        widths = [None] * NUM_CODE_POINTS
 302        with open_data(EASTASIAN_WIDTH, version) as file:
 303            for s in file:
 304                s = s.strip()
 305                if not s:
 306                    continue
 307                if s[0] == "#":
 308                    continue
 309                s = s.split()[0].split(";")
 310                if ".." in s[0]:
 311                    first, last = [int(c, 16) for c in s[0].split("..")]
 312                    chars = list(range(first, last + 1))
 313                else:
 314                    chars = [int(s[0], 16)]
 315                for char in chars:
 316                    widths[char] = s[1]
 317
 318        for char in CODE_POINTS:
 319            if self.table[char] is not None:
 320                self.table[char].append(widths[char])
 321                self.table[char].append(set())
 322
 323        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
 324            for s in file:
 325                s = s.split("#", 1)[0].strip()
 326                if not s:
 327                    continue
 328
 329                r, p = s.split(";")
 330                r = r.strip()
 331                p = p.strip()
 332                if ".." in r:
 333                    first, last = [int(c, 16) for c in r.split("..")]
 334                    chars = list(range(first, last + 1))
 335                else:
 336                    chars = [int(r, 16)]
 337                for char in chars:
 338                    if self.table[char]:
 339                        # Some properties (e.g. Default_Ignorable_Code_Point)
 340                        # apply to unassigned code points; ignore them
 341                        self.table[char][-1].add(p)
 342
 343        with open_data(LINE_BREAK, version) as file:
 344            for s in file:
 345                s = s.partition("#")[0]
 346                s = [i.strip() for i in s.split(";")]
 347                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
 348                    continue
 349                if ".." not in s[0]:
 350                    first = last = int(s[0], 16)
 351                else:
 352                    first, last = [int(c, 16) for c in s[0].split("..")]
 353                for char in range(first, last + 1):
 354                    self.table[char][-1].add("Line_Break")
 355
 356        # We only want the quickcheck properties
 357        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
 358        # Yes is the default, hence only N and M occur
 359        # In 3.2.0, the format was different (NF?_NO)
 360        # The parsing will incorrectly determine these as
 361        # "yes", however, unicodedata.c will not perform quickchecks
 362        # for older versions, and no delta records will be created.
 363        quickchecks = [0] * NUM_CODE_POINTS
 364        qc_order = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split()
 365        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
 366            for s in file:
 367                if "#" in s:
 368                    s = s[: s.index("#")]
 369                s = [i.strip() for i in s.split(";")]
 370                if len(s) < 2 or s[1] not in qc_order:
 371                    continue
 372                quickcheck = "MN".index(s[2]) + 1  # Maybe or No
 373                quickcheck_shift = qc_order.index(s[1]) * 2
 374                quickcheck <<= quickcheck_shift
 375                if ".." not in s[0]:
 376                    first = last = int(s[0], 16)
 377                else:
 378                    first, last = [int(c, 16) for c in s[0].split("..")]
 379                for char in range(first, last + 1):
 380                    assert not (quickchecks[char] >> quickcheck_shift) & 3
 381                    quickchecks[char] |= quickcheck
 382
 383        with open_data(UNIHAN, version) as file:
 384            zip = zipfile.ZipFile(file)
 385            if version == OLD_VERSION:
 386                data = zip.open(f"Unihan-{OLD_VERSION}.txt").read()
 387            else:
 388                data = zip.open("Unihan_NumericValues.txt").read()
 389        for line in data.decode("utf-8").splitlines():
 390            if not line.startswith("U+"):
 391                continue
 392            code, tag, value = line.split(None, 3)[:3]
 393            if tag not in ("kAccountingNumeric", "kPrimaryNumeric", "kOtherNumeric"):
 394                continue
 395            value = value.strip().replace(",", "")
 396            i = int(code[2:], 16)
 397            # Patch the numeric field
 398            if self.table[i] is not None:
 399                self.table[i][8] = value
 400        sc = self.special_casing = {}
 401        with open_data(SPECIAL_CASING, version) as file:
 402            for s in file:
 403                s = s[:-1].split("#", 1)[0]
 404                if not s:
 405                    continue
 406                data = s.split("; ")
 407                if data[4]:
 408                    # We ignore all conditionals (since they depend on
 409                    # languages) except for one, which is hardcoded. See
 410                    # handle_capital_sigma in unicodeobject.c.
 411                    continue
 412                c = int(data[0], 16)
 413                lower = [int(char, 16) for char in data[1].split()]
 414                title = [int(char, 16) for char in data[2].split()]
 415                upper = [int(char, 16) for char in data[3].split()]
 416                sc[c] = (lower, title, upper)
 417        cf = self.case_folding = {}
 418        if version != OLD_VERSION:
 419            with open_data(CASE_FOLDING, version) as file:
 420                for s in file:
 421                    s = s[:-1].split("#", 1)[0]
 422                    if not s:
 423                        continue
 424                    data = s.split("; ")
 425                    if data[1] in "CF":
 426                        c = int(data[0], 16)
 427                        cf[c] = [int(char, 16) for char in data[2].split()]
 428
 429        for char in CODE_POINTS:
 430            if self.table[char] is not None:
 431                self.table[char].append(quickchecks[char])
 432
 433    def merge(self, old):  # noqa: C901
 434        # Changes to exclusion file not implemented yet
 435        if old.exclusions != self.exclusions:
 436            raise NotImplementedError("exclusions differ")
 437
 438        # In these change records, 0xFF means "no change"
 439        bidir_changes = [0xFF] * NUM_CODE_POINTS
 440        category_changes = [0xFF] * NUM_CODE_POINTS
 441        decimal_changes = [0xFF] * NUM_CODE_POINTS
 442        east_asian_width_changes = [0xFF] * NUM_CODE_POINTS
 443        mirrored_changes = [0xFF] * NUM_CODE_POINTS
 444        # In numeric data, 0 means "no change",
 445        # -1 means "did not have a numeric value
 446        numeric_changes = [0.0] * NUM_CODE_POINTS
 447        # normalization_changes is a list of key-value pairs
 448        normalization_changes = []
 449        for char in CODE_POINTS:
 450            if self.table[char] is None:
 451                # Characters unassigned in the new version ought to
 452                # be unassigned in the old one
 453                assert old.table[char] is None
 454                continue
 455            # check characters unassigned in the old version
 456            if old.table[char] is None:
 457                # category 0 is "unassigned"
 458                category_changes[char] = 0
 459                continue
 460            # check characters that differ
 461            if old.table[char] != self.table[char]:
 462                for k in range(len(old.table[char])):
 463                    if old.table[char][k] != self.table[char][k]:
 464                        value = old.table[char][k]
 465                        if k == 1 and char in PUA_15:
 466                            # the name is not set in the old.table, but in the
 467                            # self.table we are using it for aliases and named seq
 468                            assert value == ""
 469                        elif k == 2:
 470                            category_changes[char] = CATEGORY_NAMES.index(value)
 471                        elif k == 4:
 472                            bidir_changes[char] = BIDIRECTIONAL_NAMES.index(value)
 473                        elif k == 5:
 474                            # We assume all normalization changes are in 1:1 mappings
 475                            assert " " not in value
 476                            normalization_changes.append((char, value))
 477                        elif k == 6:
 478                            # only support changes where the old value is a single digit
 479                            assert value in "0123456789"
 480                            decimal_changes[char] = int(value)
 481                        elif k == 8:
 482                            # Since 0 encodes "no change", the old value is better not 0
 483                            if not value:
 484                                numeric_changes[char] = -1.0
 485                            else:
 486                                numeric_changes[char] = float(value)
 487                                assert numeric_changes[char] not in (0, -1)
 488                        elif k == 9:
 489                            if value == "Y":
 490                                mirrored_changes[char] = 1
 491                            else:
 492                                mirrored_changes[char] = 0
 493                        elif k == 11:
 494                            # change to ISO comment, ignore
 495                            pass
 496                        elif k == 12:
 497                            # change to simple uppercase mapping; ignore
 498                            pass
 499                        elif k == 13:
 500                            # change to simple lowercase mapping; ignore
 501                            pass
 502                        elif k == 14:
 503                            # change to simple titlecase mapping; ignore
 504                            pass
 505                        elif k == 15:
 506                            # change to east asian width
 507                            east_asian_width_changes[char] = EASTASIANWIDTH_NAMES.index(
 508                                value
 509                            )
 510                        elif k == 16:
 511                            # derived property changes; not yet
 512                            pass
 513                        elif k == 17:
 514                            # normalization quickchecks are not performed
 515                            # for older versions
 516                            pass
 517                        else:
 518
 519                            class Difference(Exception):
 520                                pass
 521
 522                            raise Difference(
 523                                hex(char), k, old.table[char], self.table[char]
 524                            )
 525
 526        self.change_records = zip(
 527            bidir_changes,
 528            category_changes,
 529            decimal_changes,
 530            east_asian_width_changes,
 531            mirrored_changes,
 532            numeric_changes,
 533        )
 534        self.normalization_changes = normalization_changes
 535
 536
 537########################################################################################
 538# Helper functions & classes
 539
 540
 541def getsize(data):
 542    """Return the smallest possible integer size for the given array."""
 543    maxdata = max(data)
 544    if maxdata < (1 << 8):
 545        return 1
 546    if maxdata < (1 << 16):
 547        return 2
 548    return 4
 549
 550
 551def splitbins(t, trace=False):  # noqa: C901
 552    """t, trace=False -> (t1, t2, shift).  Split a table to save space.
 553
 554    t is a sequence of ints.  This function can be useful to save space if
 555    many of the ints are the same.  t1 and t2 are lists of ints, and shift
 556    is an int, chosen to minimize the combined size of t1 and t2 (in C
 557    code), and where for each i in range(len(t)),
 558        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
 559    where mask is a bitmask isolating the last "shift" bits.
 560
 561    If optional arg trace is non-zero (default zero), progress info
 562    is printed to sys.stderr.  The higher the value, the more info
 563    you'll get.
 564    """
 565
 566    if trace:
 567
 568        def dump(t1, t2, shift, bytes):
 569            sys.stderr.write(
 570                f"{len(t1)}+{len(t2)} bins at shift {shift}; {bytes} bytes\n"
 571            )
 572
 573        sys.stderr.write(f"Size of original table: {len(t) * getsize(t)} bytes\n")
 574
 575    n = len(t) - 1  # last valid index
 576    maxshift = 0  # the most we can shift n and still have something left
 577    if n > 0:
 578        while n >> 1:
 579            n >>= 1
 580            maxshift += 1
 581    del n
 582    bytes = sys.maxsize  # smallest total size so far
 583    t = tuple(t)  # so slices can be dict keys
 584    for shift in range(maxshift + 1):
 585        t1 = []
 586        t2 = []
 587        size = 2 ** shift
 588        bincache = {}
 589        for i in range(0, len(t), size):
 590            bin = t[i : i + size]
 591            index = bincache.get(bin)
 592            if index is None:
 593                index = len(t2)
 594                bincache[bin] = index
 595                t2.extend(bin)
 596            t1.append(index >> shift)
 597        # determine memory size
 598        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
 599        if trace:
 600            dump(t1, t2, shift, b)
 601        if b < bytes:
 602            best = t1, t2, shift
 603            bytes = b
 604    t1, t2, shift = best
 605    if trace:
 606        sys.stderr.write(f"Best: ")
 607        dump(t1, t2, shift, bytes)
 608    if __debug__:
 609        # exhaustively verify that the decomposition is correct
 610        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
 611        for i in range(len(t)):
 612            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
 613    return best
 614
 615
 616@dataclass
 617class WordData:
 618    word: str
 619    index: int
 620    frequency: int
 621
 622
 623class Words:
 624    def __init__(self, names, trace=False):
 625        self.indices = {}
 626        self.frequencies = {}
 627
 628        count = num_chars = num_words = 0
 629        for char in CODE_POINTS:
 630            name = names[char]
 631            if name is None:
 632                continue
 633
 634            words = name.split()
 635            num_chars += len(name)
 636            num_words += len(words)
 637            for word in words:
 638                if word in self.indices:
 639                    self.frequencies[word] += 1
 640                else:
 641                    self.frequencies[word] = 1
 642                    self.indices[word] = count
 643                    count += 1
 644
 645        if trace:
 646            print(f"{num_words} words in text; {num_chars} code points")
 647
 648    def tolist(self):
 649        return sorted(
 650            (
 651                WordData(word, index, self.frequencies[word])
 652                for word, index in self.indices.items()
 653            ),
 654            key=lambda data: (-data.frequency, data.word),
 655        )
 656
 657
 658########################################################################################
 659# Classes for database tables
 660
 661
 662@dataclass(frozen=True)
 663class ChangeRecord:
 664    bidirectional: int
 665    category: int
 666    decimal: int
 667    east_asian_width: int
 668    mirrored: int
 669    numeric: float
 670
 671    def __str__(self):
 672        return f"\
 673{{{self.bidirectional:#04x}, {self.category:#04x}, {self.decimal:#04x}, \
 674{self.east_asian_width:#04x}, {self.mirrored:#04x}, {self.numeric}}}"
 675
 676
 677@dataclass(frozen=True)
 678class DatabaseRecord:
 679    bidirectional: int
 680    category: int
 681    combining: int
 682    east_asian_width: int
 683    mirrored: bool
 684    quick_check: int
 685
 686    def __str__(self):
 687        mirrored = "true" if self.mirrored else "false"
 688        return f"{{{self.bidirectional:#04x}, {self.category:#04x}, \
 689{self.combining:#04x}, {self.east_asian_width:#04x}, {mirrored}, \
 690{self.quick_check:#04x}}}"
 691
 692
 693@dataclass(frozen=True)
 694class NamedSequence:
 695    name: int
 696    seq: int
 697
 698    def __str__(self):
 699        seq_str = ", ".join(f"{elt:#04x}" for elt in self.seq)
 700        return f"{{{len(self.seq)}, {{{seq_str}}}}}"
 701
 702
 703@dataclass(frozen=True)
 704class Reindex:
 705    start: int
 706    count: int
 707    index: int
 708
 709    def __str__(self):
 710        return f"{{{self.start:#010x}, {self.count}, {self.index}}}"
 711
 712
 713@dataclass(frozen=True)
 714class TypeRecord:
 715    decimal: int
 716    digit: int
 717    flags: int
 718    lower: int
 719    title: int
 720    upper: int
 721
 722    def __str__(self):
 723        return f"{{{self.decimal}, {self.digit}, {self.flags:#06x}, \
 724{self.lower:#x}, {self.title:#x}, {self.upper:#x}}}"
 725
 726
 727class CodePointArray:
 728    """Arrays of code points, printed as hex."""
 729
 730    def __init__(self, name, data):
 731        self.name = name
 732        self.data = data
 733
 734    def dump(self, file, trace=False):
 735        if trace:
 736            sys.stderr.write(f"{self.name}: {4 * len(self.data)} bytes\n")
 737
 738        file.write(f"\nstatic const int32_t {self.name}[] = {{")
 739        columns = (LINE_WIDTH - 3) // 12
 740        for i in range(0, len(self.data), columns):
 741            line = "\n   "
 742            for item in self.data[i : i + columns]:
 743                line = f"{line} {item:#010x},"
 744            file.write(line)
 745        file.write("\n};\n")
 746
 747
 748class SmallStrArray:
 749    def __init__(self, name, data, comment=None):
 750        self.name = name
 751        self.data = data
 752        self.comment = comment
 753
 754    def dump(self, file, trace=False):
 755        if self.comment is not None:
 756            file.write(f"\n// {self.comment}")
 757        file.write(f"\nconst RawSmallStr {self.name}[] = {{")
 758
 759        if self.data:
 760            file.write("\n")
 761            for item in self.data:
 762                file.write(f'    RawSmallStr::fromCStr("{item}"),\n')
 763
 764        file.write("};\n")
 765
 766
 767class StructArray:
 768    """Store an array for writing to a generated C/C++ file."""
 769
 770    def __init__(self, ctype, name, data, comment=None):
 771        self.type = ctype
 772        self.name = name
 773        self.data = data
 774        self.comment = comment
 775
 776    def dump(self, file, trace=False):
 777        if self.comment is not None:
 778            file.write(f"\n// {self.comment}")
 779        file.write(f"\nstatic const {self.type} {self.name}[] = {{")
 780
 781        if self.data:
 782            file.write("\n")
 783            for item in self.data:
 784                file.write(f"    {item},\n")
 785
 786        file.write("};\n")
 787
 788
 789class UIntArray:
 790    """Unsigned integer arrays, printed as hex."""
 791
 792    def __init__(self, name, data):
 793        self.name = name
 794        self.data = data
 795        self.size = getsize(data)
 796
 797    def dump(self, file, trace=False):
 798        if trace:
 799            sys.stderr.write(f"{self.name}: {self.size * len(self.data)} bytes\n")
 800
 801        file.write(f"\nstatic const uint{8 * self.size}_t {self.name}[] = {{")
 802        columns = (LINE_WIDTH - 3) // (2 * self.size + 4)
 803        for i in range(0, len(self.data), columns):
 804            line = "\n   "
 805            for item in self.data[i : i + columns]:
 806                if self.size == 1:
 807                    line = f"{line} {item:#04x},"
 808                elif self.size == 2:
 809                    line = f"{line} {item:#06x},"
 810                else:
 811                    line = f"{line} {item:#010x},"
 812            file.write(line)
 813        file.write("\n};\n")
 814
 815
 816########################################################################################
 817# Reimplementation of dictionaries using static structures and a custom string hash
 818
 819
 820# Number chosen to minimize the number of collisions. CPython uses 47.
 821MAGIC = 43
 822
 823
 824def myhash(s):
 825    h = 0
 826    for c in map(ord, s.upper()):
 827        h = (h * MAGIC) + c
 828        if h > 0x00FFFFFF:
 829            h = (h & 0x00FFFFFF) ^ ((h & 0xFF000000) >> 24)
 830    return h
 831
 832
 833SIZES = [
 834    (4, 3),
 835    (8, 3),
 836    (16, 3),
 837    (32, 5),
 838    (64, 3),
 839    (128, 3),
 840    (256, 29),
 841    (512, 17),
 842    (1024, 9),
 843    (2048, 5),
 844    (4096, 83),
 845    (8192, 27),
 846    (16384, 43),
 847    (32768, 3),
 848    (65536, 45),
 849    (131072, 9),
 850    (262144, 39),
 851    (524288, 39),
 852    (1048576, 9),
 853    (2097152, 5),
 854    (4194304, 3),
 855    (8388608, 33),
 856    (16777216, 27),
 857]
 858
 859
 860class Hash:
 861    def __init__(self, data):
 862        """Turn a (key, value) list into a static hash table structure."""
 863
 864        # determine table size
 865        for size, poly in SIZES:
 866            if size > len(data):
 867                self.poly = poly + size
 868                self.size = size
 869                break
 870        else:
 871            raise AssertionError("ran out of polynomials")
 872
 873        print(self.size, "slots in hash table")
 874        table = [None] * self.size
 875        mask = self.size - 1
 876
 877        # initialize hash table
 878        collisions = 0
 879        for key, value in data:
 880            h = myhash(key)
 881            i = (~h) & mask
 882            if table[i] is None:
 883                table[i] = value
 884                continue
 885            incr = (h ^ (h >> 3)) & mask
 886            if not incr:
 887                incr = mask
 888            while True:
 889                collisions += 1
 890                i = (i + incr) & mask
 891                if table[i] is None:
 892                    table[i] = value
 893                    break
 894                incr = incr << 1
 895                if incr > mask:
 896                    incr = incr ^ self.poly
 897        print(collisions, "collisions")
 898
 899        for i in range(len(table)):
 900            if table[i] is None:
 901                table[i] = 0
 902
 903        self.data = CodePointArray("kCodeHash", table)
 904
 905    def dump(self, file, trace):
 906        self.data.dump(file, trace)
 907        file.write(
 908            f"""
 909static const int32_t kCodeMagic = {MAGIC};
 910static const int32_t kCodeMask = {self.size - 1};
 911static const int32_t kCodePoly = {self.poly};
 912"""
 913        )
 914
 915
 916########################################################################################
 917# Unicode database writers
 918
 919
 920def write_header(unicode, header):
 921    """Writes type declarations to the database header file."""
 922
 923    header.write("// @")
 924    header.write(
 925        f"""\
 926generated by {os.path.basename(SCRIPT)}
 927#pragma once
 928
 929#include <cstdint>
 930
 931#include "globals.h"
 932#include "objects.h"
 933#include "unicode.h"
 934
 935namespace py {{
 936
 937static const int kMaxNameLength = 256;
 938
 939// Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA
 940static const int kMaxDecomposition = {MAX_DECOMPOSITION};
 941
 942static_assert(Unicode::kAliasStart == {NAME_ALIASES_START:#x},
 943              "Unicode aliases start at unexpected code point");
 944static_assert(Unicode::kAliasCount == {len(unicode.aliases)},
 945              "Unexpected number of Unicode aliases");
 946static_assert(Unicode::kNamedSequenceStart == {NAMED_SEQUENCES_START:#x},
 947              "Unicode named sequences start at unexpected code point");
 948static_assert(Unicode::kNamedSequenceCount == {len(unicode.named_sequences)},
 949              "Unexpected number of Unicode named sequences");
 950
 951enum NormalizationForm : byte {{
 952  kInvalid = 0,
 953  kNFD = 0x3,
 954  kNFKD = 0xc,
 955  kNFC = 0x30,
 956  kNFKC = 0xc0,
 957}};
 958
 959enum : int32_t {{
 960  kAlphaMask = {ALPHA_MASK:#x},
 961  kDecimalMask = {DECIMAL_MASK:#x},
 962  kDigitMask = {DIGIT_MASK:#x},
 963  kLowerMask = {LOWER_MASK:#x},
 964  kLinebreakMask = {LINEBREAK_MASK:#x},
 965  kSpaceMask = {SPACE_MASK:#x},
 966  kTitleMask = {TITLE_MASK:#x},
 967  kUpperMask = {UPPER_MASK:#x},
 968  kXidStartMask = {XID_START_MASK:#x},
 969  kXidContinueMask = {XID_CONTINUE_MASK:#x},
 970  kPrintableMask = {PRINTABLE_MASK:#x},
 971  kNumericMask = {NUMERIC_MASK:#x},
 972  kCaseIgnorableMask = {CASE_IGNORABLE_MASK:#x},
 973  kCasedMask = {CASED_MASK:#x},
 974  kExtendedCaseMask = {EXTENDED_CASE_MASK:#x},
 975}};
 976
 977struct UnicodeChangeRecord {{
 978  const byte bidirectional;
 979  const byte category;
 980  const byte decimal;
 981  const byte east_asian_width;
 982  const byte mirrored;
 983  const double numeric;
 984}};
 985
 986struct UnicodeDatabaseRecord {{
 987  const byte bidirectional;
 988  const byte category;
 989  const byte combining;  // canonical combining class
 990  const byte east_asian_width;
 991  const bool mirrored;
 992  const byte quick_check;
 993}};
 994
 995struct UnicodeDecomposition {{
 996  const char* prefix;
 997  const int count;
 998  const int32_t* code_points;
 999}};
1000
1001struct UnicodeNamedSequence {{
1002  const byte length;
1003  const int32_t code_points[4];
1004}};
1005
1006struct UnicodeTypeRecord {{
1007  // Note: if more flag space is needed, decimal and digit could be unified
1008  const int8_t decimal;
1009  const int8_t digit;
1010  const int16_t flags;
1011  // Deltas to the character or offsets in kExtendedCase
1012  const int32_t lower;
1013  const int32_t title;
1014  const int32_t upper;
1015}};
1016
1017extern const RawSmallStr kBidirectionalNames[];
1018extern const RawSmallStr kCategoryNames[];
1019extern const RawSmallStr kEastAsianWidthNames[];
1020
1021// Get a code point from its Unicode name.
1022// Returns the code point if the lookup succeeds, -1 if it fails.
1023int32_t codePointFromName(const byte* name, word size);
1024int32_t codePointFromNameOrNamedSequence(const byte* name, word size);
1025
1026// Returns the NFC composition given the NFC first and last indices.
1027int32_t composeCodePoint(int32_t first, int32_t last);
1028
1029// Returns the decomposition mapping of the code point.
1030UnicodeDecomposition decomposeCodePoint(int32_t code_point);
1031
1032// Returns the case mapping for code points where offset is insufficient
1033int32_t extendedCaseMapping(int32_t index);
1034
1035// Finds the first/last character of an NFC sequence containing the code point.
1036int32_t findNFCFirst(int32_t code_point);
1037int32_t findNFCLast(int32_t code_point);
1038
1039// Write the Unicode name for the given code point into the buffer.
1040// Returns true if the name was written successfully, false otherwise.
1041bool nameFromCodePoint(int32_t code_point, byte* buffer, word size);
1042
1043// Returns the normalization of the code point in Unicode {OLD_VERSION}, if it differs
1044// from the current version. If the normalization is unchanged, returns -1.
1045int32_t normalizeOld(int32_t code_point);
1046
1047// Returns the numeric value of the code point, or -1.0 if not numeric.
1048double numericValue(int32_t code_point);
1049
1050// Returns true if the code point has one of the line break properties "BK",
1051// "CR", "LR", or "NL" or the bidirectional type "B". Returns false otherwise.
1052bool unicodeIsLinebreak(int32_t code_point);
1053
1054// Returns true if the code point has the bidirectional type "WS", "B", or "S"
1055// or the category "Zs". Returns false otherwise.
1056bool unicodeIsWhitespace(int32_t code_point);
1057
1058const UnicodeChangeRecord* changeRecord(int32_t code_point);
1059const UnicodeDatabaseRecord* databaseRecord(int32_t code_point);
1060const UnicodeNamedSequence* namedSequence(int32_t code_point);
1061const UnicodeTypeRecord* typeRecord(int32_t code_point);
1062
1063}}  // namespace py
1064"""
1065    )
1066
1067
1068def write_db_prelude(db):
1069    """Writes required include directives to the database file."""
1070
1071    db.write("// @")
1072    db.write(
1073        f"""\
1074generated by {os.path.basename(SCRIPT)}
1075#include "{HEADER}"
1076
1077#include <cinttypes>
1078#include <cstdio>
1079#include <cstring>
1080
1081#include "asserts.h"
1082#include "globals.h"
1083#include "objects.h"
1084#include "unicode.h"
1085
1086namespace py {{
1087"""
1088    )
1089
1090
1091def write_database_records(unicode, db, trace):  # noqa: C901
1092    dummy = DatabaseRecord(0, 0, 0, 0, False, 0)
1093    table = [dummy]
1094    cache = {dummy: 0}
1095    index = [0] * NUM_CODE_POINTS
1096
1097    decomp_data = [0]
1098    decomp_prefix = [""]
1099    decomp_index = [0] * NUM_CODE_POINTS
1100    decomp_size = 0
1101
1102    comp_pairs = []
1103    comp_first = [None] * NUM_CODE_POINTS
1104    comp_last = [None] * NUM_CODE_POINTS
1105
1106    for char in CODE_POINTS:
1107        record = unicode.table[char]
1108        if record:
1109            # database properties
1110            item = DatabaseRecord(
1111                BIDIRECTIONAL_NAMES.index(record[4]),
1112                CATEGORY_NAMES.index(record[2]),
1113                int(record[3]),
1114                EASTASIANWIDTH_NAMES.index(record[15]),
1115                record[9] == "Y",
1116                record[17],
1117            )
1118
1119            idx = cache.get(item)
1120            if idx is None:
1121                cache[item] = idx = len(table)
1122                table.append(item)
1123            index[char] = idx
1124
1125            # decomposition data
1126            decomp_idx = 0
1127            if record[5]:
1128                decomp = record[5].split()
1129                prefix = decomp.pop(0) if decomp[0][0] == "<" else ""
1130                if len(decomp) > MAX_DECOMPOSITION:
1131                    raise Exception(
1132                        f"decomposition of code point {char:#x} is too large"
1133                    )
1134
1135                try:
1136                    idx = decomp_prefix.index(prefix)
1137                except ValueError:
1138                    idx = len(decomp_prefix)
1139                    decomp_prefix.append(prefix)
1140                assert idx < 256
1141
1142                decomp = [idx + (len(decomp) << 8)] + [int(s, 16) for s in decomp]
1143                if (
1144                    not idx
1145                    and len(decomp) == 3
1146                    and char not in unicode.exclusions
1147                    and unicode.table[decomp[1]][3] == "0"
1148                ):
1149                    _, l, r = decomp
1150                    comp_first[l] = 1
1151                    comp_last[r] = 1
1152                    comp_pairs.append((l, r, char))
1153                try:
1154                    decomp_idx = decomp_data.index(decomp)
1155                except ValueError:
1156                    decomp_idx = len(decomp_data)
1157                    decomp_data.extend(decomp)
1158                    decomp_size = decomp_size + len(decomp) * 2
1159            decomp_index[char] = decomp_idx
1160
1161    first = last = 0
1162    comp_first_ranges = []
1163    comp_last_ranges = []
1164    prev_first = prev_last = None
1165    for ch in CODE_POINTS:
1166        if comp_first[ch] is not None:
1167            comp_first[ch] = first
1168            first += 1
1169            if prev_first is None:
1170                prev_first = (ch, ch)
1171            elif prev_first[1] + 1 == ch:
1172                prev_first = prev_first[0], ch
1173            else:
1174                comp_first_ranges.append(prev_first)
1175                prev_first = (ch, ch)
1176        if comp_last[ch] is not None:
1177            comp_last[ch] = last
1178            last += 1
1179            if prev_last is None:
1180                prev_last = (ch, ch)
1181            elif prev_last[1] + 1 == ch:
1182                prev_last = prev_last[0], ch
1183            else:
1184                comp_last_ranges.append(prev_last)
1185                prev_last = (ch, ch)
1186    comp_first_ranges.append(prev_first)
1187    comp_last_ranges.append(prev_last)
1188    total_first = first
1189    total_last = last
1190
1191    comp_data = [0] * (total_first * total_last)
1192    for first, last, char in comp_pairs:
1193        first = comp_first[first]
1194        last = comp_last[last]
1195        comp_data[first * total_last + last] = char
1196
1197    if trace:
1198        print(len(table), "unique properties")
1199        print(len(decomp_prefix), "unique decomposition prefixes")
1200        print(len(decomp_data), "unique decomposition entries:", end=" ")
1201        print(decomp_size, "bytes")
1202        print(total_first, "first characters in NFC")
1203        print(total_last, "last characters in NFC")
1204        print(len(comp_pairs), "NFC pairs")
1205
1206    StructArray(
1207        "UnicodeDatabaseRecord",
1208        "kDatabaseRecords",
1209        table,
1210        "a list of unique database records",
1211    ).dump(db, trace)
1212
1213    # split record index table
1214    index1, index2, shift = splitbins(index, trace)
1215    db.write(
1216        f"""
1217// type indices
1218static const int kDatabaseIndexShift = {shift};
1219static const int32_t kDatabaseIndexMask =
1220    (int32_t{{1}} << kDatabaseIndexShift) - 1;
1221"""
1222    )
1223    UIntArray("kDatabaseIndex1", index1).dump(db, trace)
1224    UIntArray("kDatabaseIndex2", index2).dump(db, trace)
1225
1226    db.write(
1227        f"""
1228// Reindexing of NFC first and last characters
1229struct Reindex {{
1230  const int32_t start;
1231  const short count;
1232  const short index;
1233}};
1234
1235static const int kTotalLast = {total_last};
1236"""
1237    )
1238
1239    nfc_first = [
1240        Reindex(start, end - start, comp_first[start])
1241        for start, end in comp_first_ranges
1242    ] + [Reindex(0, 0, 0)]
1243    nfc_last = [
1244        Reindex(start, end - start, comp_last[start]) for start, end in comp_last_ranges
1245    ] + [Reindex(0, 0, 0)]
1246    StructArray("Reindex", "kNFCFirst", nfc_first).dump(db, trace)
1247    StructArray("Reindex", "kNFCLast", nfc_last).dump(db, trace)
1248
1249    # split decomposition index table
1250    index1, index2, shift = splitbins(decomp_index, trace)
1251
1252    db.write(
1253        f"""
1254// decomposition mappings
1255static const int kDecompShift = {shift};
1256static const int32_t kDecompMask = (int32_t{{1}} << kDecompShift) - 1;
1257
1258const char* kDecompPrefix[] = {{
1259"""
1260    )
1261    for name in decomp_prefix:
1262        db.write(f'    "{name}",\n')
1263    db.write("};\n")
1264
1265    CodePointArray("kDecompData", decomp_data).dump(db, trace)
1266    UIntArray("kDecompIndex1", index1).dump(db, trace)
1267    UIntArray("kDecompIndex2", index2).dump(db, trace)
1268
1269    index1, index2, shift = splitbins(comp_data, trace)
1270    db.write(
1271        f"""
1272// NFC pairs
1273static const int kCompShift = {shift};
1274static const int32_t kCompMask = (int32_t{{1}} << kCompShift) - 1;
1275"""
1276    )
1277    UIntArray("kCompIndex", index1).dump(db, trace)
1278    UIntArray("kCompData", index2).dump(db, trace)
1279
1280
1281def write_name_data(unicode, db, trace):  # noqa: C901
1282    # Collect names
1283    names = [None] * NUM_CODE_POINTS
1284    for char in CODE_POINTS:
1285        record = unicode.table[char]
1286        if record:
1287            name = record[1].strip()
1288            if name and name[0] != "<":
1289                names[char] = f"{name}\0"
1290
1291    if trace:
1292        print(len([n for n in names if n is not None]), "distinct names")
1293
1294    # Collect unique words from names. Note that we differ between
1295    # words ending a sentence, which include the trailing null byte,
1296    # and words inside a sentence, which do not.
1297    words = Words(names, trace)
1298    wordlist = words.tolist()
1299
1300    # Figure out how many phrasebook escapes we need
1301    short = (65536 - len(wordlist)) // 256
1302    assert short > 0
1303
1304    if trace:
1305        print(short, "short indexes in lexicon")
1306        n = sum(wordlist[i].frequency for i in range(short))
1307        print(n, "short indexes in phrasebook")
1308
1309    # Pick the most commonly used words, and sort the rest by falling
1310    # length to maximize overlap.
1311    tail = wordlist[short:]
1312    tail.sort(key=lambda data: len(data.word), reverse=True)
1313    wordlist[short:] = tail
1314
1315    # Build a lexicon string from words
1316    current_offset = 0
1317    lexicon_offset = [0]
1318    lexicon_string = ""
1319    word_offsets = {}
1320    for data in wordlist:
1321        # Encoding: bit 7 indicates last character in word
1322        # chr(128) indicates the last character in an entire string)
1323        last = ord(data.word[-1])
1324        encoded = f"{data.word[:-1]}{chr(last + 128)}"
1325
1326        # reuse string tails, when possible
1327        offset = lexicon_string.find(encoded)
1328        if offset < 0:
1329            offset = current_offset
1330            lexicon_string = lexicon_string + encoded
1331            current_offset += len(encoded)
1332
1333        word_offsets[data.word] = len(lexicon_offset)
1334        lexicon_offset.append(offset)
1335
1336    lexicon = [ord(ch) for ch in lexicon_string]
1337
1338    # generate phrasebook from names and lexicon
1339    phrasebook = [0]
1340    phrasebook_offset = [0] * NUM_CODE_POINTS
1341    for char in CODE_POINTS:
1342        name = names[char]
1343        if name is not None:
1344            words = name.split()
1345            phrasebook_offset[char] = len(phrasebook)
1346            for word in words:
1347                offset = word_offsets[word]
1348                if offset < short:
1349                    phrasebook.append(offset)
1350                else:
1351                    # store as two bytes
1352                    phrasebook.append((offset >> 8) + short)
1353                    phrasebook.append(offset & 255)
1354
1355    assert getsize(phrasebook) == 1
1356
1357    db.write(f"\n// lexicon")
1358    UIntArray("kLexicon", lexicon).dump(db, trace)
1359    UIntArray("kLexiconOffset", lexicon_offset).dump(db, trace)
1360
1361    # split phrasebook index table
1362    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
1363
1364    db.write(
1365        f"""
1366// code => name phrasebook
1367static const int kPhrasebookShift = {shift};
1368static const int kPhrasebookShort = {short};
1369static const int kPhrasebookMask = (1 << kPhrasebookShift) - 1;
1370"""
1371    )
1372    UIntArray("kPhrasebook", phrasebook).dump(db, trace)
1373    UIntArray("kPhrasebookOffset1", offset1).dump(db, trace)
1374    UIntArray("kPhrasebookOffset2", offset2).dump(db, trace)
1375
1376    # Extract names for name hash table
1377    hash_data = []
1378    for char in CODE_POINTS:
1379        record = unicode.table[char]
1380        if record:
1381            name = record[1].strip()
1382            if name and name[0] != "<":
1383                hash_data.append((name, char))
1384
1385    db.write("\n// name => code dictionary")
1386    Hash(hash_data).dump(db, trace)
1387
1388    aliases = [codepoint for _, codepoint in unicode.aliases]
1389    UIntArray("kNameAliases", aliases).dump(db, trace)
1390
1391    StructArray(
1392        "UnicodeNamedSequence", "kNamedSequences", unicode.named_sequences
1393    ).dump(db, trace)
1394
1395
1396def write_type_data(unicode, db, trace):  # noqa: C901
1397    """Writes Unicode character type tables to the database file."""
1398
1399    # extract unicode types
1400    dummy = TypeRecord(0, 0, 0, 0, 0, 0)
1401    table = [dummy]
1402    cache = {0: dummy}
1403    index = [0] * NUM_CODE_POINTS
1404    numeric = {}
1405    spaces = []
1406    linebreaks = []
1407    extended_cases = []
1408
1409    for char in CODE_POINTS:
1410        record = unicode.table[char]
1411        if record:
1412            # extract database properties
1413            category = record[2]
1414            bidirectional = record[4]
1415            properties = record[16]
1416            flags = 0
1417            # TODO(T55176519): delta = True
1418            if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
1419                flags |= ALPHA_MASK
1420            if "Lowercase" in properties:
1421                flags |= LOWER_MASK
1422            if "Line_Break" in properties or bidirectional == "B":
1423                flags |= LINEBREAK_MASK
1424                linebreaks.append(char)
1425            if category == "Zs" or bidirectional in ("WS", "B", "S"):
1426                flags |= SPACE_MASK
1427                spaces.append(char)
1428            if category == "Lt":
1429                flags |= TITLE_MASK
1430            if "Uppercase" in properties:
1431                flags |= UPPER_MASK
1432            if char == ord(" ") or category[0] not in ("C", "Z"):
1433                flags |= PRINTABLE_MASK
1434            if "XID_Start" in properties:
1435                flags |= XID_START_MASK
1436            if "XID_Continue" in properties:
1437                flags |= XID_CONTINUE_MASK
1438            if "Cased" in properties:
1439                flags |= CASED_MASK
1440            if "Case_Ignorable" in properties:
1441                flags |= CASE_IGNORABLE_MASK
1442            sc = unicode.special_casing.get(char)
1443            cf = unicode.case_folding.get(char, [char])
1444            if record[12]:
1445                upper = int(record[12], 16)
1446            else:
1447                upper = char
1448            if record[13]:
1449                lower = int(record[13], 16)
1450            else:
1451                lower = char
1452            if record[14]:
1453                title = int(record[14], 16)
1454            else:
1455                title = upper
1456            if sc is None and cf != [lower]:
1457                sc = ([lower], [title], [upper])
1458            if sc is None:
1459                if upper == lower == title:
1460                    upper = lower = title = 0
1461                else:
1462                    upper = upper - char
1463                    lower = lower - char
1464                    title = title - char
1465                    assert (
1466                        abs(upper) <= 2147483647
1467                        and abs(lower) <= 2147483647
1468                        and abs(title) <= 2147483647
1469                    )
1470            else:
1471                # This happens either when some character maps to more than one
1472                # character in uppercase, lowercase, or titlecase or the
1473                # casefolded version of the character is different from the
1474                # lowercase. The extra characters are stored in a different
1475                # array.
1476                flags |= EXTENDED_CASE_MASK
1477                lower = len(extended_cases) | (len(sc[0]) << 24)
1478                extended_cases.extend(sc[0])
1479                if cf != sc[0]:
1480                    lower |= len(cf) << 20
1481                    extended_cases.extend(cf)
1482                upper = len(extended_cases) | (len(sc[2]) << 24)
1483                extended_cases.extend(sc[2])
1484                # Title is probably equal to upper.
1485                if sc[1] == sc[2]:
1486                    title = upper
1487                else:
1488                    title = len(extended_cases) | (len(sc[1]) << 24)
1489                    extended_cases.extend(sc[1])
1490            # decimal digit, integer digit
1491            decimal = 0
1492            if record[6]:
1493                flags |= DECIMAL_MASK
1494                decimal = int(record[6])
1495            digit = 0
1496            if record[7]:
1497                flags |= DIGIT_MASK
1498                digit = int(record[7])
1499            if record[8]:
1500                flags |= NUMERIC_MASK
1501                numeric.setdefault(record[8], []).append(char)
1502            item = TypeRecord(decimal, digit, flags, lower, title, upper)
1503            # add entry to index and item tables
1504            i = cache.get(item)
1505            if i is None:
1506                cache[item] = i = len(table)
1507                table.append(item)
1508            index[char] = i
1509
1510    print(len(table), "unique character type entries")
1511    print(sum(map(len, numeric.values())), "numeric code points")
1512    print(len(spaces), "whitespace code points")
1513    print(len(linebreaks), "linebreak code points")
1514    print(len(extended_cases), "extended case array")
1515
1516    StructArray(
1517        "UnicodeTypeRecord",
1518        "kTypeRecords",
1519        table,
1520        "a list of unique character type descriptors",
1521    ).dump(db, trace)
1522
1523    # split record index table
1524    index1, index2, shift = splitbins(index, trace)
1525    db.write(
1526        f"""
1527// type indices
1528static const int kTypeIndexShift = {shift};
1529static const int32_t kTypeIndexMask = (int32_t{{1}} << kTypeIndexShift) - 1;
1530"""
1531    )
1532    UIntArray("kTypeIndex1", index1).dump(db, trace)
1533    UIntArray("kTypeIndex2", index2).dump(db, trace)
1534
1535    # extended case mappings
1536    CodePointArray("kExtendedCase", extended_cases).dump(db, trace)
1537
1538    db.write(
1539        """
1540double numericValue(int32_t code_point) {
1541  switch (code_point) {"""
1542    )
1543    for value, codepoints in sorted(numeric.items()):
1544        parts = value.split("/")
1545        value = " / ".join(repr(float(part)) for part in parts)
1546
1547        codepoints.sort()
1548        for codepoint in codepoints:
1549            db.write(f"\n    case {codepoint:#08x}:")
1550        db.write(f"\n      return {value};")
1551    db.write(
1552        """
1553    default:
1554      return -1.0;
1555  }
1556}
1557
1558bool unicodeIsLinebreak(int32_t code_point) {
1559  switch (code_point) {"""
1560    )
1561    for codepoint in sorted(linebreaks):
1562        db.write(f"\n    case {codepoint:#08x}:")
1563    db.write(
1564        """
1565      return true;
1566    default:
1567      return false;
1568  }
1569}
1570
1571bool unicodeIsWhitespace(int32_t code_point) {
1572  switch (code_point) {"""
1573    )
1574    for codepoint in sorted(spaces):
1575        db.write(f"\n    case {codepoint:#08x}:")
1576    db.write(
1577        """
1578      return true;
1579    default:
1580      return false;
1581  }
1582}
1583"""
1584    )
1585
1586
1587def write_change_data(unicode, db, trace):
1588    dummy = ChangeRecord(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0.0)
1589    records = [dummy]
1590    cache = {dummy: 0}
1591    index = [0] * NUM_CODE_POINTS
1592
1593    # Generate delta tables for 3.2
1594    for char, record in enumerate(unicode.change_records):
1595        record = ChangeRecord(*record)
1596        if record in cache:
1597            index[char] = cache[record]
1598        else:
1599            index[char] = cache[record] = len(records)
1600            records.append(record)
1601
1602    StructArray(
1603        "UnicodeChangeRecord",
1604        "kChangeRecords",
1605        records,
1606        f"deltas from Unicode {OLD_VERSION} to the current version",
1607    ).dump(db, trace)
1608
1609    index1, index2, shift = splitbins(index, trace)
1610    db.write(
1611        f"""
1612static const int kChangeIndexShift = {shift};
1613static const int kChangeIndexMask = (1 << kChangeIndexShift) - 1;
1614"""
1615    )
1616    UIntArray("kChangeIndex1", index1).dump(db, trace)
1617    UIntArray("kChangeIndex2", index2).dump(db, trace)
1618
1619    db.write(
1620        """
1621int32_t normalizeOld(int32_t code_point) {
1622  switch (code_point) {"""
1623    )
1624    for k, v in unicode.normalization_changes:
1625        db.write(
1626            f"""
1627    case {k:#x}:
1628      return 0x{v.lower()};"""
1629        )
1630    db.write(
1631        """
1632    default:
1633      return -1;
1634  }
1635}
1636"""
1637    )
1638
1639
1640def write_string_literals(db, trace):
1641    SmallStrArray("kBidirectionalNames", BIDIRECTIONAL_NAMES).dump(db, trace)
1642    SmallStrArray("kCategoryNames", CATEGORY_NAMES).dump(db, trace)
1643    SmallStrArray("kEastAsianWidthNames", EASTASIANWIDTH_NAMES).dump(db, trace)
1644
1645
1646def write_db_coda(db):
1647    db.write(
1648        """
1649int32_t composeCodePoint(int32_t first, int32_t last) {
1650  int32_t i = first * kTotalLast + last;
1651  int32_t j = kCompIndex[i >> kCompShift];
1652  return kCompData[(j << kCompShift) + (i & kCompMask)];
1653}
1654
1655UnicodeDecomposition decomposeCodePoint(int32_t code_point) {
1656  DCHECK_BOUND(code_point, kMaxUnicode);
1657  uint16_t offset = kDecompIndex1[code_point >> kDecompShift];
1658  offset = kDecompIndex2[(offset << kDecompShift) + (code_point & kDecompMask)];
1659
1660  int32_t decomp = kDecompData[offset];
1661  int32_t count = decomp >> kBitsPerByte;
1662  int8_t prefix = decomp & kMaxByte;
1663  return {kDecompPrefix[prefix], count, &kDecompData[offset + 1]};
1664}
1665
1666static int32_t findNFC(const Reindex* nfc, int32_t code_point) {
1667  for (word index = 0; nfc[index].start != 0; index++) {
1668    int32_t delta = code_point - nfc[index].start;
1669    if (delta < 0) {
1670      return -1;
1671    }
1672    if (delta <= nfc[index].count) {
1673      return delta + nfc[index].index;
1674    }
1675  }
1676  return -1;
1677}
1678
1679int32_t findNFCFirst(int32_t code_point) {
1680  return findNFC(kNFCFirst, code_point);
1681}
1682
1683int32_t findNFCLast(int32_t code_point) {
1684  return findNFC(kNFCLast, code_point);
1685}
1686
1687struct HangulJamo {
1688  const int length;
1689  const byte name[3];
1690};
1691
1692static const HangulJamo kHangulLeads[] = {
1693    {1, {'G'}},      {2, {'G', 'G'}}, {1, {'N'}},      {1, {'D'}},
1694    {2, {'D', 'D'}}, {1, {'R'}},      {1, {'M'}},      {1, {'B'}},
1695    {2, {'B', 'B'}}, {1, {'S'}},      {2, {'S', 'S'}}, {0},
1696    {1, {'J'}},      {2, {'J', 'J'}}, {1, {'C'}},      {1, {'K'}},
1697    {1, {'T'}},      {1, {'P'}},      {1, {'H'}},
1698};
1699
1700static const HangulJamo kHangulVowels[] = {
1701    {1, {'A'}},           {2, {'A', 'E'}},      {2, {'Y', 'A'}},
1702    {3, {'Y', 'A', 'E'}}, {2, {'E', 'O'}},      {1, {'E'}},
1703    {3, {'Y', 'E', 'O'}}, {2, {'Y', 'E'}},      {1, {'O'}},
1704    {2, {'W', 'A'}},      {3, {'W', 'A', 'E'}}, {2, {'O', 'E'}},
1705    {2, {'Y', 'O'}},      {1, {'U'}},           {3, {'W', 'E', 'O'}},
1706    {2, {'W', 'E'}},      {2, {'W', 'I'}},      {2, {'Y', 'U'}},
1707    {2, {'E', 'U'}},      {2, {'Y', 'I'}},      {1, {'I'}},
1708};
1709
1710static const HangulJamo kHangulTrails[] = {
1711    {0},
1712    {1, {'G'}},
1713    {2, {'G', 'G'}},
1714    {2, {'G', 'S'}},
1715    {1, {'N'}},
1716    {2, {'N', 'J'}},
1717    {2, {'N', 'H'}},
1718    {1, {'D'}},
1719    {1, {'L'}},
1720    {2, {'L', 'G'}},
1721    {2, {'L', 'M'}},
1722    {2, {'L', 'B'}},
1723    {2, {'L', 'S'}},
1724    {2, {'L', 'T'}},
1725    {2, {'L', 'P'}},
1726    {2, {'L', 'H'}},
1727    {1, {'M'}},
1728    {1, {'B'}},
1729    {2, {'B', 'S'}},
1730    {1, {'S'}},
1731    {2, {'S', 'S'}},
1732    {2, {'N', 'G'}},
1733    {1, {'J'}},
1734    {1, {'C'}},
1735    {1, {'K'}},
1736    {1, {'T'}},
1737    {1, {'P'}},
1738    {1, {'H'}},
1739};
1740
1741static_assert(ARRAYSIZE(kHangulLeads) == Unicode::kHangulLeadCount,
1742              "mismatch in number of Hangul lead jamo");
1743static_assert(ARRAYSIZE(kHangulVowels) == Unicode::kHangulVowelCount,
1744              "mismatch in number of Hangul vowel jamo");
1745static_assert(ARRAYSIZE(kHangulTrails) == Unicode::kHangulTrailCount,
1746              "mismatch in number of Hangul trail jamo");
1747
1748static int32_t findHangulJamo(const byte* str, word size,
1749                              const HangulJamo array[], word count) {
1750  word result = -1;
1751  int length = -1;
1752  for (word i = 0; i < count; i++) {
1753    HangulJamo jamo = array[i];
1754    if (length < jamo.length && jamo.length <= size &&
1755        (std::memcmp(str, jamo.name, jamo.length) == 0)) {
1756      length = jamo.length;
1757      result = i;
1758    }
1759  }
1760  return result;
1761}
1762
1763static int32_t findHangulSyllable(const byte* name, word size) {
1764  word pos = 16;
1765  word lead = findHangulJamo(name + pos, size - pos, kHangulLeads,
1766                             Unicode::kHangulLeadCount);
1767  if (lead == -1) {
1768    return -1;
1769  }
1770
1771  pos += kHangulLeads[lead].length;
1772  word vowel = findHangulJamo(name + pos, size - pos, kHangulVowels,
1773                              Unicode::kHangulVowelCount);
1774  if (vowel == -1) {
1775    return -1;
1776  }
1777
1778  pos += kHangulVowels[vowel].length;
1779  word trail = findHangulJamo(name + pos, size - pos, kHangulTrails,
1780                              Unicode::kHangulTrailCount);
1781  if (trail == -1 || pos + kHangulTrails[trail].length != size) {
1782    return -1;
1783  }
1784
1785  return Unicode::kHangulSyllableStart +
1786         (lead * Unicode::kHangulVowelCount + vowel) *
1787             Unicode::kHangulTrailCount +
1788         trail;
1789}
1790
1791static bool isUnifiedIdeograph(int32_t code_point) {
1792  return """
1793    )
1794
1795    for i, cjk in enumerate(CJK_RANGES):
1796        if i > 0:
1797            db.write(" ||\n         ")
1798        db.write(f"({cjk.start:#010x} <= code_point && code_point <= {cjk.end:#010x})")
1799
1800    db.write(
1801        """;
1802}
1803
1804static uint32_t myHash(const byte* name, word size) {
1805  uint32_t hash = 0;
1806  for (word i = 0; i < size; i++) {
1807    hash = (hash * kCodeMagic) + ASCII::toUpper(name[i]);
1808    if (hash > 0x00ffffff) {
1809      hash = (hash & 0x00ffffff) ^ ((hash & 0xff000000) >> 24);
1810    }
1811  }
1812  return hash;
1813}
1814
1815static bool nameMatchesCodePoint(int32_t code_point, const byte* name,
1816                                 word size) {
1817  byte buffer[kMaxNameLength + 1];
1818  if (!nameFromCodePoint(code_point, buffer, kMaxNameLength)) {
1819    return false;
1820  }
1821  for (word i = 0; i < size; i++) {
1822    if (ASCII::toUpper(name[i]) != buffer[i]) {
1823      return false;
1824    }
1825  }
1826  return buffer[size] == '\\0';
1827}
1828
1829static int32_t getCodePoint(const byte* name, word size,
1830                            int32_t (*check)(int32_t)) {
1831  if (size >= 16 && std::memcmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1832    return findHangulSyllable(name, size);
1833  }
1834
1835  if (size >= 22 && std::memcmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1836    // Four or five uppercase hexdigits must follow
1837    name += 22;
1838    size -= 22;
1839    if (size != 4 && size != 5) {
1840      return -1;
1841    }
1842
1843    int32_t result = 0;
1844    while (size--) {
1845      result *= 16;
1846      if ('0' <= *name && *name <= '9') {
1847        result += *name - '0';
1848      } else if ('A' <= *name && *name <= 'F') {
1849        result += *name - 'A' + 10;
1850      } else {
1851        return -1;
1852      }
1853      name++;
1854    }
1855    if (!isUnifiedIdeograph(result)) {
1856      return -1;
1857    }
1858    return result;
1859  }
1860
1861  uint32_t hash = myHash(name, size);
1862  uint32_t step = (hash ^ (hash >> 3)) & kCodeMask;
1863  if (step == 0) {
1864    step = kCodeMask;
1865  }
1866
1867  for (uint32_t idx = (~hash) & kCodeMask;; idx = (idx + step) & kCodeMask) {
1868    int32_t result = kCodeHash[idx];
1869    if (result == 0) {
1870      return -1;
1871    }
1872    if (nameMatchesCodePoint(result, name, size)) {
1873      return check(result);
1874    }
1875    step = step << 1;
1876    if (step > kCodeMask) {
1877      step = step ^ kCodePoly;
1878    }
1879  }
1880}
1881
1882static int32_t checkAliasAndNamedSequence(int32_t code_point) {
1883  if (Unicode::isNamedSequence(code_point)) {
1884    return -1;
1885  }
1886  if (Unicode::isAlias(code_point)) {
1887    return kNameAliases[code_point - Unicode::kAliasStart];
1888  }
1889  return code_point;
1890}
1891
1892int32_t codePointFromName(const byte* name, word size) {
1893  return getCodePoint(name, size, checkAliasAndNamedSequence);
1894}
1895
1896static int32_t checkAlias(int32_t code_point) {
1897  if (Unicode::isAlias(code_point)) {
1898    return kNameAliases[code_point - Unicode::kAliasStart];
1899  }
1900  return code_point;
1901}
1902
1903int32_t codePointFromNameOrNamedSequence(const byte* name, word size) {
1904  return getCodePoint(name, size, checkAlias);
1905}
1906
1907int32_t extendedCaseMapping(int32_t index) {
1908  return kExtendedCase[index];
1909}
1910
1911bool nameFromCodePoint(int32_t code_point, byte* buffer, word size) {
1912  DCHECK_BOUND(code_point, kMaxUnicode);
1913
1914  if (Unicode::isHangulSyllable(code_point)) {
1915    if (size < 27) {
1916      return false;  // worst case: HANGUL SYLLABLE <10 chars>
1917    }
1918
1919    int32_t offset = code_point - Unicode::kHangulSyllableStart;
1920    HangulJamo lead = kHangulLeads[offset / Unicode::kHangulCodaCount];
1921    HangulJamo vowel = kHangulVowels[(offset % Unicode::kHangulCodaCount) /
1922                                     Unicode::kHangulTrailCount];
1923    HangulJamo trail = kHangulTrails[offset % Unicode::kHangulTrailCount];
1924
1925    std::memcpy(buffer, "HANGUL SYLLABLE ", 16);
1926    buffer += 16;
1927    std::memcpy(buffer, lead.name, lead.length);
1928    buffer += lead.length;
1929    std::memcpy(buffer, vowel.name, vowel.length);
1930    buffer += vowel.length;
1931    std::memcpy(buffer, trail.name, trail.length);
1932    buffer[trail.length] = '\\0';
1933    return true;
1934  }
1935
1936  if (isUnifiedIdeograph(code_point)) {
1937    if (size < 28) {
1938      return false;  // worst case: CJK UNIFIED IDEOGRAPH-*****
1939    }
1940    std::snprintf(reinterpret_cast<char*>(buffer), size,
1941                  "CJK UNIFIED IDEOGRAPH-%" PRIX32, code_point);
1942    return true;
1943  }
1944
1945  uint32_t offset = kPhrasebookOffset1[code_point >> kPhrasebookShift];
1946  offset = kPhrasebookOffset2[(offset << kPhrasebookShift) +
1947                              (code_point & kPhrasebookMask)];
1948
1949  if (offset == 0) {
1950    return false;
1951  }
1952
1953  for (word i = 0;;) {
1954    if (i > 0) {
1955      if (i >= size) {
1956        return false;  // buffer overflow
1957      }
1958      buffer[i++] = ' ';
1959    }
1960
1961    // get word index
1962    word word_idx = kPhrasebook[offset] - kPhrasebookShort;
1963    if (word_idx >= 0) {
1964      word_idx = (word_idx << 8) + kPhrasebook[offset + 1];
1965      offset += 2;
1966    } else {
1967      word_idx = kPhrasebook[offset++];
1968    }
1969
1970    // copy word string from lexicon
1971    const byte* word_ptr = kLexicon + kLexiconOffset[word_idx];
1972    while (*word_ptr < 128) {
1973      if (i >= size) {
1974        return false;  // buffer overflow
1975      }
1976      buffer[i++] = *word_ptr++;
1977    }
1978
1979    if (i >= size) {
1980      return false;  // buffer overflow
1981    }
1982    buffer[i++] = *word_ptr & 0x7f;
1983    if (*word_ptr == 0x80) {
1984      return true;
1985    }
1986  }
1987}
1988
1989const UnicodeChangeRecord* changeRecord(int32_t code_point) {
1990  if (code_point > kMaxUnicode) {
1991    return &kChangeRecords[0];
1992  }
1993
1994  int index = kChangeIndex1[code_point >> kChangeIndexShift];
1995  index <<= kChangeIndexShift;
1996  index = kChangeIndex2[index + (code_point & kChangeIndexMask)];
1997  return &kChangeRecords[index];
1998}
1999
2000const UnicodeDatabaseRecord* databaseRecord(int32_t code_point) {
2001  if (code_point > kMaxUnicode) {
2002    return &kDatabaseRecords[0];
2003  }
2004
2005  int index = kDatabaseIndex1[code_point >> kDatabaseIndexShift];
2006  index <<= kDatabaseIndexShift;
2007  index = kDatabaseIndex2[index + (code_point & kDatabaseIndexMask)];
2008  return &kDatabaseRecords[index];
2009}
2010
2011const UnicodeNamedSequence* namedSequence(int32_t code_point) {
2012  DCHECK(Unicode::isNamedSequence(code_point), "not a named sequence");
2013  return &kNamedSequences[code_point - Unicode::kNamedSequenceStart];
2014}
2015
2016const UnicodeTypeRecord* typeRecord(int32_t code_point) {
2017  if (code_point > kMaxUnicode) {
2018    return &kTypeRecords[0];
2019  }
2020
2021  int index = kTypeIndex1[code_point >> kTypeIndexShift];
2022  index <<= kTypeIndexShift;
2023  index = kTypeIndex2[index + (code_point & kTypeIndexMask)];
2024  return &kTypeRecords[index];
2025}
2026
2027}  // namespace py
2028"""
2029    )
2030
2031
2032def write_db(trace=False):
2033    print(f"--- Reading {UNICODE_DATA} v{UNIDATA_VERSION} ...")
2034    unicode = UnicodeData(UNIDATA_VERSION)
2035
2036    print(len(list(filter(None, unicode.table))), "characters")
2037
2038    print(f"--- Reading {UNICODE_DATA} v{OLD_VERSION} ...")
2039    old_unicode = UnicodeData(OLD_VERSION, check_cjk=False)
2040    print(len(list(filter(None, old_unicode.table))), "characters")
2041    unicode.merge(old_unicode)
2042
2043    with open(HEADER_PATH, "w") as header:
2044        write_header(unicode, header)
2045
2046    with open(DB_PATH, "w") as db:
2047        write_db_prelude(db)
2048        write_database_records(unicode, db, trace)
2049        write_name_data(unicode, db, trace)
2050        write_type_data(unicode, db, trace)
2051        write_change_data(unicode, db, trace)
2052        write_string_literals(db, trace)
2053        write_db_coda(db)
2054
2055
2056if __name__ == "__main__":
2057    write_db(True)