this repo has no description
at trunk 2057 lines 64 kB view raw
#!/usr/bin/env python3
# Portions copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
"""Generate Unicode name, property, and type databases.

This script converts Unicode 3.2 and 11.0 database files into a Pyro-specific format.
Run the script using Python version >= 3.7, from this directory, without args.

The algorithm was taken from Fredrik Lundh's script for CPython.
"""

import os
import sys
import zipfile
from dataclasses import dataclass


SCRIPT = os.path.relpath(__file__)
PYRO_DIR = os.path.normpath(os.path.join(SCRIPT, "..", ".."))
RUNTIME_DIR = os.path.join(PYRO_DIR, "runtime")
HEADER = "unicode-db.h"
HEADER_PATH = os.path.join(RUNTIME_DIR, HEADER)
DB_PATH = os.path.join(RUNTIME_DIR, "unicode-db.cpp")
LINE_WIDTH = 80

NUM_CODE_POINTS = sys.maxunicode + 1
CODE_POINTS = range(NUM_CODE_POINTS)

UNIDATA_VERSION = "11.0.0"
OLD_VERSION = "3.2.0"

# UCD files
CASE_FOLDING = "CaseFolding.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"
LINE_BREAK = "LineBreak.txt"
NAME_ALIASES = "NameAliases.txt"
NAMED_SEQUENCES = "NamedSequences.txt"
SPECIAL_CASING = "SpecialCasing.txt"
UNICODE_DATA = "UnicodeData.txt"
UNIHAN = "Unihan.zip"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA
MAX_DECOMPOSITION = 18

# General category abbreviations; index 0 ("Cn") doubles as "unassigned".
# The duplicate "Cn" at index 17 is inherited from CPython's table and is
# harmless: list.index("Cn") always resolves to 0.
CATEGORY_NAMES = [
    "Cn",
    "Lu",
    "Ll",
    "Lt",
    "Mn",
    "Mc",
    "Me",
    "Nd",
    "Nl",
    "No",
    "Zs",
    "Zl",
    "Zp",
    "Cc",
    "Cf",
    "Cs",
    "Co",
    "Cn",
    "Lm",
    "Lo",
    "Pc",
    "Pd",
    "Ps",
    "Pe",
    "Pi",
    "Pf",
    "Po",
    "Sm",
    "Sc",
    "Sk",
    "So",
]

BIDIRECTIONAL_NAMES = [
    "",
    "L",
    "LRE",
    "LRO",
    "R",
    "AL",
    "RLE",
    "RLO",
    "PDF",
    "EN",
    "ES",
    "ET",
    "AN",
    "CS",
    "NSM",
    "BN",
    "B",
    "S",
    "WS",
    "ON",
    "LRI",
    "RLI",
    "FSI",
    "PDI",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# Bit flags stored in UnicodeTypeRecord::flags (mirrored in the generated
# C++ header by write_header).
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000


def open_data(filename, version):
    """Open a UCD data file for the given Unicode version.

    Returns a text-mode handle for .txt files and a binary-mode handle for
    Unihan.zip. Raises OSError with a download URL if the file is missing;
    this script never downloads data itself.
    """
    ucd_dir = os.path.join(PYRO_DIR, "third-party", f"ucd-{version}")
    if not os.path.exists(ucd_dir):
        os.mkdir(ucd_dir)

    path = os.path.join(ucd_dir, filename)
    if not os.path.exists(path):
        if version == OLD_VERSION:
            # irregular url structure
            # NOTE: was `root, path = os.path.splitext(...)`, which clobbered
            # `path` and made the error below report only the extension.
            root, ext = os.path.splitext(filename)
            url = f"http://www.unicode.org/Public/3.2-Update/{root}-{version}{ext}"
        else:
            url = f"http://www.unicode.org/Public/{version}/ucd/{filename}"
        raise OSError(
            f"""Cannot open {path}: file does not exist.
Find it online at {url}"""
        )

    if path.endswith(".txt"):
        return open(path, encoding="utf-8")
    else:
        # Unihan.zip
        return open(path, "rb")


########################################################################################
# CJK ranges


@dataclass(frozen=True)
class CJKRange:
    # Inclusive range of CJK Unified Ideograph code points.
    start: int
    end: int


# Expected <CJK Ideograph ... First/Last> ranges in UNIDATA_VERSION; the
# loader cross-checks what it finds in UnicodeData.txt against this list.
CJK_RANGES = (
    CJKRange(0x3400, 0x4DB5),
    CJKRange(0x4E00, 0x9FEF),
    CJKRange(0x20000, 0x2A6D6),
    CJKRange(0x2A700, 0x2B734),
    CJKRange(0x2B740, 0x2B81D),
    CJKRange(0x2B820, 0x2CEA1),
    CJKRange(0x2CEB0, 0x2EBE0),
)


########################################################################################
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk


class UnicodeData:
    """Loads a Unicode data file, checking disk first and then internet.

    Taken from the unidb utilities.
    Copyright (c) 1999-2000 by Secret Labs AB
    """

    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version, check_cjk=True):  # noqa: C901
        self.version = version

        self.aliases = []
        self.case_folding = {}
        self.change_records = []
        self.exclusions = set()
        self.named_sequences = []
        self.normalization_changes = []
        self.special_casing = {}
        # table[cp] is the semicolon-split UnicodeData.txt record for cp,
        # or None for unassigned code points.
        self.table = [None] * NUM_CODE_POINTS

        with open_data(UNICODE_DATA, version) as file:
            while True:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                self.table[char] = s

        # expand first-last ranges
        field = None
        cjk_ranges = []
        for char in CODE_POINTS:
            s = self.table[char]
            if s is not None:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    if check_cjk and s[1].startswith("<CJK Ideograph"):
                        start = int(field[0], 16)
                        end = int(s[0], 16)
                        cjk_ranges.append(CJKRange(start, end))
                    s[1] = ""
                    field = None
            elif field:
                # inside a First..Last range: clone the First record
                f2 = field[:]
                f2[0] = f"{char:X}"
                self.table[char] = f2

        if check_cjk and set(cjk_ranges) != set(CJK_RANGES):
            raise ValueError(f"CJK ranges deviate: found {cjk_ranges!r}")

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != OLD_VERSION:
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith("#"):
                        continue
                    char, name, abbrev = s.split(";")
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith("#"):
                        continue
                    name, chars = s.split(";")
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in write_header is OK
                    assert len(chars) <= 4, "change the NamedSequence array size"
                    assert all(c < NUM_CODE_POINTS for c in chars)
                    self.named_sequences.append(NamedSequence(name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = set()
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if s and not s.startswith("#"):
                    char = int(s.split()[0], 16)
                    self.exclusions.add(char)

        widths = [None] * NUM_CODE_POINTS
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == "#":
                    continue
                s = s.split()[0].split(";")
                if ".." in s[0]:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                    chars = list(range(first, last + 1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        # append the east-asian width (field 15) and an empty derived-props
        # set (field 16) to every assigned record
        for char in CODE_POINTS:
            if self.table[char] is not None:
                self.table[char].append(widths[char])
                self.table[char].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split("#", 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split("..")]
                    chars = list(range(first, last + 1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if self.table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        self.table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition("#")[0]
                s = [i.strip() for i in s.split(";")]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if ".." not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                for char in range(first, last + 1):
                    self.table[char][-1].add("Line_Break")

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * NUM_CODE_POINTS
        qc_order = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if "#" in s:
                    s = s[: s.index("#")]
                s = [i.strip() for i in s.split(";")]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = "MN".index(s[2]) + 1  # Maybe or No
                quickcheck_shift = qc_order.index(s[1]) * 2
                quickcheck <<= quickcheck_shift
                if ".." not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                for char in range(first, last + 1):
                    assert not (quickchecks[char] >> quickcheck_shift) & 3
                    quickchecks[char] |= quickcheck

        with open_data(UNIHAN, version) as file:
            # `zf` (was `zip`) avoids shadowing the zip() builtin
            zf = zipfile.ZipFile(file)
            if version == OLD_VERSION:
                data = zf.open(f"Unihan-{OLD_VERSION}.txt").read()
            else:
                data = zf.open("Unihan_NumericValues.txt").read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith("U+"):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ("kAccountingNumeric", "kPrimaryNumeric", "kOtherNumeric"):
                continue
            value = value.strip().replace(",", "")
            i = int(code[2:], 16)
            # Patch the numeric field
            if self.table[i] is not None:
                self.table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split("#", 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != OLD_VERSION:
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split("#", 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

        # append the combined quickcheck bits as record field 17
        for char in CODE_POINTS:
            if self.table[char] is not None:
                self.table[char].append(quickchecks[char])

    def merge(self, old):  # noqa: C901
        """Record per-code-point deltas between `old` (3.2.0) and this version.

        Populates self.change_records and self.normalization_changes, which
        the writers emit so the runtime can answer queries about the old
        Unicode version.
        """
        # Changes to exclusion file not implemented yet
        if old.exclusions != self.exclusions:
            raise NotImplementedError("exclusions differ")

        # In these change records, 0xFF means "no change"
        bidir_changes = [0xFF] * NUM_CODE_POINTS
        category_changes = [0xFF] * NUM_CODE_POINTS
        decimal_changes = [0xFF] * NUM_CODE_POINTS
        east_asian_width_changes = [0xFF] * NUM_CODE_POINTS
        mirrored_changes = [0xFF] * NUM_CODE_POINTS
        # In numeric data, 0 means "no change",
        # -1 means "did not have a numeric value
        numeric_changes = [0.0] * NUM_CODE_POINTS
        # normalization_changes is a list of key-value pairs
        normalization_changes = []
        for char in CODE_POINTS:
            if self.table[char] is None:
                # Characters unassigned in the new version ought to
                # be unassigned in the old one
                assert old.table[char] is None
                continue
            # check characters unassigned in the old version
            if old.table[char] is None:
                # category 0 is "unassigned"
                category_changes[char] = 0
                continue
            # check characters that differ
            if old.table[char] != self.table[char]:
                for k in range(len(old.table[char])):
                    if old.table[char][k] != self.table[char][k]:
                        value = old.table[char][k]
                        if k == 1 and char in PUA_15:
                            # the name is not set in the old.table, but in the
                            # self.table we are using it for aliases and named seq
                            assert value == ""
                        elif k == 2:
                            category_changes[char] = CATEGORY_NAMES.index(value)
                        elif k == 4:
                            bidir_changes[char] = BIDIRECTIONAL_NAMES.index(value)
                        elif k == 5:
                            # We assume all normalization changes are in 1:1 mappings
                            assert " " not in value
                            normalization_changes.append((char, value))
                        elif k == 6:
                            # only support changes where the old value is a single digit
                            assert value in "0123456789"
                            decimal_changes[char] = int(value)
                        elif k == 8:
                            # Since 0 encodes "no change", the old value is better not 0
                            if not value:
                                numeric_changes[char] = -1.0
                            else:
                                numeric_changes[char] = float(value)
                                assert numeric_changes[char] not in (0, -1)
                        elif k == 9:
                            if value == "Y":
                                mirrored_changes[char] = 1
                            else:
                                mirrored_changes[char] = 0
                        elif k == 11:
                            # change to ISO comment, ignore
                            pass
                        elif k == 12:
                            # change to simple uppercase mapping; ignore
                            pass
                        elif k == 13:
                            # change to simple lowercase mapping; ignore
                            pass
                        elif k == 14:
                            # change to simple titlecase mapping; ignore
                            pass
                        elif k == 15:
                            # change to east asian width
                            east_asian_width_changes[char] = EASTASIANWIDTH_NAMES.index(
                                value
                            )
                        elif k == 16:
                            # derived property changes; not yet
                            pass
                        elif k == 17:
                            # normalization quickchecks are not performed
                            # for older versions
                            pass
                        else:

                            class Difference(Exception):
                                pass

                            raise Difference(
                                hex(char), k, old.table[char], self.table[char]
                            )

        # NOTE(review): zip() yields a one-shot iterator, so change_records
        # can only be consumed once — presumably by a single writer; confirm.
        self.change_records = zip(
            bidir_changes,
            category_changes,
            decimal_changes,
            east_asian_width_changes,
            mirrored_changes,
            numeric_changes,
        )
        self.normalization_changes = normalization_changes


########################################################################################
Helper functions & classes 539 540 541def getsize(data): 542 """Return the smallest possible integer size for the given array.""" 543 maxdata = max(data) 544 if maxdata < (1 << 8): 545 return 1 546 if maxdata < (1 << 16): 547 return 2 548 return 4 549 550 551def splitbins(t, trace=False): # noqa: C901 552 """t, trace=False -> (t1, t2, shift). Split a table to save space. 553 554 t is a sequence of ints. This function can be useful to save space if 555 many of the ints are the same. t1 and t2 are lists of ints, and shift 556 is an int, chosen to minimize the combined size of t1 and t2 (in C 557 code), and where for each i in range(len(t)), 558 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 559 where mask is a bitmask isolating the last "shift" bits. 560 561 If optional arg trace is non-zero (default zero), progress info 562 is printed to sys.stderr. The higher the value, the more info 563 you'll get. 564 """ 565 566 if trace: 567 568 def dump(t1, t2, shift, bytes): 569 sys.stderr.write( 570 f"{len(t1)}+{len(t2)} bins at shift {shift}; {bytes} bytes\n" 571 ) 572 573 sys.stderr.write(f"Size of original table: {len(t) * getsize(t)} bytes\n") 574 575 n = len(t) - 1 # last valid index 576 maxshift = 0 # the most we can shift n and still have something left 577 if n > 0: 578 while n >> 1: 579 n >>= 1 580 maxshift += 1 581 del n 582 bytes = sys.maxsize # smallest total size so far 583 t = tuple(t) # so slices can be dict keys 584 for shift in range(maxshift + 1): 585 t1 = [] 586 t2 = [] 587 size = 2 ** shift 588 bincache = {} 589 for i in range(0, len(t), size): 590 bin = t[i : i + size] 591 index = bincache.get(bin) 592 if index is None: 593 index = len(t2) 594 bincache[bin] = index 595 t2.extend(bin) 596 t1.append(index >> shift) 597 # determine memory size 598 b = len(t1) * getsize(t1) + len(t2) * getsize(t2) 599 if trace: 600 dump(t1, t2, shift, b) 601 if b < bytes: 602 best = t1, t2, shift 603 bytes = b 604 t1, t2, shift = best 605 if trace: 606 
sys.stderr.write(f"Best: ") 607 dump(t1, t2, shift, bytes) 608 if __debug__: 609 # exhaustively verify that the decomposition is correct 610 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits 611 for i in range(len(t)): 612 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 613 return best 614 615 616@dataclass 617class WordData: 618 word: str 619 index: int 620 frequency: int 621 622 623class Words: 624 def __init__(self, names, trace=False): 625 self.indices = {} 626 self.frequencies = {} 627 628 count = num_chars = num_words = 0 629 for char in CODE_POINTS: 630 name = names[char] 631 if name is None: 632 continue 633 634 words = name.split() 635 num_chars += len(name) 636 num_words += len(words) 637 for word in words: 638 if word in self.indices: 639 self.frequencies[word] += 1 640 else: 641 self.frequencies[word] = 1 642 self.indices[word] = count 643 count += 1 644 645 if trace: 646 print(f"{num_words} words in text; {num_chars} code points") 647 648 def tolist(self): 649 return sorted( 650 ( 651 WordData(word, index, self.frequencies[word]) 652 for word, index in self.indices.items() 653 ), 654 key=lambda data: (-data.frequency, data.word), 655 ) 656 657 658######################################################################################## 659# Classes for database tables 660 661 662@dataclass(frozen=True) 663class ChangeRecord: 664 bidirectional: int 665 category: int 666 decimal: int 667 east_asian_width: int 668 mirrored: int 669 numeric: float 670 671 def __str__(self): 672 return f"\ 673{{{self.bidirectional:#04x}, {self.category:#04x}, {self.decimal:#04x}, \ 674{self.east_asian_width:#04x}, {self.mirrored:#04x}, {self.numeric}}}" 675 676 677@dataclass(frozen=True) 678class DatabaseRecord: 679 bidirectional: int 680 category: int 681 combining: int 682 east_asian_width: int 683 mirrored: bool 684 quick_check: int 685 686 def __str__(self): 687 mirrored = "true" if self.mirrored else "false" 688 return f"{{{self.bidirectional:#04x}, 
{self.category:#04x}, \ 689{self.combining:#04x}, {self.east_asian_width:#04x}, {mirrored}, \ 690{self.quick_check:#04x}}}" 691 692 693@dataclass(frozen=True) 694class NamedSequence: 695 name: int 696 seq: int 697 698 def __str__(self): 699 seq_str = ", ".join(f"{elt:#04x}" for elt in self.seq) 700 return f"{{{len(self.seq)}, {{{seq_str}}}}}" 701 702 703@dataclass(frozen=True) 704class Reindex: 705 start: int 706 count: int 707 index: int 708 709 def __str__(self): 710 return f"{{{self.start:#010x}, {self.count}, {self.index}}}" 711 712 713@dataclass(frozen=True) 714class TypeRecord: 715 decimal: int 716 digit: int 717 flags: int 718 lower: int 719 title: int 720 upper: int 721 722 def __str__(self): 723 return f"{{{self.decimal}, {self.digit}, {self.flags:#06x}, \ 724{self.lower:#x}, {self.title:#x}, {self.upper:#x}}}" 725 726 727class CodePointArray: 728 """Arrays of code points, printed as hex.""" 729 730 def __init__(self, name, data): 731 self.name = name 732 self.data = data 733 734 def dump(self, file, trace=False): 735 if trace: 736 sys.stderr.write(f"{self.name}: {4 * len(self.data)} bytes\n") 737 738 file.write(f"\nstatic const int32_t {self.name}[] = {{") 739 columns = (LINE_WIDTH - 3) // 12 740 for i in range(0, len(self.data), columns): 741 line = "\n " 742 for item in self.data[i : i + columns]: 743 line = f"{line} {item:#010x}," 744 file.write(line) 745 file.write("\n};\n") 746 747 748class SmallStrArray: 749 def __init__(self, name, data, comment=None): 750 self.name = name 751 self.data = data 752 self.comment = comment 753 754 def dump(self, file, trace=False): 755 if self.comment is not None: 756 file.write(f"\n// {self.comment}") 757 file.write(f"\nconst RawSmallStr {self.name}[] = {{") 758 759 if self.data: 760 file.write("\n") 761 for item in self.data: 762 file.write(f' RawSmallStr::fromCStr("{item}"),\n') 763 764 file.write("};\n") 765 766 767class StructArray: 768 """Store an array for writing to a generated C/C++ file.""" 769 770 def 
__init__(self, ctype, name, data, comment=None): 771 self.type = ctype 772 self.name = name 773 self.data = data 774 self.comment = comment 775 776 def dump(self, file, trace=False): 777 if self.comment is not None: 778 file.write(f"\n// {self.comment}") 779 file.write(f"\nstatic const {self.type} {self.name}[] = {{") 780 781 if self.data: 782 file.write("\n") 783 for item in self.data: 784 file.write(f" {item},\n") 785 786 file.write("};\n") 787 788 789class UIntArray: 790 """Unsigned integer arrays, printed as hex.""" 791 792 def __init__(self, name, data): 793 self.name = name 794 self.data = data 795 self.size = getsize(data) 796 797 def dump(self, file, trace=False): 798 if trace: 799 sys.stderr.write(f"{self.name}: {self.size * len(self.data)} bytes\n") 800 801 file.write(f"\nstatic const uint{8 * self.size}_t {self.name}[] = {{") 802 columns = (LINE_WIDTH - 3) // (2 * self.size + 4) 803 for i in range(0, len(self.data), columns): 804 line = "\n " 805 for item in self.data[i : i + columns]: 806 if self.size == 1: 807 line = f"{line} {item:#04x}," 808 elif self.size == 2: 809 line = f"{line} {item:#06x}," 810 else: 811 line = f"{line} {item:#010x}," 812 file.write(line) 813 file.write("\n};\n") 814 815 816######################################################################################## 817# Reimplementation of dictionaries using static structures and a custom string hash 818 819 820# Number chosen to minimize the number of collisions. CPython uses 47. 
821MAGIC = 43 822 823 824def myhash(s): 825 h = 0 826 for c in map(ord, s.upper()): 827 h = (h * MAGIC) + c 828 if h > 0x00FFFFFF: 829 h = (h & 0x00FFFFFF) ^ ((h & 0xFF000000) >> 24) 830 return h 831 832 833SIZES = [ 834 (4, 3), 835 (8, 3), 836 (16, 3), 837 (32, 5), 838 (64, 3), 839 (128, 3), 840 (256, 29), 841 (512, 17), 842 (1024, 9), 843 (2048, 5), 844 (4096, 83), 845 (8192, 27), 846 (16384, 43), 847 (32768, 3), 848 (65536, 45), 849 (131072, 9), 850 (262144, 39), 851 (524288, 39), 852 (1048576, 9), 853 (2097152, 5), 854 (4194304, 3), 855 (8388608, 33), 856 (16777216, 27), 857] 858 859 860class Hash: 861 def __init__(self, data): 862 """Turn a (key, value) list into a static hash table structure.""" 863 864 # determine table size 865 for size, poly in SIZES: 866 if size > len(data): 867 self.poly = poly + size 868 self.size = size 869 break 870 else: 871 raise AssertionError("ran out of polynomials") 872 873 print(self.size, "slots in hash table") 874 table = [None] * self.size 875 mask = self.size - 1 876 877 # initialize hash table 878 collisions = 0 879 for key, value in data: 880 h = myhash(key) 881 i = (~h) & mask 882 if table[i] is None: 883 table[i] = value 884 continue 885 incr = (h ^ (h >> 3)) & mask 886 if not incr: 887 incr = mask 888 while True: 889 collisions += 1 890 i = (i + incr) & mask 891 if table[i] is None: 892 table[i] = value 893 break 894 incr = incr << 1 895 if incr > mask: 896 incr = incr ^ self.poly 897 print(collisions, "collisions") 898 899 for i in range(len(table)): 900 if table[i] is None: 901 table[i] = 0 902 903 self.data = CodePointArray("kCodeHash", table) 904 905 def dump(self, file, trace): 906 self.data.dump(file, trace) 907 file.write( 908 f""" 909static const int32_t kCodeMagic = {MAGIC}; 910static const int32_t kCodeMask = {self.size - 1}; 911static const int32_t kCodePoly = {self.poly}; 912""" 913 ) 914 915 916######################################################################################## 917# Unicode database 
writers 918 919 920def write_header(unicode, header): 921 """Writes type declarations to the database header file.""" 922 923 header.write("// @") 924 header.write( 925 f"""\ 926generated by {os.path.basename(SCRIPT)} 927#pragma once 928 929#include <cstdint> 930 931#include "globals.h" 932#include "objects.h" 933#include "unicode.h" 934 935namespace py {{ 936 937static const int kMaxNameLength = 256; 938 939// Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA 940static const int kMaxDecomposition = {MAX_DECOMPOSITION}; 941 942static_assert(Unicode::kAliasStart == {NAME_ALIASES_START:#x}, 943 "Unicode aliases start at unexpected code point"); 944static_assert(Unicode::kAliasCount == {len(unicode.aliases)}, 945 "Unexpected number of Unicode aliases"); 946static_assert(Unicode::kNamedSequenceStart == {NAMED_SEQUENCES_START:#x}, 947 "Unicode named sequences start at unexpected code point"); 948static_assert(Unicode::kNamedSequenceCount == {len(unicode.named_sequences)}, 949 "Unexpected number of Unicode named sequences"); 950 951enum NormalizationForm : byte {{ 952 kInvalid = 0, 953 kNFD = 0x3, 954 kNFKD = 0xc, 955 kNFC = 0x30, 956 kNFKC = 0xc0, 957}}; 958 959enum : int32_t {{ 960 kAlphaMask = {ALPHA_MASK:#x}, 961 kDecimalMask = {DECIMAL_MASK:#x}, 962 kDigitMask = {DIGIT_MASK:#x}, 963 kLowerMask = {LOWER_MASK:#x}, 964 kLinebreakMask = {LINEBREAK_MASK:#x}, 965 kSpaceMask = {SPACE_MASK:#x}, 966 kTitleMask = {TITLE_MASK:#x}, 967 kUpperMask = {UPPER_MASK:#x}, 968 kXidStartMask = {XID_START_MASK:#x}, 969 kXidContinueMask = {XID_CONTINUE_MASK:#x}, 970 kPrintableMask = {PRINTABLE_MASK:#x}, 971 kNumericMask = {NUMERIC_MASK:#x}, 972 kCaseIgnorableMask = {CASE_IGNORABLE_MASK:#x}, 973 kCasedMask = {CASED_MASK:#x}, 974 kExtendedCaseMask = {EXTENDED_CASE_MASK:#x}, 975}}; 976 977struct UnicodeChangeRecord {{ 978 const byte bidirectional; 979 const byte category; 980 const byte decimal; 981 const byte east_asian_width; 982 const byte mirrored; 983 const double numeric; 
984}}; 985 986struct UnicodeDatabaseRecord {{ 987 const byte bidirectional; 988 const byte category; 989 const byte combining; // canonical combining class 990 const byte east_asian_width; 991 const bool mirrored; 992 const byte quick_check; 993}}; 994 995struct UnicodeDecomposition {{ 996 const char* prefix; 997 const int count; 998 const int32_t* code_points; 999}}; 1000 1001struct UnicodeNamedSequence {{ 1002 const byte length; 1003 const int32_t code_points[4]; 1004}}; 1005 1006struct UnicodeTypeRecord {{ 1007 // Note: if more flag space is needed, decimal and digit could be unified 1008 const int8_t decimal; 1009 const int8_t digit; 1010 const int16_t flags; 1011 // Deltas to the character or offsets in kExtendedCase 1012 const int32_t lower; 1013 const int32_t title; 1014 const int32_t upper; 1015}}; 1016 1017extern const RawSmallStr kBidirectionalNames[]; 1018extern const RawSmallStr kCategoryNames[]; 1019extern const RawSmallStr kEastAsianWidthNames[]; 1020 1021// Get a code point from its Unicode name. 1022// Returns the code point if the lookup succeeds, -1 if it fails. 1023int32_t codePointFromName(const byte* name, word size); 1024int32_t codePointFromNameOrNamedSequence(const byte* name, word size); 1025 1026// Returns the NFC composition given the NFC first and last indices. 1027int32_t composeCodePoint(int32_t first, int32_t last); 1028 1029// Returns the decomposition mapping of the code point. 1030UnicodeDecomposition decomposeCodePoint(int32_t code_point); 1031 1032// Returns the case mapping for code points where offset is insufficient 1033int32_t extendedCaseMapping(int32_t index); 1034 1035// Finds the first/last character of an NFC sequence containing the code point. 1036int32_t findNFCFirst(int32_t code_point); 1037int32_t findNFCLast(int32_t code_point); 1038 1039// Write the Unicode name for the given code point into the buffer. 1040// Returns true if the name was written successfully, false otherwise. 
1041bool nameFromCodePoint(int32_t code_point, byte* buffer, word size); 1042 1043// Returns the normalization of the code point in Unicode {OLD_VERSION}, if it differs 1044// from the current version. If the normalization is unchanged, returns -1. 1045int32_t normalizeOld(int32_t code_point); 1046 1047// Returns the numeric value of the code point, or -1.0 if not numeric. 1048double numericValue(int32_t code_point); 1049 1050// Returns true if the code point has one of the line break properties "BK", 1051// "CR", "LR", or "NL" or the bidirectional type "B". Returns false otherwise. 1052bool unicodeIsLinebreak(int32_t code_point); 1053 1054// Returns true if the code point has the bidirectional type "WS", "B", or "S" 1055// or the category "Zs". Returns false otherwise. 1056bool unicodeIsWhitespace(int32_t code_point); 1057 1058const UnicodeChangeRecord* changeRecord(int32_t code_point); 1059const UnicodeDatabaseRecord* databaseRecord(int32_t code_point); 1060const UnicodeNamedSequence* namedSequence(int32_t code_point); 1061const UnicodeTypeRecord* typeRecord(int32_t code_point); 1062 1063}} // namespace py 1064""" 1065 ) 1066 1067 1068def write_db_prelude(db): 1069 """Writes required include directives to the database file.""" 1070 1071 db.write("// @") 1072 db.write( 1073 f"""\ 1074generated by {os.path.basename(SCRIPT)} 1075#include "{HEADER}" 1076 1077#include <cinttypes> 1078#include <cstdio> 1079#include <cstring> 1080 1081#include "asserts.h" 1082#include "globals.h" 1083#include "objects.h" 1084#include "unicode.h" 1085 1086namespace py {{ 1087""" 1088 ) 1089 1090 1091def write_database_records(unicode, db, trace): # noqa: C901 1092 dummy = DatabaseRecord(0, 0, 0, 0, False, 0) 1093 table = [dummy] 1094 cache = {dummy: 0} 1095 index = [0] * NUM_CODE_POINTS 1096 1097 decomp_data = [0] 1098 decomp_prefix = [""] 1099 decomp_index = [0] * NUM_CODE_POINTS 1100 decomp_size = 0 1101 1102 comp_pairs = [] 1103 comp_first = [None] * NUM_CODE_POINTS 1104 comp_last = 
[None] * NUM_CODE_POINTS 1105 1106 for char in CODE_POINTS: 1107 record = unicode.table[char] 1108 if record: 1109 # database properties 1110 item = DatabaseRecord( 1111 BIDIRECTIONAL_NAMES.index(record[4]), 1112 CATEGORY_NAMES.index(record[2]), 1113 int(record[3]), 1114 EASTASIANWIDTH_NAMES.index(record[15]), 1115 record[9] == "Y", 1116 record[17], 1117 ) 1118 1119 idx = cache.get(item) 1120 if idx is None: 1121 cache[item] = idx = len(table) 1122 table.append(item) 1123 index[char] = idx 1124 1125 # decomposition data 1126 decomp_idx = 0 1127 if record[5]: 1128 decomp = record[5].split() 1129 prefix = decomp.pop(0) if decomp[0][0] == "<" else "" 1130 if len(decomp) > MAX_DECOMPOSITION: 1131 raise Exception( 1132 f"decomposition of code point {char:#x} is too large" 1133 ) 1134 1135 try: 1136 idx = decomp_prefix.index(prefix) 1137 except ValueError: 1138 idx = len(decomp_prefix) 1139 decomp_prefix.append(prefix) 1140 assert idx < 256 1141 1142 decomp = [idx + (len(decomp) << 8)] + [int(s, 16) for s in decomp] 1143 if ( 1144 not idx 1145 and len(decomp) == 3 1146 and char not in unicode.exclusions 1147 and unicode.table[decomp[1]][3] == "0" 1148 ): 1149 _, l, r = decomp 1150 comp_first[l] = 1 1151 comp_last[r] = 1 1152 comp_pairs.append((l, r, char)) 1153 try: 1154 decomp_idx = decomp_data.index(decomp) 1155 except ValueError: 1156 decomp_idx = len(decomp_data) 1157 decomp_data.extend(decomp) 1158 decomp_size = decomp_size + len(decomp) * 2 1159 decomp_index[char] = decomp_idx 1160 1161 first = last = 0 1162 comp_first_ranges = [] 1163 comp_last_ranges = [] 1164 prev_first = prev_last = None 1165 for ch in CODE_POINTS: 1166 if comp_first[ch] is not None: 1167 comp_first[ch] = first 1168 first += 1 1169 if prev_first is None: 1170 prev_first = (ch, ch) 1171 elif prev_first[1] + 1 == ch: 1172 prev_first = prev_first[0], ch 1173 else: 1174 comp_first_ranges.append(prev_first) 1175 prev_first = (ch, ch) 1176 if comp_last[ch] is not None: 1177 comp_last[ch] = last 1178 
last += 1 1179 if prev_last is None: 1180 prev_last = (ch, ch) 1181 elif prev_last[1] + 1 == ch: 1182 prev_last = prev_last[0], ch 1183 else: 1184 comp_last_ranges.append(prev_last) 1185 prev_last = (ch, ch) 1186 comp_first_ranges.append(prev_first) 1187 comp_last_ranges.append(prev_last) 1188 total_first = first 1189 total_last = last 1190 1191 comp_data = [0] * (total_first * total_last) 1192 for first, last, char in comp_pairs: 1193 first = comp_first[first] 1194 last = comp_last[last] 1195 comp_data[first * total_last + last] = char 1196 1197 if trace: 1198 print(len(table), "unique properties") 1199 print(len(decomp_prefix), "unique decomposition prefixes") 1200 print(len(decomp_data), "unique decomposition entries:", end=" ") 1201 print(decomp_size, "bytes") 1202 print(total_first, "first characters in NFC") 1203 print(total_last, "last characters in NFC") 1204 print(len(comp_pairs), "NFC pairs") 1205 1206 StructArray( 1207 "UnicodeDatabaseRecord", 1208 "kDatabaseRecords", 1209 table, 1210 "a list of unique database records", 1211 ).dump(db, trace) 1212 1213 # split record index table 1214 index1, index2, shift = splitbins(index, trace) 1215 db.write( 1216 f""" 1217// type indices 1218static const int kDatabaseIndexShift = {shift}; 1219static const int32_t kDatabaseIndexMask = 1220 (int32_t{{1}} << kDatabaseIndexShift) - 1; 1221""" 1222 ) 1223 UIntArray("kDatabaseIndex1", index1).dump(db, trace) 1224 UIntArray("kDatabaseIndex2", index2).dump(db, trace) 1225 1226 db.write( 1227 f""" 1228// Reindexing of NFC first and last characters 1229struct Reindex {{ 1230 const int32_t start; 1231 const short count; 1232 const short index; 1233}}; 1234 1235static const int kTotalLast = {total_last}; 1236""" 1237 ) 1238 1239 nfc_first = [ 1240 Reindex(start, end - start, comp_first[start]) 1241 for start, end in comp_first_ranges 1242 ] + [Reindex(0, 0, 0)] 1243 nfc_last = [ 1244 Reindex(start, end - start, comp_last[start]) for start, end in comp_last_ranges 1245 ] + 
def write_name_data(unicode, db, trace):  # noqa: C901
    """Writes the code point name tables to the database file.

    Emits three related structures:
      * a lexicon of unique words plus per-word offsets,
      * a phrasebook mapping each code point to a sequence of word indexes,
      * a name => code point hash table, followed by the alias and
        named-sequence tables.

    Args:
        unicode: the merged UnicodeData instance being dumped.
        db: writable text file receiving the generated C++ source.
        trace: when True, the table writers print statistics.
    """
    # Collect names. record[1] is the Name field from UnicodeData.txt;
    # bracketed labels such as "<control>" are not real names and are skipped.
    names = [None] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                # The trailing NUL marks the end of the whole name; the word
                # splitter distinguishes name-final words by it (see below).
                names[char] = f"{name}\0"

    if trace:
        print(len([n for n in names if n is not None]), "distinct names")

    # Collect unique words from names. Note that we distinguish between
    # words ending a sentence, which include the trailing null byte,
    # and words inside a sentence, which do not.
    words = Words(names, trace)
    wordlist = words.tolist()

    # Figure out how many phrasebook escapes we need. Word indexes below
    # `short` fit in a single phrasebook byte; the remaining byte values
    # act as escapes selecting the two-byte form below.
    short = (65536 - len(wordlist)) // 256
    assert short > 0

    if trace:
        print(short, "short indexes in lexicon")
        n = sum(wordlist[i].frequency for i in range(short))
        print(n, "short indexes in phrasebook")

    # Pick the most commonly used words, and sort the rest by falling
    # length to maximize overlap.
    tail = wordlist[short:]
    tail.sort(key=lambda data: len(data.word), reverse=True)
    wordlist[short:] = tail

    # Build a lexicon string from words
    current_offset = 0
    lexicon_offset = [0]
    lexicon_string = ""
    word_offsets = {}
    for data in wordlist:
        # Encoding: bit 7 indicates last character in word
        # chr(128) indicates the last character in an entire string)
        last = ord(data.word[-1])
        encoded = f"{data.word[:-1]}{chr(last + 128)}"

        # reuse string tails, when possible
        offset = lexicon_string.find(encoded)
        if offset < 0:
            offset = current_offset
            lexicon_string = lexicon_string + encoded
            current_offset += len(encoded)

        # Offset slot 0 is reserved, so word numbering starts at 1.
        word_offsets[data.word] = len(lexicon_offset)
        lexicon_offset.append(offset)

    lexicon = [ord(ch) for ch in lexicon_string]

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        name = names[char]
        if name is not None:
            words = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in words:
                offset = word_offsets[word]
                if offset < short:
                    phrasebook.append(offset)
                else:
                    # store as two bytes
                    phrasebook.append((offset >> 8) + short)
                    phrasebook.append(offset & 255)

    # Every phrasebook entry must fit in one byte (getsize presumably
    # returns the element byte width — TODO confirm against its definition).
    assert getsize(phrasebook) == 1

    db.write(f"\n// lexicon")
    UIntArray("kLexicon", lexicon).dump(db, trace)
    UIntArray("kLexiconOffset", lexicon_offset).dump(db, trace)

    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    db.write(
        f"""
// code => name phrasebook
static const int kPhrasebookShift = {shift};
static const int kPhrasebookShort = {short};
static const int kPhrasebookMask = (1 << kPhrasebookShift) - 1;
"""
    )
    UIntArray("kPhrasebook", phrasebook).dump(db, trace)
    UIntArray("kPhrasebookOffset1", offset1).dump(db, trace)
    UIntArray("kPhrasebookOffset2", offset2).dump(db, trace)

    # Extract names for name hash table, using the same filtering as the
    # name collection loop above.
    hash_data = []
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                hash_data.append((name, char))

    db.write("\n// name => code dictionary")
    Hash(hash_data).dump(db, trace)

    aliases = [codepoint for _, codepoint in unicode.aliases]
    UIntArray("kNameAliases", aliases).dump(db, trace)

    StructArray(
        "UnicodeNamedSequence", "kNamedSequences", unicode.named_sequences
    ).dump(db, trace)
def write_type_data(unicode, db, trace):  # noqa: C901
    """Writes Unicode character type tables to the database file.

    Builds one deduplicated TypeRecord per distinct combination of
    (decimal, digit, flags, lower, title, upper), a two-level index from
    code point to record, the extended-case array for one-to-many case
    mappings, and C++ switch functions for numeric values, linebreaks,
    and whitespace.

    Args:
        unicode: the merged UnicodeData instance being dumped.
        db: writable text file receiving the generated C++ source.
        trace: when True, print table statistics.
    """

    # extract unicode types
    dummy = TypeRecord(0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Fix: the cache maps record -> table index (it was previously
    # initialized backwards as {0: dummy}, so an all-zero record would be
    # appended as a duplicate of the dummy instead of reusing index 0).
    # This matches the cache initialization in write_change_data.
    cache = {dummy: 0}
    index = [0] * NUM_CODE_POINTS
    numeric = {}
    spaces = []
    linebreaks = []
    extended_cases = []

    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            # extract database properties. Field numbering follows
            # UnicodeData.txt (record[2] = general category, record[4] =
            # bidirectional class); record[16] presumably holds derived
            # properties merged in by the reader — TODO confirm.
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # TODO(T55176519): delta = True
            if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if "Line_Break" in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # Printable: everything except Other (C*) and Separator (Z*)
            # categories, with U+0020 SPACE as the single exception.
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # record[12..14] are the simple upper/lower/title mappings;
            # empty fields mean the character maps to itself (title
            # defaults to the uppercase mapping).
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # A case folding different from the simple lowercase mapping
            # forces the extended (one-to-many) representation below.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # Store simple mappings as deltas from the code point.
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (
                        abs(upper) <= 2147483647
                        and abs(lower) <= 2147483647
                        and abs(title) <= 2147483647
                    )
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extended_cases) | (len(sc[0]) << 24)
                extended_cases.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extended_cases.extend(cf)
                upper = len(extended_cases) | (len(sc[2]) << 24)
                extended_cases.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extended_cases) | (len(sc[1]) << 24)
                    extended_cases.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = TypeRecord(decimal, digit, flags, lower, title, upper)
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # Statistics are now guarded by `trace` for consistency with the other
    # table writers (write_database_records, write_name_data).
    if trace:
        print(len(table), "unique character type entries")
        print(sum(map(len, numeric.values())), "numeric code points")
        print(len(spaces), "whitespace code points")
        print(len(linebreaks), "linebreak code points")
        print(len(extended_cases), "extended case array")

    StructArray(
        "UnicodeTypeRecord",
        "kTypeRecords",
        table,
        "a list of unique character type descriptors",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kTypeIndexShift = {shift};
static const int32_t kTypeIndexMask = (int32_t{{1}} << kTypeIndexShift) - 1;
"""
    )
    UIntArray("kTypeIndex1", index1).dump(db, trace)
    UIntArray("kTypeIndex2", index2).dump(db, trace)

    # extended case mappings
    CodePointArray("kExtendedCase", extended_cases).dump(db, trace)

    db.write(
        """
double numericValue(int32_t code_point) {
  switch (code_point) {"""
    )
    for value, codepoints in sorted(numeric.items()):
        # Values may be fractions like "1/4"; emit them as C++ division so
        # the compiler computes the double.
        parts = value.split("/")
        value = " / ".join(repr(float(part)) for part in parts)

        codepoints.sort()
        for codepoint in codepoints:
            db.write(f"\n    case {codepoint:#08x}:")
        db.write(f"\n      return {value};")
    db.write(
        """
    default:
      return -1.0;
  }
}

bool unicodeIsLinebreak(int32_t code_point) {
  switch (code_point) {"""
    )
    for codepoint in sorted(linebreaks):
        db.write(f"\n    case {codepoint:#08x}:")
    db.write(
        """
      return true;
    default:
      return false;
  }
}

bool unicodeIsWhitespace(int32_t code_point) {
  switch (code_point) {"""
    )
    for codepoint in sorted(spaces):
        db.write(f"\n    case {codepoint:#08x}:")
    db.write(
        """
      return true;
    default:
      return false;
  }
}
"""
    )
def write_change_data(unicode, db, trace):
    """Writes the Unicode 3.2 delta tables to the database file.

    Emits a deduplicated array of ChangeRecords describing how each code
    point's properties changed between the old and current Unicode
    versions, a two-level index into that array, and a C++ switch mapping
    code points whose normalization changed.

    Args:
        unicode: the merged UnicodeData instance (carries change_records
            and normalization_changes from the old version).
        db: writable text file receiving the generated C++ source.
        trace: when True, the array writers print statistics.
    """
    placeholder = ChangeRecord(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0.0)
    records = [placeholder]
    seen = {placeholder: 0}
    index = [0] * NUM_CODE_POINTS

    # Deduplicate the per-code-point deltas for Unicode 3.2: each distinct
    # record is stored once; the index maps code points to record slots.
    for char, fields in enumerate(unicode.change_records):
        entry = ChangeRecord(*fields)
        if entry not in seen:
            seen[entry] = len(records)
            records.append(entry)
        index[char] = seen[entry]

    StructArray(
        "UnicodeChangeRecord",
        "kChangeRecords",
        records,
        f"deltas from Unicode {OLD_VERSION} to the current version",
    ).dump(db, trace)

    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
static const int kChangeIndexShift = {shift};
static const int kChangeIndexMask = (1 << kChangeIndexShift) - 1;
"""
    )
    UIntArray("kChangeIndex1", index1).dump(db, trace)
    UIntArray("kChangeIndex2", index2).dump(db, trace)

    # Emit one case per code point whose normalization changed since 3.2.
    db.write(
        """
int32_t normalizeOld(int32_t code_point) {
  switch (code_point) {"""
    )
    for code, replacement in unicode.normalization_changes:
        db.write(
            f"""
    case {code:#x}:
      return 0x{replacement.lower()};"""
        )
    db.write(
        """
    default:
      return -1;
  }
}
"""
    )
def write_string_literals(db, trace):
    """Writes the short string-name arrays used by the database.

    Args:
        db: writable text file receiving the generated C++ source.
        trace: when True, the array writers print statistics.
    """
    # One C++ array per (name, strings) pair, emitted in a fixed order.
    arrays = (
        ("kBidirectionalNames", BIDIRECTIONAL_NAMES),
        ("kCategoryNames", CATEGORY_NAMES),
        ("kEastAsianWidthNames", EASTASIANWIDTH_NAMES),
    )
    for cpp_name, strings in arrays:
        SmallStrArray(cpp_name, strings).dump(db, trace)
def write_db_coda(db):
    """Writes the hand-written C++ code that closes the database file.

    Everything here is emitted verbatim except the CJK ideograph range
    checks, which are generated from CJK_RANGES. The emitted code covers:
    composition/decomposition lookups, NFC first/last searches, Hangul
    syllable name composition/parsing, the name <-> code point hash
    probing, and the record accessor functions.

    Args:
        db: writable text file receiving the generated C++ source.
    """
    db.write(
        """
int32_t composeCodePoint(int32_t first, int32_t last) {
  int32_t i = first * kTotalLast + last;
  int32_t j = kCompIndex[i >> kCompShift];
  return kCompData[(j << kCompShift) + (i & kCompMask)];
}

UnicodeDecomposition decomposeCodePoint(int32_t code_point) {
  DCHECK_BOUND(code_point, kMaxUnicode);
  uint16_t offset = kDecompIndex1[code_point >> kDecompShift];
  offset = kDecompIndex2[(offset << kDecompShift) + (code_point & kDecompMask)];

  int32_t decomp = kDecompData[offset];
  int32_t count = decomp >> kBitsPerByte;
  int8_t prefix = decomp & kMaxByte;
  return {kDecompPrefix[prefix], count, &kDecompData[offset + 1]};
}

static int32_t findNFC(const Reindex* nfc, int32_t code_point) {
  for (word index = 0; nfc[index].start != 0; index++) {
    int32_t delta = code_point - nfc[index].start;
    if (delta < 0) {
      return -1;
    }
    if (delta <= nfc[index].count) {
      return delta + nfc[index].index;
    }
  }
  return -1;
}

int32_t findNFCFirst(int32_t code_point) {
  return findNFC(kNFCFirst, code_point);
}

int32_t findNFCLast(int32_t code_point) {
  return findNFC(kNFCLast, code_point);
}

struct HangulJamo {
  const int length;
  const byte name[3];
};

static const HangulJamo kHangulLeads[] = {
    {1, {'G'}}, {2, {'G', 'G'}}, {1, {'N'}}, {1, {'D'}},
    {2, {'D', 'D'}}, {1, {'R'}}, {1, {'M'}}, {1, {'B'}},
    {2, {'B', 'B'}}, {1, {'S'}}, {2, {'S', 'S'}}, {0},
    {1, {'J'}}, {2, {'J', 'J'}}, {1, {'C'}}, {1, {'K'}},
    {1, {'T'}}, {1, {'P'}}, {1, {'H'}},
};

static const HangulJamo kHangulVowels[] = {
    {1, {'A'}}, {2, {'A', 'E'}}, {2, {'Y', 'A'}},
    {3, {'Y', 'A', 'E'}}, {2, {'E', 'O'}}, {1, {'E'}},
    {3, {'Y', 'E', 'O'}}, {2, {'Y', 'E'}}, {1, {'O'}},
    {2, {'W', 'A'}}, {3, {'W', 'A', 'E'}}, {2, {'O', 'E'}},
    {2, {'Y', 'O'}}, {1, {'U'}}, {3, {'W', 'E', 'O'}},
    {2, {'W', 'E'}}, {2, {'W', 'I'}}, {2, {'Y', 'U'}},
    {2, {'E', 'U'}}, {2, {'Y', 'I'}}, {1, {'I'}},
};

static const HangulJamo kHangulTrails[] = {
    {0},
    {1, {'G'}},
    {2, {'G', 'G'}},
    {2, {'G', 'S'}},
    {1, {'N'}},
    {2, {'N', 'J'}},
    {2, {'N', 'H'}},
    {1, {'D'}},
    {1, {'L'}},
    {2, {'L', 'G'}},
    {2, {'L', 'M'}},
    {2, {'L', 'B'}},
    {2, {'L', 'S'}},
    {2, {'L', 'T'}},
    {2, {'L', 'P'}},
    {2, {'L', 'H'}},
    {1, {'M'}},
    {1, {'B'}},
    {2, {'B', 'S'}},
    {1, {'S'}},
    {2, {'S', 'S'}},
    {2, {'N', 'G'}},
    {1, {'J'}},
    {1, {'C'}},
    {1, {'K'}},
    {1, {'T'}},
    {1, {'P'}},
    {1, {'H'}},
};

static_assert(ARRAYSIZE(kHangulLeads) == Unicode::kHangulLeadCount,
              "mismatch in number of Hangul lead jamo");
static_assert(ARRAYSIZE(kHangulVowels) == Unicode::kHangulVowelCount,
              "mismatch in number of Hangul vowel jamo");
static_assert(ARRAYSIZE(kHangulTrails) == Unicode::kHangulTrailCount,
              "mismatch in number of Hangul trail jamo");

static int32_t findHangulJamo(const byte* str, word size,
                              const HangulJamo array[], word count) {
  word result = -1;
  int length = -1;
  for (word i = 0; i < count; i++) {
    HangulJamo jamo = array[i];
    if (length < jamo.length && jamo.length <= size &&
        (std::memcmp(str, jamo.name, jamo.length) == 0)) {
      length = jamo.length;
      result = i;
    }
  }
  return result;
}

static int32_t findHangulSyllable(const byte* name, word size) {
  word pos = 16;
  word lead = findHangulJamo(name + pos, size - pos, kHangulLeads,
                             Unicode::kHangulLeadCount);
  if (lead == -1) {
    return -1;
  }

  pos += kHangulLeads[lead].length;
  word vowel = findHangulJamo(name + pos, size - pos, kHangulVowels,
                              Unicode::kHangulVowelCount);
  if (vowel == -1) {
    return -1;
  }

  pos += kHangulVowels[vowel].length;
  word trail = findHangulJamo(name + pos, size - pos, kHangulTrails,
                              Unicode::kHangulTrailCount);
  if (trail == -1 || pos + kHangulTrails[trail].length != size) {
    return -1;
  }

  return Unicode::kHangulSyllableStart +
         (lead * Unicode::kHangulVowelCount + vowel) *
             Unicode::kHangulTrailCount +
         trail;
}

static bool isUnifiedIdeograph(int32_t code_point) {
  return """
    )

    # Generate one range test per CJK block; CJK_RANGES comes from the
    # UnicodeData reader. The continuation indent aligns under the first
    # range expression in the emitted C++.
    for i, cjk in enumerate(CJK_RANGES):
        if i > 0:
            db.write(" ||\n         ")
        db.write(f"({cjk.start:#010x} <= code_point && code_point <= {cjk.end:#010x})")

    db.write(
        """;
}

static uint32_t myHash(const byte* name, word size) {
  uint32_t hash = 0;
  for (word i = 0; i < size; i++) {
    hash = (hash * kCodeMagic) + ASCII::toUpper(name[i]);
    if (hash > 0x00ffffff) {
      hash = (hash & 0x00ffffff) ^ ((hash & 0xff000000) >> 24);
    }
  }
  return hash;
}

static bool nameMatchesCodePoint(int32_t code_point, const byte* name,
                                 word size) {
  byte buffer[kMaxNameLength + 1];
  if (!nameFromCodePoint(code_point, buffer, kMaxNameLength)) {
    return false;
  }
  for (word i = 0; i < size; i++) {
    if (ASCII::toUpper(name[i]) != buffer[i]) {
      return false;
    }
  }
  return buffer[size] == '\\0';
}

static int32_t getCodePoint(const byte* name, word size,
                            int32_t (*check)(int32_t)) {
  if (size >= 16 && std::memcmp(name, "HANGUL SYLLABLE ", 16) == 0) {
    return findHangulSyllable(name, size);
  }

  if (size >= 22 && std::memcmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
    // Four or five uppercase hexdigits must follow
    name += 22;
    size -= 22;
    if (size != 4 && size != 5) {
      return -1;
    }

    int32_t result = 0;
    while (size--) {
      result *= 16;
      if ('0' <= *name && *name <= '9') {
        result += *name - '0';
      } else if ('A' <= *name && *name <= 'F') {
        result += *name - 'A' + 10;
      } else {
        return -1;
      }
      name++;
    }
    if (!isUnifiedIdeograph(result)) {
      return -1;
    }
    return result;
  }

  uint32_t hash = myHash(name, size);
  uint32_t step = (hash ^ (hash >> 3)) & kCodeMask;
  if (step == 0) {
    step = kCodeMask;
  }

  for (uint32_t idx = (~hash) & kCodeMask;; idx = (idx + step) & kCodeMask) {
    int32_t result = kCodeHash[idx];
    if (result == 0) {
      return -1;
    }
    if (nameMatchesCodePoint(result, name, size)) {
      return check(result);
    }
    step = step << 1;
    if (step > kCodeMask) {
      step = step ^ kCodePoly;
    }
  }
}

static int32_t checkAliasAndNamedSequence(int32_t code_point) {
  if (Unicode::isNamedSequence(code_point)) {
    return -1;
  }
  if (Unicode::isAlias(code_point)) {
    return kNameAliases[code_point - Unicode::kAliasStart];
  }
  return code_point;
}

int32_t codePointFromName(const byte* name, word size) {
  return getCodePoint(name, size, checkAliasAndNamedSequence);
}

static int32_t checkAlias(int32_t code_point) {
  if (Unicode::isAlias(code_point)) {
    return kNameAliases[code_point - Unicode::kAliasStart];
  }
  return code_point;
}

int32_t codePointFromNameOrNamedSequence(const byte* name, word size) {
  return getCodePoint(name, size, checkAlias);
}

int32_t extendedCaseMapping(int32_t index) {
  return kExtendedCase[index];
}

bool nameFromCodePoint(int32_t code_point, byte* buffer, word size) {
  DCHECK_BOUND(code_point, kMaxUnicode);

  if (Unicode::isHangulSyllable(code_point)) {
    if (size < 27) {
      return false;  // worst case: HANGUL SYLLABLE <10 chars>
    }

    int32_t offset = code_point - Unicode::kHangulSyllableStart;
    HangulJamo lead = kHangulLeads[offset / Unicode::kHangulCodaCount];
    HangulJamo vowel = kHangulVowels[(offset % Unicode::kHangulCodaCount) /
                                     Unicode::kHangulTrailCount];
    HangulJamo trail = kHangulTrails[offset % Unicode::kHangulTrailCount];

    std::memcpy(buffer, "HANGUL SYLLABLE ", 16);
    buffer += 16;
    std::memcpy(buffer, lead.name, lead.length);
    buffer += lead.length;
    std::memcpy(buffer, vowel.name, vowel.length);
    buffer += vowel.length;
    std::memcpy(buffer, trail.name, trail.length);
    buffer[trail.length] = '\\0';
    return true;
  }

  if (isUnifiedIdeograph(code_point)) {
    if (size < 28) {
      return false;  // worst case: CJK UNIFIED IDEOGRAPH-*****
    }
    std::snprintf(reinterpret_cast<char*>(buffer), size,
                  "CJK UNIFIED IDEOGRAPH-%" PRIX32, code_point);
    return true;
  }

  uint32_t offset = kPhrasebookOffset1[code_point >> kPhrasebookShift];
  offset = kPhrasebookOffset2[(offset << kPhrasebookShift) +
                              (code_point & kPhrasebookMask)];

  if (offset == 0) {
    return false;
  }

  for (word i = 0;;) {
    if (i > 0) {
      if (i >= size) {
        return false;  // buffer overflow
      }
      buffer[i++] = ' ';
    }

    // get word index
    word word_idx = kPhrasebook[offset] - kPhrasebookShort;
    if (word_idx >= 0) {
      word_idx = (word_idx << 8) + kPhrasebook[offset + 1];
      offset += 2;
    } else {
      word_idx = kPhrasebook[offset++];
    }

    // copy word string from lexicon
    const byte* word_ptr = kLexicon + kLexiconOffset[word_idx];
    while (*word_ptr < 128) {
      if (i >= size) {
        return false;  // buffer overflow
      }
      buffer[i++] = *word_ptr++;
    }

    if (i >= size) {
      return false;  // buffer overflow
    }
    buffer[i++] = *word_ptr & 0x7f;
    if (*word_ptr == 0x80) {
      return true;
    }
  }
}

const UnicodeChangeRecord* changeRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kChangeRecords[0];
  }

  int index = kChangeIndex1[code_point >> kChangeIndexShift];
  index <<= kChangeIndexShift;
  index = kChangeIndex2[index + (code_point & kChangeIndexMask)];
  return &kChangeRecords[index];
}

const UnicodeDatabaseRecord* databaseRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kDatabaseRecords[0];
  }

  int index = kDatabaseIndex1[code_point >> kDatabaseIndexShift];
  index <<= kDatabaseIndexShift;
  index = kDatabaseIndex2[index + (code_point & kDatabaseIndexMask)];
  return &kDatabaseRecords[index];
}

const UnicodeNamedSequence* namedSequence(int32_t code_point) {
  DCHECK(Unicode::isNamedSequence(code_point), "not a named sequence");
  return &kNamedSequences[code_point - Unicode::kNamedSequenceStart];
}

const UnicodeTypeRecord* typeRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kTypeRecords[0];
  }

  int index = kTypeIndex1[code_point >> kTypeIndexShift];
  index <<= kTypeIndexShift;
  index = kTypeIndex2[index + (code_point & kTypeIndexMask)];
  return &kTypeRecords[index];
}

}  // namespace py
"""
    )
def write_db(trace=False):
    """Generates the Unicode database header and C++ source files.

    Reads the current and old UnicodeData databases, merges the old
    version's deltas into the current one, then writes HEADER_PATH and
    DB_PATH via the individual table writers.

    Args:
        trace: when True, the table writers print statistics about the
            generated tables.
    """
    print(f"--- Reading {UNICODE_DATA} v{UNIDATA_VERSION} ...")
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    print(f"--- Reading {UNICODE_DATA} v{OLD_VERSION} ...")
    old_unicode = UnicodeData(OLD_VERSION, check_cjk=False)
    print(len(list(filter(None, old_unicode.table))), "characters")
    unicode.merge(old_unicode)

    # Pin encoding and line endings so the generated sources are
    # byte-identical across platforms and locales (the default text mode
    # would use the locale encoding and CRLF newlines on Windows).
    with open(HEADER_PATH, "w", encoding="utf-8", newline="\n") as header:
        write_header(unicode, header)

    with open(DB_PATH, "w", encoding="utf-8", newline="\n") as db:
        write_db_prelude(db)
        write_database_records(unicode, db, trace)
        write_name_data(unicode, db, trace)
        write_type_data(unicode, db, trace)
        write_change_data(unicode, db, trace)
        write_string_literals(db, trace)
        write_db_coda(db)
if __name__ == "__main__":
    # Run the generator with statistics tracing enabled when executed
    # directly as a script.
    write_db(trace=True)