# this repo has no description
1#!/usr/bin/env python3
2# Portions copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
3"""Generate Unicode name, property, and type databases.
4
5This script converts Unicode 3.2 and 11.0 database files into a Pyro-specific format.
6Run the script using Python version >= 3.7, from this directory, without args.
7
8The algorithm was taken from Fredrik Lundh's script for CPython.
9"""
10
11import os
12import sys
13import zipfile
14from dataclasses import dataclass
15
16
# Paths are resolved relative to this script's location: PYRO_DIR is two
# levels above the script file, and the generated sources land in runtime/.
SCRIPT = os.path.relpath(__file__)
PYRO_DIR = os.path.normpath(os.path.join(SCRIPT, "..", ".."))
RUNTIME_DIR = os.path.join(PYRO_DIR, "runtime")
# Generated output files.
HEADER = "unicode-db.h"
HEADER_PATH = os.path.join(RUNTIME_DIR, HEADER)
DB_PATH = os.path.join(RUNTIME_DIR, "unicode-db.cpp")
# Maximum width of lines emitted into the generated C++ sources.
LINE_WIDTH = 80

# 0x110000 code points in total; CODE_POINTS iterates all of them.
NUM_CODE_POINTS = sys.maxunicode + 1
CODE_POINTS = range(NUM_CODE_POINTS)

# Current UCD version for the tables, and the older version whose differences
# are recorded by UnicodeData.merge().
UNIDATA_VERSION = "11.0.0"
OLD_VERSION = "3.2.0"
30
# UCD files (names as published on unicode.org; opened via open_data below)
CASE_FOLDING = "CaseFolding.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"
LINE_BREAK = "LineBreak.txt"
NAME_ALIASES = "NameAliases.txt"
NAMED_SEQUENCES = "NamedSequences.txt"
SPECIAL_CASING = "SpecialCasing.txt"
UNICODE_DATA = "UnicodeData.txt"
UNIHAN = "Unihan.zip"

# Private Use Areas -- in the BMP and in planes 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Longest decomposition in Unicode 11.0.0 (UNIDATA_VERSION): U+FDFA
MAX_DECOMPOSITION = 18
55
# General category abbreviations, indexed by the numeric category code stored
# in the database records. Index 0 ("Cn") is the "unassigned" category (see
# UnicodeData.merge); "Cn" appears a second time at index 17 — presumably to
# preserve a historical table layout (TODO confirm against the runtime).
CATEGORY_NAMES = [
    "Cn",
    "Lu",
    "Ll",
    "Lt",
    "Mn",
    "Mc",
    "Me",
    "Nd",
    "Nl",
    "No",
    "Zs",
    "Zl",
    "Zp",
    "Cc",
    "Cf",
    "Cs",
    "Co",
    "Cn",
    "Lm",
    "Lo",
    "Pc",
    "Pd",
    "Ps",
    "Pe",
    "Pi",
    "Pf",
    "Po",
    "Sm",
    "Sc",
    "Sk",
    "So",
]

# Bidirectional class abbreviations, indexed by the numeric code stored in the
# database records; index 0 is the empty/unknown class.
BIDIRECTIONAL_NAMES = [
    "",
    "L",
    "LRE",
    "LRO",
    "R",
    "AL",
    "RLE",
    "RLO",
    "PDF",
    "EN",
    "ES",
    "ET",
    "AN",
    "CS",
    "NSM",
    "BN",
    "B",
    "S",
    "WS",
    "ON",
    "LRI",
    "RLI",
    "FSI",
    "PDI",
]

# East Asian Width abbreviations, indexed by the numeric code stored in the
# database records.
EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

# Line_Break classes treated as mandatory breaks (see the LineBreak.txt
# parsing in UnicodeData.__init__).
MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

# Character type flags. The same values are emitted into the generated
# header's anonymous enum (see write_header), so generator and runtime agree.
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
136
137
def open_data(filename, version):
    """Open a UCD file for the given Unicode version.

    Looks for the file under third-party/ucd-<version>/. If it is missing,
    raises OSError with the URL it can be downloaded from. ".txt" files are
    opened in UTF-8 text mode; anything else (Unihan.zip) in binary mode.
    """
    ucd_dir = os.path.join(PYRO_DIR, "third-party", f"ucd-{version}")
    if not os.path.exists(ucd_dir):
        os.mkdir(ucd_dir)

    path = os.path.join(ucd_dir, filename)
    if not os.path.exists(path):
        if version == OLD_VERSION:
            # irregular url structure for the 3.2 update release.
            # Use separate names (root/ext) so `path` stays intact for the
            # error message below; it was previously clobbered here.
            root, ext = os.path.splitext(filename)
            url = f"http://www.unicode.org/Public/3.2-Update/{root}-{version}{ext}"
        else:
            # was emitted as a literal "(unknown)"; the UCD URL ends with the
            # actual file name
            url = f"http://www.unicode.org/Public/{version}/ucd/{filename}"
        raise OSError(
            f"""Cannot open {path}: file does not exist.
Find it online at {url}"""
        )

    if path.endswith(".txt"):
        return open(path, encoding="utf-8")
    else:
        # Unihan.zip
        return open(path, "rb")
161
162
163########################################################################################
164# CJK ranges
165
166
@dataclass(frozen=True)
class CJKRange:
    """Inclusive range [start, end] of a CJK Unified Ideographs block."""

    start: int
    end: int


# Known CJK ideograph ranges in this UCD version. UnicodeData.__init__
# collects the <CJK Ideograph ... First>/<... Last> ranges it parses and
# raises ValueError if they deviate from this set.
CJK_RANGES = (
    CJKRange(0x3400, 0x4DB5),
    CJKRange(0x4E00, 0x9FEF),
    CJKRange(0x20000, 0x2A6D6),
    CJKRange(0x2A700, 0x2B734),
    CJKRange(0x2B740, 0x2B81D),
    CJKRange(0x2B820, 0x2CEA1),
    CJKRange(0x2CEB0, 0x2EBE0),
)
182
183
184########################################################################################
185# the following support code is taken from the unidb utilities
186# Copyright (c) 1999-2000 by Secret Labs AB
187
188# load a unicode-data file from disk
189
190
class UnicodeData:
    """Loads a Unicode data file, checking disk first and then internet.

    Taken from the unidb utilities.
    Copyright (c) 1999-2000 by Secret Labs AB
    """

    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    # derived-props] (17)

    def __init__(self, version, check_cjk=True):  # noqa: C901
        """Parse the UCD files for `version`.

        Each self.table entry is the semicolon-split UnicodeData.txt record
        for that code point (None if unassigned), extended below with the
        east-asian width (index 15), a set of derived properties (index 16),
        and the quickcheck bits (index 17).
        """
        self.version = version

        self.aliases = []
        self.case_folding = {}
        self.change_records = []
        self.exclusions = set()
        self.named_sequences = []
        self.normalization_changes = []
        self.special_casing = {}
        self.table = [None] * NUM_CODE_POINTS

        with open_data(UNICODE_DATA, version) as file:
            while True:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                self.table[char] = s

        # expand first-last ranges
        field = None
        cjk_ranges = []
        for char in CODE_POINTS:
            s = self.table[char]
            if s is not None:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    if check_cjk and s[1].startswith("<CJK Ideograph"):
                        start = int(field[0], 16)
                        end = int(s[0], 16)
                        cjk_ranges.append(CJKRange(start, end))
                    s[1] = ""
                    field = None
            elif field:
                # inside a First>/Last> range: clone the First record for
                # this code point, patching in its own code point value
                f2 = field[:]
                f2[0] = f"{char:X}"
                self.table[char] = f2

        if check_cjk and set(cjk_ranges) != set(CJK_RANGES):
            raise ValueError(f"CJK ranges deviate: found {cjk_ranges!r}")

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != OLD_VERSION:
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith("#"):
                        continue
                    char, name, abbrev = s.split(";")
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, starting at U+F0200
            # (NAMED_SEQUENCES_START), in order to take advantage of the
            # compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith("#"):
                        continue
                    name, chars = s.split(";")
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in write_header is OK
                    assert len(chars) <= 4, "change the NamedSequence array size"
                    assert all(c < NUM_CODE_POINTS for c in chars)
                    self.named_sequences.append(NamedSequence(name, chars))
                    # also store these in PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = set()
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if s and not s.startswith("#"):
                    char = int(s.split()[0], 16)
                    self.exclusions.add(char)

        widths = [None] * NUM_CODE_POINTS
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == "#":
                    continue
                s = s.split()[0].split(";")
                if ".." in s[0]:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                    chars = list(range(first, last + 1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        # append east-asian width (index 15) and an empty derived-property
        # set (index 16) to every assigned record
        for char in CODE_POINTS:
            if self.table[char] is not None:
                self.table[char].append(widths[char])
                self.table[char].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split("#", 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split("..")]
                    chars = list(range(first, last + 1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if self.table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        self.table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition("#")[0]
                s = [i.strip() for i in s.split(";")]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if ".." not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                for char in range(first, last + 1):
                    self.table[char][-1].add("Line_Break")

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * NUM_CODE_POINTS
        qc_order = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if "#" in s:
                    s = s[: s.index("#")]
                s = [i.strip() for i in s.split(";")]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                # two bits per normalization form: 01 = Maybe, 10 = No
                quickcheck = "MN".index(s[2]) + 1  # Maybe or No
                quickcheck_shift = qc_order.index(s[1]) * 2
                quickcheck <<= quickcheck_shift
                if ".." not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                for char in range(first, last + 1):
                    assert not (quickchecks[char] >> quickcheck_shift) & 3
                    quickchecks[char] |= quickcheck

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == OLD_VERSION:
                data = zip.open(f"Unihan-{OLD_VERSION}.txt").read()
            else:
                data = zip.open("Unihan_NumericValues.txt").read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith("U+"):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ("kAccountingNumeric", "kPrimaryNumeric", "kOtherNumeric"):
                continue
            value = value.strip().replace(",", "")
            i = int(code[2:], 16)
            # Patch the numeric field
            if self.table[i] is not None:
                self.table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split("#", 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != OLD_VERSION:
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split("#", 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    # keep statuses C (common) and F (full)
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

        # append the quickcheck bits (index 17) to every assigned record
        for char in CODE_POINTS:
            if self.table[char] is not None:
                self.table[char].append(quickchecks[char])

    def merge(self, old):  # noqa: C901
        """Record per-code-point differences between self and the older DB.

        Populates self.change_records (one tuple of deltas per code point)
        and self.normalization_changes ((char, old-normalization) pairs).
        """
        # Changes to exclusion file not implemented yet
        if old.exclusions != self.exclusions:
            raise NotImplementedError("exclusions differ")

        # In these change records, 0xFF means "no change"
        bidir_changes = [0xFF] * NUM_CODE_POINTS
        category_changes = [0xFF] * NUM_CODE_POINTS
        decimal_changes = [0xFF] * NUM_CODE_POINTS
        east_asian_width_changes = [0xFF] * NUM_CODE_POINTS
        mirrored_changes = [0xFF] * NUM_CODE_POINTS
        # In numeric data, 0 means "no change",
        # -1 means "did not have a numeric value"
        numeric_changes = [0.0] * NUM_CODE_POINTS
        # normalization_changes is a list of key-value pairs
        normalization_changes = []
        for char in CODE_POINTS:
            if self.table[char] is None:
                # Characters unassigned in the new version ought to
                # be unassigned in the old one
                assert old.table[char] is None
                continue
            # check characters unassigned in the old version
            if old.table[char] is None:
                # category 0 is "unassigned"
                category_changes[char] = 0
                continue
            # check characters that differ
            if old.table[char] != self.table[char]:
                for k in range(len(old.table[char])):
                    if old.table[char][k] != self.table[char][k]:
                        value = old.table[char][k]
                        if k == 1 and char in PUA_15:
                            # the name is not set in the old.table, but in the
                            # self.table we are using it for aliases and named seq
                            assert value == ""
                        elif k == 2:
                            category_changes[char] = CATEGORY_NAMES.index(value)
                        elif k == 4:
                            bidir_changes[char] = BIDIRECTIONAL_NAMES.index(value)
                        elif k == 5:
                            # We assume all normalization changes are in 1:1 mappings
                            assert " " not in value
                            normalization_changes.append((char, value))
                        elif k == 6:
                            # only support changes where the old value is a single digit
                            assert value in "0123456789"
                            decimal_changes[char] = int(value)
                        elif k == 8:
                            # Since 0 encodes "no change", the old value is better not 0
                            if not value:
                                numeric_changes[char] = -1.0
                            else:
                                numeric_changes[char] = float(value)
                                assert numeric_changes[char] not in (0, -1)
                        elif k == 9:
                            if value == "Y":
                                mirrored_changes[char] = 1
                            else:
                                mirrored_changes[char] = 0
                        elif k == 11:
                            # change to ISO comment, ignore
                            pass
                        elif k == 12:
                            # change to simple uppercase mapping; ignore
                            pass
                        elif k == 13:
                            # change to simple lowercase mapping; ignore
                            pass
                        elif k == 14:
                            # change to simple titlecase mapping; ignore
                            pass
                        elif k == 15:
                            # change to east asian width
                            east_asian_width_changes[char] = EASTASIANWIDTH_NAMES.index(
                                value
                            )
                        elif k == 16:
                            # derived property changes; not yet
                            pass
                        elif k == 17:
                            # normalization quickchecks are not performed
                            # for older versions
                            pass
                        else:

                            class Difference(Exception):
                                pass

                            raise Difference(
                                hex(char), k, old.table[char], self.table[char]
                            )

        # NOTE(review): zip() is a one-shot iterator in Python 3, so
        # change_records can only be traversed once — confirm consumers
        # iterate it a single time.
        self.change_records = zip(
            bidir_changes,
            category_changes,
            decimal_changes,
            east_asian_width_changes,
            mirrored_changes,
            numeric_changes,
        )
        self.normalization_changes = normalization_changes
534 self.normalization_changes = normalization_changes
535
536
537########################################################################################
538# Helper functions & classes
539
540
def getsize(data):
    """Return the smallest possible integer size (in bytes) for the array."""
    maxdata = max(data)
    if maxdata < (1 << 8):
        return 1
    if maxdata < (1 << 16):
        return 2
    return 4


def splitbins(t, trace=False):  # noqa: C901
    """t, trace=False -> (t1, t2, shift). Split a table to save space.

    t is a sequence of ints. This function can be useful to save space if
    many of the ints are the same. t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr. The higher the value, the more info
    you'll get.
    """

    if trace:

        def dump(t1, t2, shift, nbytes):
            # `nbytes` renamed from `bytes` to avoid shadowing the builtin
            sys.stderr.write(
                f"{len(t1)}+{len(t2)} bins at shift {shift}; {nbytes} bytes\n"
            )

        sys.stderr.write(f"Size of original table: {len(t) * getsize(t)} bytes\n")

    n = len(t) - 1  # last valid index
    maxshift = 0  # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    best_bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)  # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2 ** shift
        bincache = {}
        for i in range(0, len(t), size):
            # `chunk` renamed from `bin` to avoid shadowing the builtin
            chunk = t[i : i + size]
            index = bincache.get(chunk)
            if index is None:
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if trace:
            dump(t1, t2, shift, b)
        if b < best_bytes:
            best = t1, t2, shift
            best_bytes = b
    t1, t2, shift = best
    if trace:
        # was a pointless f-string with no placeholders
        sys.stderr.write("Best: ")
        dump(t1, t2, shift, best_bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
614
615
@dataclass
class WordData:
    """A word from the name corpus, with its discovery index and frequency."""

    word: str
    index: int
    frequency: int
621
622
class Words:
    """Tokenizes code point names and tracks per-word frequency and the
    index at which each word was first seen."""

    def __init__(self, names, trace=False):
        # word -> index of first occurrence (discovery order)
        self.indices = {}
        # word -> number of occurrences across all names
        self.frequencies = {}

        count = 0
        num_chars = 0
        num_words = 0
        for char in CODE_POINTS:
            name = names[char]
            if name is None:
                continue

            tokens = name.split()
            num_chars += len(name)
            num_words += len(tokens)
            for token in tokens:
                if token not in self.indices:
                    self.indices[token] = count
                    self.frequencies[token] = 1
                    count += 1
                else:
                    self.frequencies[token] += 1

        if trace:
            print(f"{num_words} words in text; {num_chars} code points")

    def tolist(self):
        """Return WordData entries sorted by descending frequency, then word."""
        entries = [
            WordData(word, index, self.frequencies[word])
            for word, index in self.indices.items()
        ]
        entries.sort(key=lambda entry: (-entry.frequency, entry.word))
        return entries
655 )
656
657
658########################################################################################
659# Classes for database tables
660
661
@dataclass(frozen=True)
class ChangeRecord:
    """Per-code-point deltas between Unicode versions, printed as a C++
    aggregate initializer."""

    bidirectional: int
    category: int
    decimal: int
    east_asian_width: int
    mirrored: int
    numeric: float

    def __str__(self):
        hexed = ", ".join(
            f"{value:#04x}"
            for value in (
                self.bidirectional,
                self.category,
                self.decimal,
                self.east_asian_width,
                self.mirrored,
            )
        )
        return f"{{{hexed}, {self.numeric}}}"
675
676
@dataclass(frozen=True)
class DatabaseRecord:
    """Per-code-point database properties, printed as a C++ aggregate
    initializer. Frozen so records can be dict keys for deduplication."""

    bidirectional: int
    category: int
    combining: int
    east_asian_width: int
    mirrored: bool
    quick_check: int

    def __str__(self):
        flag = "true" if self.mirrored else "false"
        hexed = ", ".join(
            f"{value:#04x}"
            for value in (
                self.bidirectional,
                self.category,
                self.combining,
                self.east_asian_width,
            )
        )
        return f"{{{hexed}, {flag}, {self.quick_check:#04x}}}"
691
692
@dataclass(frozen=True)
class NamedSequence:
    """A named Unicode sequence, printed as a C++ UnicodeNamedSequence
    initializer.

    `seq` holds at most 4 code points (asserted in UnicodeData.__init__ to
    match the array size declared in write_header).
    """

    # Annotations were previously both `int`; construction sites pass a
    # string name and a tuple of code points.
    name: str
    seq: tuple

    def __str__(self):
        seq_str = ", ".join(f"{elt:#04x}" for elt in self.seq)
        return f"{{{len(self.seq)}, {{{seq_str}}}}}"
701
702
@dataclass(frozen=True)
class Reindex:
    """A run of code points (start, count) with its table index, printed as
    a C++ aggregate initializer."""

    start: int
    count: int
    index: int

    def __str__(self):
        body = f"{self.start:#010x}, {self.count}, {self.index}"
        return "{" + body + "}"
711
712
@dataclass(frozen=True)
class TypeRecord:
    """Character-type properties, printed as a C++ UnicodeTypeRecord
    initializer."""

    decimal: int
    digit: int
    flags: int
    lower: int
    title: int
    upper: int

    def __str__(self):
        cases = f"{self.lower:#x}, {self.title:#x}, {self.upper:#x}"
        return f"{{{self.decimal}, {self.digit}, {self.flags:#06x}, {cases}}}"
725
726
class CodePointArray:
    """Arrays of code points, printed as hex."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=False):
        """Write the array as a static C++ int32_t definition."""
        if trace:
            sys.stderr.write(f"{self.name}: {4 * len(self.data)} bytes\n")

        file.write(f"\nstatic const int32_t {self.name}[] = {{")
        per_line = (LINE_WIDTH - 3) // 12
        for start in range(0, len(self.data), per_line):
            chunk = self.data[start : start + per_line]
            row = ", ".join(f"{value:#010x}" for value in chunk)
            file.write(f"\n  {row},")
        file.write("\n};\n")
746
747
class SmallStrArray:
    """An array of RawSmallStr literals for the generated C++ source."""

    def __init__(self, name, data, comment=None):
        self.name = name
        self.data = data
        self.comment = comment

    def dump(self, file, trace=False):
        """Write the array definition, preceded by its comment if any."""
        if self.comment is not None:
            file.write(f"\n// {self.comment}")
        file.write(f"\nconst RawSmallStr {self.name}[] = {{")

        if self.data:
            rows = "".join(
                f'  RawSmallStr::fromCStr("{item}"),\n' for item in self.data
            )
            file.write("\n" + rows)

        file.write("};\n")
765
766
class StructArray:
    """Store an array for writing to a generated C/C++ file.

    Items are formatted with str(), so any object with a suitable __str__
    (e.g. the record dataclasses in this file) can be used.
    """

    def __init__(self, ctype, name, data, comment=None):
        self.type = ctype
        self.name = name
        self.data = data
        self.comment = comment

    def dump(self, file, trace=False):
        """Write the array definition, preceded by its comment if any."""
        if self.comment is not None:
            file.write(f"\n// {self.comment}")
        file.write(f"\nstatic const {self.type} {self.name}[] = {{")

        if self.data:
            body = "".join(f"  {item},\n" for item in self.data)
            file.write("\n" + body)

        file.write("};\n")
787
788
class UIntArray:
    """Unsigned integer arrays, printed as hex."""

    def __init__(self, name, data):
        self.name = name
        self.data = data
        # element width in bytes (1, 2, or 4), derived from the largest value
        self.size = getsize(data)

    def dump(self, file, trace=False):
        """Write the array as a static C++ uint{8,16,32}_t definition."""
        if trace:
            sys.stderr.write(f"{self.name}: {self.size * len(self.data)} bytes\n")

        file.write(f"\nstatic const uint{8 * self.size}_t {self.name}[] = {{")
        spec = {1: "#04x", 2: "#06x"}.get(self.size, "#010x")
        per_line = (LINE_WIDTH - 3) // (2 * self.size + 4)
        for start in range(0, len(self.data), per_line):
            chunk = self.data[start : start + per_line]
            row = ", ".join(format(value, spec) for value in chunk)
            file.write(f"\n  {row},")
        file.write("\n};\n")
814
815
816########################################################################################
817# Reimplementation of dictionaries using static structures and a custom string hash
818
819
# Number chosen to minimize the number of collisions. CPython uses 47.
MAGIC = 43


def myhash(s):
    """Case-insensitive multiplicative hash of s, folded to 24 bits."""
    h = 0
    for c in s.upper():
        h = h * MAGIC + ord(c)
        # fold any overflow above 24 bits back into the low bits
        if h > 0x00FFFFFF:
            h = (h & 0x00FFFFFF) ^ ((h & 0xFF000000) >> 24)
    return h
831
832
# (table size, polynomial) pairs: Hash.__init__ picks the first power-of-two
# size strictly larger than the data and uses (poly + size) to perturb the
# probe increment on overflow.
SIZES = [
    (4, 3),
    (8, 3),
    (16, 3),
    (32, 5),
    (64, 3),
    (128, 3),
    (256, 29),
    (512, 17),
    (1024, 9),
    (2048, 5),
    (4096, 83),
    (8192, 27),
    (16384, 43),
    (32768, 3),
    (65536, 45),
    (131072, 9),
    (262144, 39),
    (524288, 39),
    (1048576, 9),
    (2097152, 5),
    (4194304, 3),
    (8388608, 33),
    (16777216, 27),
]
858
859
class Hash:
    def __init__(self, data):
        """Turn a (key, value) list into a static hash table structure."""

        # determine table size: first power of two strictly larger than the
        # data, with its companion polynomial from SIZES
        for size, poly in SIZES:
            if size > len(data):
                self.poly = poly + size
                self.size = size
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(self.size, "slots in hash table")
        table = [None] * self.size
        mask = self.size - 1

        # initialize hash table
        collisions = 0
        for key, value in data:
            h = myhash(key)
            i = (~h) & mask
            if table[i] is None:
                table[i] = value
                continue
            # slot taken: probe with an increment derived from the hash,
            # perturbed by the polynomial whenever it overflows the mask
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while True:
                collisions += 1
                i = (i + incr) & mask
                if table[i] is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ self.poly
        print(collisions, "collisions")

        # empty slots store 0
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = CodePointArray("kCodeHash", table)

    def dump(self, file, trace):
        """Write the hash table array plus its magic/mask/poly constants."""
        self.data.dump(file, trace)
        file.write(
            f"""
static const int32_t kCodeMagic = {MAGIC};
static const int32_t kCodeMask = {self.size - 1};
static const int32_t kCodePoly = {self.poly};
"""
        )
913 )
914
915
916########################################################################################
917# Unicode database writers
918
919
def write_header(unicode, header):
    """Writes type declarations to the database header file."""

    # The "@" is written separately so this script's own source never
    # contains the literal generated-file marker; the two writes concatenate
    # to "// @generated by <script>".
    header.write("// @")
    header.write(
        f"""\
generated by {os.path.basename(SCRIPT)}
#pragma once

#include <cstdint>

#include "globals.h"
#include "objects.h"
#include "unicode.h"

namespace py {{

static const int kMaxNameLength = 256;

// Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA
static const int kMaxDecomposition = {MAX_DECOMPOSITION};

static_assert(Unicode::kAliasStart == {NAME_ALIASES_START:#x},
              "Unicode aliases start at unexpected code point");
static_assert(Unicode::kAliasCount == {len(unicode.aliases)},
              "Unexpected number of Unicode aliases");
static_assert(Unicode::kNamedSequenceStart == {NAMED_SEQUENCES_START:#x},
              "Unicode named sequences start at unexpected code point");
static_assert(Unicode::kNamedSequenceCount == {len(unicode.named_sequences)},
              "Unexpected number of Unicode named sequences");

enum NormalizationForm : byte {{
  kInvalid = 0,
  kNFD = 0x3,
  kNFKD = 0xc,
  kNFC = 0x30,
  kNFKC = 0xc0,
}};

enum : int32_t {{
  kAlphaMask = {ALPHA_MASK:#x},
  kDecimalMask = {DECIMAL_MASK:#x},
  kDigitMask = {DIGIT_MASK:#x},
  kLowerMask = {LOWER_MASK:#x},
  kLinebreakMask = {LINEBREAK_MASK:#x},
  kSpaceMask = {SPACE_MASK:#x},
  kTitleMask = {TITLE_MASK:#x},
  kUpperMask = {UPPER_MASK:#x},
  kXidStartMask = {XID_START_MASK:#x},
  kXidContinueMask = {XID_CONTINUE_MASK:#x},
  kPrintableMask = {PRINTABLE_MASK:#x},
  kNumericMask = {NUMERIC_MASK:#x},
  kCaseIgnorableMask = {CASE_IGNORABLE_MASK:#x},
  kCasedMask = {CASED_MASK:#x},
  kExtendedCaseMask = {EXTENDED_CASE_MASK:#x},
}};

struct UnicodeChangeRecord {{
  const byte bidirectional;
  const byte category;
  const byte decimal;
  const byte east_asian_width;
  const byte mirrored;
  const double numeric;
}};

struct UnicodeDatabaseRecord {{
  const byte bidirectional;
  const byte category;
  const byte combining;  // canonical combining class
  const byte east_asian_width;
  const bool mirrored;
  const byte quick_check;
}};

struct UnicodeDecomposition {{
  const char* prefix;
  const int count;
  const int32_t* code_points;
}};

struct UnicodeNamedSequence {{
  const byte length;
  const int32_t code_points[4];
}};

struct UnicodeTypeRecord {{
  // Note: if more flag space is needed, decimal and digit could be unified
  const int8_t decimal;
  const int8_t digit;
  const int16_t flags;
  // Deltas to the character or offsets in kExtendedCase
  const int32_t lower;
  const int32_t title;
  const int32_t upper;
}};

extern const RawSmallStr kBidirectionalNames[];
extern const RawSmallStr kCategoryNames[];
extern const RawSmallStr kEastAsianWidthNames[];

// Get a code point from its Unicode name.
// Returns the code point if the lookup succeeds, -1 if it fails.
int32_t codePointFromName(const byte* name, word size);
int32_t codePointFromNameOrNamedSequence(const byte* name, word size);

// Returns the NFC composition given the NFC first and last indices.
int32_t composeCodePoint(int32_t first, int32_t last);

// Returns the decomposition mapping of the code point.
UnicodeDecomposition decomposeCodePoint(int32_t code_point);

// Returns the case mapping for code points where offset is insufficient
int32_t extendedCaseMapping(int32_t index);

// Finds the first/last character of an NFC sequence containing the code point.
int32_t findNFCFirst(int32_t code_point);
int32_t findNFCLast(int32_t code_point);

// Write the Unicode name for the given code point into the buffer.
// Returns true if the name was written successfully, false otherwise.
bool nameFromCodePoint(int32_t code_point, byte* buffer, word size);

// Returns the normalization of the code point in Unicode {OLD_VERSION}, if it differs
// from the current version. If the normalization is unchanged, returns -1.
int32_t normalizeOld(int32_t code_point);

// Returns the numeric value of the code point, or -1.0 if not numeric.
double numericValue(int32_t code_point);

// Returns true if the code point has one of the line break properties "BK",
// "CR", "LR", or "NL" or the bidirectional type "B". Returns false otherwise.
bool unicodeIsLinebreak(int32_t code_point);

// Returns true if the code point has the bidirectional type "WS", "B", or "S"
// or the category "Zs". Returns false otherwise.
bool unicodeIsWhitespace(int32_t code_point);

const UnicodeChangeRecord* changeRecord(int32_t code_point);
const UnicodeDatabaseRecord* databaseRecord(int32_t code_point);
const UnicodeNamedSequence* namedSequence(int32_t code_point);
const UnicodeTypeRecord* typeRecord(int32_t code_point);

}}  // namespace py
"""
    )
1066
1067
def write_db_prelude(db):
    """Writes required include directives to the database file."""

    # "@" written separately so this script never contains the literal
    # "@generated" marker itself.
    db.write("// @")
    db.write(
        f"""\
generated by {os.path.basename(SCRIPT)}
#include "{HEADER}"

#include <cinttypes>
#include <cstdio>
#include <cstring>

#include "asserts.h"
#include "globals.h"
#include "objects.h"
#include "unicode.h"

namespace py {{
"""
    )
1089
1090
def write_database_records(unicode, db, trace):  # noqa: C901
    """Writes per-code-point database records to the database file.

    Emits three groups of tables:
    - kDatabaseRecords plus the two-level kDatabaseIndex1/kDatabaseIndex2
      lookup (split via splitbins) mapping code point -> unique record.
    - decomposition tables: kDecompPrefix, kDecompData, kDecompIndex1/2.
    - canonical (NFC) composition data: the kNFCFirst/kNFCLast reindex
      ranges and the kCompIndex/kCompData pair table.
    """
    # Slot 0 is a shared sentinel record for unassigned code points.
    dummy = DatabaseRecord(0, 0, 0, 0, False, 0)
    table = [dummy]
    cache = {dummy: 0}  # record -> index into table, for deduplication
    index = [0] * NUM_CODE_POINTS

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * NUM_CODE_POINTS
    decomp_size = 0

    comp_pairs = []
    # Non-None marks code points that can appear as the first (resp. last)
    # element of a canonical composition pair; later renumbered densely.
    comp_first = [None] * NUM_CODE_POINTS
    comp_last = [None] * NUM_CODE_POINTS

    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            # database properties
            # UnicodeData fields used here: [2]=general category,
            # [3]=canonical combining class, [4]=bidi class, [9]=mirrored
            # flag, [15]=east asian width; record[17] is appended by the
            # loader (not visible here) -- presumably normalization
            # quick-check flags, confirm against UnicodeData's definition.
            item = DatabaseRecord(
                BIDIRECTIONAL_NAMES.index(record[4]),
                CATEGORY_NAMES.index(record[2]),
                int(record[3]),
                EASTASIANWIDTH_NAMES.index(record[15]),
                record[9] == "Y",
                record[17],
            )

            idx = cache.get(item)
            if idx is None:
                cache[item] = idx = len(table)
                table.append(item)
            index[char] = idx

            # decomposition data
            decomp_idx = 0
            if record[5]:
                decomp = record[5].split()
                # A leading "<...>" token marks a compatibility (non-canonical)
                # decomposition.
                prefix = decomp.pop(0) if decomp[0][0] == "<" else ""
                if len(decomp) > MAX_DECOMPOSITION:
                    raise Exception(
                        f"decomposition of code point {char:#x} is too large"
                    )

                try:
                    idx = decomp_prefix.index(prefix)
                except ValueError:
                    idx = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                # The prefix index must fit in the low byte of the header word.
                assert idx < 256

                # Header word: decomposition length in the high byte, prefix
                # index in the low byte; followed by the decomposed points.
                decomp = [idx + (len(decomp) << 8)] + [int(s, 16) for s in decomp]
                # Canonical (prefix-less) pairs that are not excluded and
                # whose first element has combining class "0" participate in
                # NFC composition.
                if (
                    not idx
                    and len(decomp) == 3
                    and char not in unicode.exclusions
                    and unicode.table[decomp[1]][3] == "0"
                ):
                    _, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l, r, char))
                # NOTE(review): decomp_data holds flat ints, so .index(decomp)
                # with a list argument never matches and every decomposition
                # is appended; the dedup attempt looks ineffective (same
                # shape as CPython's makeunicodedata) -- confirm before
                # changing.
                try:
                    decomp_idx = decomp_data.index(decomp)
                except ValueError:
                    decomp_idx = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            decomp_index[char] = decomp_idx

    # Renumber composition-first/-last characters densely and collect the
    # contiguous code point ranges they cover.
    first = last = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_first = prev_last = None
    for ch in CODE_POINTS:
        if comp_first[ch] is not None:
            comp_first[ch] = first
            first += 1
            if prev_first is None:
                prev_first = (ch, ch)
            elif prev_first[1] + 1 == ch:
                prev_first = prev_first[0], ch
            else:
                comp_first_ranges.append(prev_first)
                prev_first = (ch, ch)
        if comp_last[ch] is not None:
            comp_last[ch] = last
            last += 1
            if prev_last is None:
                prev_last = (ch, ch)
            elif prev_last[1] + 1 == ch:
                prev_last = prev_last[0], ch
            else:
                comp_last_ranges.append(prev_last)
                prev_last = (ch, ch)
    comp_first_ranges.append(prev_first)
    comp_last_ranges.append(prev_last)
    total_first = first
    total_last = last

    # Dense 2D table: (first index, last index) -> composed code point.
    comp_data = [0] * (total_first * total_last)
    for first, last, char in comp_pairs:
        first = comp_first[first]
        last = comp_last[last]
        comp_data[first * total_last + last] = char

    if trace:
        print(len(table), "unique properties")
        print(len(decomp_prefix), "unique decomposition prefixes")
        print(len(decomp_data), "unique decomposition entries:", end=" ")
        print(decomp_size, "bytes")
        print(total_first, "first characters in NFC")
        print(total_last, "last characters in NFC")
        print(len(comp_pairs), "NFC pairs")

    StructArray(
        "UnicodeDatabaseRecord",
        "kDatabaseRecords",
        table,
        "a list of unique database records",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kDatabaseIndexShift = {shift};
static const int32_t kDatabaseIndexMask =
    (int32_t{{1}} << kDatabaseIndexShift) - 1;
"""
    )
    UIntArray("kDatabaseIndex1", index1).dump(db, trace)
    UIntArray("kDatabaseIndex2", index2).dump(db, trace)

    db.write(
        f"""
// Reindexing of NFC first and last characters
struct Reindex {{
  const int32_t start;
  const short count;
  const short index;
}};

static const int kTotalLast = {total_last};
"""
    )

    # Each range becomes one Reindex entry; a zeroed entry terminates the
    # array (findNFC in the generated C++ stops at start == 0).
    nfc_first = [
        Reindex(start, end - start, comp_first[start])
        for start, end in comp_first_ranges
    ] + [Reindex(0, 0, 0)]
    nfc_last = [
        Reindex(start, end - start, comp_last[start]) for start, end in comp_last_ranges
    ] + [Reindex(0, 0, 0)]
    StructArray("Reindex", "kNFCFirst", nfc_first).dump(db, trace)
    StructArray("Reindex", "kNFCLast", nfc_last).dump(db, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    db.write(
        f"""
// decomposition mappings
static const int kDecompShift = {shift};
static const int32_t kDecompMask = (int32_t{{1}} << kDecompShift) - 1;

const char* kDecompPrefix[] = {{
"""
    )
    for name in decomp_prefix:
        db.write(f'    "{name}",\n')
    db.write("};\n")

    CodePointArray("kDecompData", decomp_data).dump(db, trace)
    UIntArray("kDecompIndex1", index1).dump(db, trace)
    UIntArray("kDecompIndex2", index2).dump(db, trace)

    index1, index2, shift = splitbins(comp_data, trace)
    db.write(
        f"""
// NFC pairs
static const int kCompShift = {shift};
static const int32_t kCompMask = (int32_t{{1}} << kCompShift) - 1;
"""
    )
    UIntArray("kCompIndex", index1).dump(db, trace)
    UIntArray("kCompData", index2).dump(db, trace)
1279
1280
def write_name_data(unicode, db, trace):  # noqa: C901
    """Writes character name tables to the database file.

    Emits the lexicon/phrasebook tables used to build a name from a code
    point, the hash table for the reverse (name -> code point) lookup, and
    the name alias / named sequence tables.
    """
    # Collect names
    names = [None] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            # Bracketed entries (e.g. "<control>") are placeholders, not
            # real character names; stored names get a trailing NUL.
            if name and name[0] != "<":
                names[char] = f"{name}\0"

    if trace:
        print(len([n for n in names if n is not None]), "distinct names")

    # Collect unique words from names. Note that we differ between
    # words ending a sentence, which include the trailing null byte,
    # and words inside a sentence, which do not.
    words = Words(names, trace)
    wordlist = words.tolist()

    # Figure out how many phrasebook escapes we need
    short = (65536 - len(wordlist)) // 256
    assert short > 0

    if trace:
        print(short, "short indexes in lexicon")
        n = sum(wordlist[i].frequency for i in range(short))
        print(n, "short indexes in phrasebook")

    # Pick the most commonly used words, and sort the rest by falling
    # length to maximize overlap.
    tail = wordlist[short:]
    tail.sort(key=lambda data: len(data.word), reverse=True)
    wordlist[short:] = tail

    # Build a lexicon string from words
    current_offset = 0
    lexicon_offset = [0]
    lexicon_string = ""
    word_offsets = {}
    for data in wordlist:
        # Encoding: bit 7 indicates last character in word
        # chr(128) indicates the last character in an entire string)
        last = ord(data.word[-1])
        encoded = f"{data.word[:-1]}{chr(last + 128)}"

        # reuse string tails, when possible
        offset = lexicon_string.find(encoded)
        if offset < 0:
            offset = current_offset
            lexicon_string = lexicon_string + encoded
            current_offset += len(encoded)

        word_offsets[data.word] = len(lexicon_offset)
        lexicon_offset.append(offset)

    lexicon = [ord(ch) for ch in lexicon_string]

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        name = names[char]
        if name is not None:
            # NOTE(review): rebinds `words`, shadowing the Words instance
            # created above (which is no longer needed at this point).
            words = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in words:
                offset = word_offsets[word]
                if offset < short:
                    phrasebook.append(offset)
                else:
                    # store as two bytes
                    phrasebook.append((offset >> 8) + short)
                    phrasebook.append(offset & 255)

    # Every phrasebook entry must fit in a single byte.
    assert getsize(phrasebook) == 1

    db.write(f"\n// lexicon")
    UIntArray("kLexicon", lexicon).dump(db, trace)
    UIntArray("kLexiconOffset", lexicon_offset).dump(db, trace)

    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    db.write(
        f"""
// code => name phrasebook
static const int kPhrasebookShift = {shift};
static const int kPhrasebookShort = {short};
static const int kPhrasebookMask = (1 << kPhrasebookShift) - 1;
"""
    )
    UIntArray("kPhrasebook", phrasebook).dump(db, trace)
    UIntArray("kPhrasebookOffset1", offset1).dump(db, trace)
    UIntArray("kPhrasebookOffset2", offset2).dump(db, trace)

    # Extract names for name hash table
    hash_data = []
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                hash_data.append((name, char))

    db.write("\n// name => code dictionary")
    Hash(hash_data).dump(db, trace)

    aliases = [codepoint for _, codepoint in unicode.aliases]
    UIntArray("kNameAliases", aliases).dump(db, trace)

    StructArray(
        "UnicodeNamedSequence", "kNamedSequences", unicode.named_sequences
    ).dump(db, trace)
1394
1395
def write_type_data(unicode, db, trace):  # noqa: C901
    """Writes Unicode character type tables to the database file.

    Emits:
    - kTypeRecords plus the split kTypeIndex1/kTypeIndex2 lookup tables
      mapping a code point to its unique UnicodeTypeRecord.
    - kExtendedCase: overflow storage for one-to-many case mappings.
    - Generated C++ switch functions numericValue(), unicodeIsLinebreak(),
      and unicodeIsWhitespace().
    """

    # extract unicode types
    dummy = TypeRecord(0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Deduplication cache keyed by record (record -> slot index), matching
    # write_database_records()/write_change_data(). It was previously seeded
    # backwards as {0: dummy}, so the sentinel never matched cache.get(item)
    # and an identical record was appended again on first use.
    cache = {dummy: 0}
    index = [0] * NUM_CODE_POINTS
    numeric = {}
    spaces = []
    linebreaks = []
    extended_cases = []

    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # TODO(T55176519): delta = True
            if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if "Line_Break" in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # Space itself is printable; other C* (control/format) and
            # Z* (separator) categories are not.
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            # record[12]/[13]/[14] are the simple uppercase/lowercase/
            # titlecase mappings; a missing titlecase falls back to upper.
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            # A case folding that differs from the simple lowercase mapping
            # forces the extended-case representation below.
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    # Simple mappings are stored as deltas from the code
                    # point and must fit in a signed 32-bit field.
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (
                        abs(upper) <= 2147483647
                        and abs(lower) <= 2147483647
                        and abs(title) <= 2147483647
                    )
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extended_cases) | (len(sc[0]) << 24)
                extended_cases.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extended_cases.extend(cf)
                upper = len(extended_cases) | (len(sc[2]) << 24)
                extended_cases.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extended_cases) | (len(sc[1]) << 24)
                    extended_cases.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = TypeRecord(decimal, digit, flags, lower, title, upper)
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # Report statistics only when tracing, consistent with the other writers.
    if trace:
        print(len(table), "unique character type entries")
        print(sum(map(len, numeric.values())), "numeric code points")
        print(len(spaces), "whitespace code points")
        print(len(linebreaks), "linebreak code points")
        print(len(extended_cases), "extended case array")

    StructArray(
        "UnicodeTypeRecord",
        "kTypeRecords",
        table,
        "a list of unique character type descriptors",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kTypeIndexShift = {shift};
static const int32_t kTypeIndexMask = (int32_t{{1}} << kTypeIndexShift) - 1;
"""
    )
    UIntArray("kTypeIndex1", index1).dump(db, trace)
    UIntArray("kTypeIndex2", index2).dump(db, trace)

    # extended case mappings
    CodePointArray("kExtendedCase", extended_cases).dump(db, trace)

    db.write(
        """
double numericValue(int32_t code_point) {
  switch (code_point) {"""
    )
    for value, codepoints in sorted(numeric.items()):
        # Numeric values may be fractions (e.g. "1/4"); emit them as C++
        # floating-point division expressions.
        parts = value.split("/")
        value = " / ".join(repr(float(part)) for part in parts)

        codepoints.sort()
        for codepoint in codepoints:
            db.write(f"\n    case {codepoint:#08x}:")
        db.write(f"\n      return {value};")
    db.write(
        """
    default:
      return -1.0;
  }
}

bool unicodeIsLinebreak(int32_t code_point) {
  switch (code_point) {"""
    )
    for codepoint in sorted(linebreaks):
        db.write(f"\n    case {codepoint:#08x}:")
    db.write(
        """
      return true;
    default:
      return false;
  }
}

bool unicodeIsWhitespace(int32_t code_point) {
  switch (code_point) {"""
    )
    for codepoint in sorted(spaces):
        db.write(f"\n    case {codepoint:#08x}:")
    db.write(
        """
      return true;
    default:
      return false;
  }
}
"""
    )
1585
1586
def write_change_data(unicode, db, trace):
    """Writes the Unicode 3.2 delta tables and normalizeOld() to the db file."""
    # Slot 0 holds a sentinel record shared by unchanged code points.
    sentinel = ChangeRecord(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0.0)
    unique_records = [sentinel]
    positions = {sentinel: 0}
    index = [0] * NUM_CODE_POINTS

    # Deduplicate the per-code-point 3.2 deltas into unique_records and
    # build the code point -> record slot index.
    for char, raw in enumerate(unicode.change_records):
        delta = ChangeRecord(*raw)
        slot = positions.get(delta)
        if slot is None:
            slot = positions[delta] = len(unique_records)
            unique_records.append(delta)
        index[char] = slot

    StructArray(
        "UnicodeChangeRecord",
        "kChangeRecords",
        unique_records,
        f"deltas from Unicode {OLD_VERSION} to the current version",
    ).dump(db, trace)

    # Two-level split of the index table, same scheme as the other tables.
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
static const int kChangeIndexShift = {shift};
static const int kChangeIndexMask = (1 << kChangeIndexShift) - 1;
"""
    )
    UIntArray("kChangeIndex1", index1).dump(db, trace)
    UIntArray("kChangeIndex2", index2).dump(db, trace)

    # Emit the normalization-change fixups as a C++ switch.
    db.write(
        """
int32_t normalizeOld(int32_t code_point) {
  switch (code_point) {"""
    )
    for k, v in unicode.normalization_changes:
        db.write(
            f"""
    case {k:#x}:
      return 0x{v.lower()};"""
        )
    db.write(
        """
    default:
      return -1;
  }
}
"""
    )
1638
1639
def write_string_literals(db, trace):
    """Writes the short-string name tables consumed by the runtime."""
    tables = (
        ("kBidirectionalNames", BIDIRECTIONAL_NAMES),
        ("kCategoryNames", CATEGORY_NAMES),
        ("kEastAsianWidthNames", EASTASIANWIDTH_NAMES),
    )
    for symbol, strings in tables:
        SmallStrArray(symbol, strings).dump(db, trace)
1643 SmallStrArray("kEastAsianWidthNames", EASTASIANWIDTH_NAMES).dump(db, trace)
1644
1645
def write_db_coda(db):
    """Writes the static C++ tail of the database file.

    The emitted code implements the runtime functions that consume the
    generated tables: NFC composition/decomposition, Hangul syllable and
    CJK ideograph name handling, name <-> code point conversion, and the
    change/database/type record accessors.
    """
    # First chunk: everything up to the body of isUnifiedIdeograph(), whose
    # range checks are generated from CJK_RANGES below.
    db.write(
        """
int32_t composeCodePoint(int32_t first, int32_t last) {
  int32_t i = first * kTotalLast + last;
  int32_t j = kCompIndex[i >> kCompShift];
  return kCompData[(j << kCompShift) + (i & kCompMask)];
}

UnicodeDecomposition decomposeCodePoint(int32_t code_point) {
  DCHECK_BOUND(code_point, kMaxUnicode);
  uint16_t offset = kDecompIndex1[code_point >> kDecompShift];
  offset = kDecompIndex2[(offset << kDecompShift) + (code_point & kDecompMask)];

  int32_t decomp = kDecompData[offset];
  int32_t count = decomp >> kBitsPerByte;
  int8_t prefix = decomp & kMaxByte;
  return {kDecompPrefix[prefix], count, &kDecompData[offset + 1]};
}

static int32_t findNFC(const Reindex* nfc, int32_t code_point) {
  for (word index = 0; nfc[index].start != 0; index++) {
    int32_t delta = code_point - nfc[index].start;
    if (delta < 0) {
      return -1;
    }
    if (delta <= nfc[index].count) {
      return delta + nfc[index].index;
    }
  }
  return -1;
}

int32_t findNFCFirst(int32_t code_point) {
  return findNFC(kNFCFirst, code_point);
}

int32_t findNFCLast(int32_t code_point) {
  return findNFC(kNFCLast, code_point);
}

struct HangulJamo {
  const int length;
  const byte name[3];
};

static const HangulJamo kHangulLeads[] = {
    {1, {'G'}}, {2, {'G', 'G'}}, {1, {'N'}}, {1, {'D'}},
    {2, {'D', 'D'}}, {1, {'R'}}, {1, {'M'}}, {1, {'B'}},
    {2, {'B', 'B'}}, {1, {'S'}}, {2, {'S', 'S'}}, {0},
    {1, {'J'}}, {2, {'J', 'J'}}, {1, {'C'}}, {1, {'K'}},
    {1, {'T'}}, {1, {'P'}}, {1, {'H'}},
};

static const HangulJamo kHangulVowels[] = {
    {1, {'A'}}, {2, {'A', 'E'}}, {2, {'Y', 'A'}},
    {3, {'Y', 'A', 'E'}}, {2, {'E', 'O'}}, {1, {'E'}},
    {3, {'Y', 'E', 'O'}}, {2, {'Y', 'E'}}, {1, {'O'}},
    {2, {'W', 'A'}}, {3, {'W', 'A', 'E'}}, {2, {'O', 'E'}},
    {2, {'Y', 'O'}}, {1, {'U'}}, {3, {'W', 'E', 'O'}},
    {2, {'W', 'E'}}, {2, {'W', 'I'}}, {2, {'Y', 'U'}},
    {2, {'E', 'U'}}, {2, {'Y', 'I'}}, {1, {'I'}},
};

static const HangulJamo kHangulTrails[] = {
    {0},
    {1, {'G'}},
    {2, {'G', 'G'}},
    {2, {'G', 'S'}},
    {1, {'N'}},
    {2, {'N', 'J'}},
    {2, {'N', 'H'}},
    {1, {'D'}},
    {1, {'L'}},
    {2, {'L', 'G'}},
    {2, {'L', 'M'}},
    {2, {'L', 'B'}},
    {2, {'L', 'S'}},
    {2, {'L', 'T'}},
    {2, {'L', 'P'}},
    {2, {'L', 'H'}},
    {1, {'M'}},
    {1, {'B'}},
    {2, {'B', 'S'}},
    {1, {'S'}},
    {2, {'S', 'S'}},
    {2, {'N', 'G'}},
    {1, {'J'}},
    {1, {'C'}},
    {1, {'K'}},
    {1, {'T'}},
    {1, {'P'}},
    {1, {'H'}},
};

static_assert(ARRAYSIZE(kHangulLeads) == Unicode::kHangulLeadCount,
              "mismatch in number of Hangul lead jamo");
static_assert(ARRAYSIZE(kHangulVowels) == Unicode::kHangulVowelCount,
              "mismatch in number of Hangul vowel jamo");
static_assert(ARRAYSIZE(kHangulTrails) == Unicode::kHangulTrailCount,
              "mismatch in number of Hangul trail jamo");

static int32_t findHangulJamo(const byte* str, word size,
                              const HangulJamo array[], word count) {
  word result = -1;
  int length = -1;
  for (word i = 0; i < count; i++) {
    HangulJamo jamo = array[i];
    if (length < jamo.length && jamo.length <= size &&
        (std::memcmp(str, jamo.name, jamo.length) == 0)) {
      length = jamo.length;
      result = i;
    }
  }
  return result;
}

static int32_t findHangulSyllable(const byte* name, word size) {
  word pos = 16;
  word lead = findHangulJamo(name + pos, size - pos, kHangulLeads,
                             Unicode::kHangulLeadCount);
  if (lead == -1) {
    return -1;
  }

  pos += kHangulLeads[lead].length;
  word vowel = findHangulJamo(name + pos, size - pos, kHangulVowels,
                              Unicode::kHangulVowelCount);
  if (vowel == -1) {
    return -1;
  }

  pos += kHangulVowels[vowel].length;
  word trail = findHangulJamo(name + pos, size - pos, kHangulTrails,
                              Unicode::kHangulTrailCount);
  if (trail == -1 || pos + kHangulTrails[trail].length != size) {
    return -1;
  }

  return Unicode::kHangulSyllableStart +
         (lead * Unicode::kHangulVowelCount + vowel) *
             Unicode::kHangulTrailCount +
         trail;
}

static bool isUnifiedIdeograph(int32_t code_point) {
  return """
    )

    # Emit one range check per CJK block, "||"-joined and aligned under the
    # "return" written above.
    for i, cjk in enumerate(CJK_RANGES):
        if i > 0:
            db.write(" ||\n         ")
        db.write(f"({cjk.start:#010x} <= code_point && code_point <= {cjk.end:#010x})")

    # Second chunk: name hashing/lookup and the record accessor functions.
    db.write(
        """;
}

static uint32_t myHash(const byte* name, word size) {
  uint32_t hash = 0;
  for (word i = 0; i < size; i++) {
    hash = (hash * kCodeMagic) + ASCII::toUpper(name[i]);
    if (hash > 0x00ffffff) {
      hash = (hash & 0x00ffffff) ^ ((hash & 0xff000000) >> 24);
    }
  }
  return hash;
}

static bool nameMatchesCodePoint(int32_t code_point, const byte* name,
                                 word size) {
  byte buffer[kMaxNameLength + 1];
  if (!nameFromCodePoint(code_point, buffer, kMaxNameLength)) {
    return false;
  }
  for (word i = 0; i < size; i++) {
    if (ASCII::toUpper(name[i]) != buffer[i]) {
      return false;
    }
  }
  return buffer[size] == '\\0';
}

static int32_t getCodePoint(const byte* name, word size,
                            int32_t (*check)(int32_t)) {
  if (size >= 16 && std::memcmp(name, "HANGUL SYLLABLE ", 16) == 0) {
    return findHangulSyllable(name, size);
  }

  if (size >= 22 && std::memcmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
    // Four or five uppercase hexdigits must follow
    name += 22;
    size -= 22;
    if (size != 4 && size != 5) {
      return -1;
    }

    int32_t result = 0;
    while (size--) {
      result *= 16;
      if ('0' <= *name && *name <= '9') {
        result += *name - '0';
      } else if ('A' <= *name && *name <= 'F') {
        result += *name - 'A' + 10;
      } else {
        return -1;
      }
      name++;
    }
    if (!isUnifiedIdeograph(result)) {
      return -1;
    }
    return result;
  }

  uint32_t hash = myHash(name, size);
  uint32_t step = (hash ^ (hash >> 3)) & kCodeMask;
  if (step == 0) {
    step = kCodeMask;
  }

  for (uint32_t idx = (~hash) & kCodeMask;; idx = (idx + step) & kCodeMask) {
    int32_t result = kCodeHash[idx];
    if (result == 0) {
      return -1;
    }
    if (nameMatchesCodePoint(result, name, size)) {
      return check(result);
    }
    step = step << 1;
    if (step > kCodeMask) {
      step = step ^ kCodePoly;
    }
  }
}

static int32_t checkAliasAndNamedSequence(int32_t code_point) {
  if (Unicode::isNamedSequence(code_point)) {
    return -1;
  }
  if (Unicode::isAlias(code_point)) {
    return kNameAliases[code_point - Unicode::kAliasStart];
  }
  return code_point;
}

int32_t codePointFromName(const byte* name, word size) {
  return getCodePoint(name, size, checkAliasAndNamedSequence);
}

static int32_t checkAlias(int32_t code_point) {
  if (Unicode::isAlias(code_point)) {
    return kNameAliases[code_point - Unicode::kAliasStart];
  }
  return code_point;
}

int32_t codePointFromNameOrNamedSequence(const byte* name, word size) {
  return getCodePoint(name, size, checkAlias);
}

int32_t extendedCaseMapping(int32_t index) {
  return kExtendedCase[index];
}

bool nameFromCodePoint(int32_t code_point, byte* buffer, word size) {
  DCHECK_BOUND(code_point, kMaxUnicode);

  if (Unicode::isHangulSyllable(code_point)) {
    if (size < 27) {
      return false;  // worst case: HANGUL SYLLABLE <10 chars>
    }

    int32_t offset = code_point - Unicode::kHangulSyllableStart;
    HangulJamo lead = kHangulLeads[offset / Unicode::kHangulCodaCount];
    HangulJamo vowel = kHangulVowels[(offset % Unicode::kHangulCodaCount) /
                                     Unicode::kHangulTrailCount];
    HangulJamo trail = kHangulTrails[offset % Unicode::kHangulTrailCount];

    std::memcpy(buffer, "HANGUL SYLLABLE ", 16);
    buffer += 16;
    std::memcpy(buffer, lead.name, lead.length);
    buffer += lead.length;
    std::memcpy(buffer, vowel.name, vowel.length);
    buffer += vowel.length;
    std::memcpy(buffer, trail.name, trail.length);
    buffer[trail.length] = '\\0';
    return true;
  }

  if (isUnifiedIdeograph(code_point)) {
    if (size < 28) {
      return false;  // worst case: CJK UNIFIED IDEOGRAPH-*****
    }
    std::snprintf(reinterpret_cast<char*>(buffer), size,
                  "CJK UNIFIED IDEOGRAPH-%" PRIX32, code_point);
    return true;
  }

  uint32_t offset = kPhrasebookOffset1[code_point >> kPhrasebookShift];
  offset = kPhrasebookOffset2[(offset << kPhrasebookShift) +
                              (code_point & kPhrasebookMask)];

  if (offset == 0) {
    return false;
  }

  for (word i = 0;;) {
    if (i > 0) {
      if (i >= size) {
        return false;  // buffer overflow
      }
      buffer[i++] = ' ';
    }

    // get word index
    word word_idx = kPhrasebook[offset] - kPhrasebookShort;
    if (word_idx >= 0) {
      word_idx = (word_idx << 8) + kPhrasebook[offset + 1];
      offset += 2;
    } else {
      word_idx = kPhrasebook[offset++];
    }

    // copy word string from lexicon
    const byte* word_ptr = kLexicon + kLexiconOffset[word_idx];
    while (*word_ptr < 128) {
      if (i >= size) {
        return false;  // buffer overflow
      }
      buffer[i++] = *word_ptr++;
    }

    if (i >= size) {
      return false;  // buffer overflow
    }
    buffer[i++] = *word_ptr & 0x7f;
    if (*word_ptr == 0x80) {
      return true;
    }
  }
}

const UnicodeChangeRecord* changeRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kChangeRecords[0];
  }

  int index = kChangeIndex1[code_point >> kChangeIndexShift];
  index <<= kChangeIndexShift;
  index = kChangeIndex2[index + (code_point & kChangeIndexMask)];
  return &kChangeRecords[index];
}

const UnicodeDatabaseRecord* databaseRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kDatabaseRecords[0];
  }

  int index = kDatabaseIndex1[code_point >> kDatabaseIndexShift];
  index <<= kDatabaseIndexShift;
  index = kDatabaseIndex2[index + (code_point & kDatabaseIndexMask)];
  return &kDatabaseRecords[index];
}

const UnicodeNamedSequence* namedSequence(int32_t code_point) {
  DCHECK(Unicode::isNamedSequence(code_point), "not a named sequence");
  return &kNamedSequences[code_point - Unicode::kNamedSequenceStart];
}

const UnicodeTypeRecord* typeRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kTypeRecords[0];
  }

  int index = kTypeIndex1[code_point >> kTypeIndexShift];
  index <<= kTypeIndexShift;
  index = kTypeIndex2[index + (code_point & kTypeIndexMask)];
  return &kTypeRecords[index];
}

}  // namespace py
"""
    )
2030
2031
def write_db(trace=False):
    """Regenerates the Unicode header and database source files.

    Reads the current and old UnicodeData versions, merges the old deltas
    into the current table, then writes the header and every database table.
    """
    print(f"--- Reading {UNICODE_DATA} v{UNIDATA_VERSION} ...")
    current = UnicodeData(UNIDATA_VERSION)

    print(sum(1 for entry in current.table if entry), "characters")

    print(f"--- Reading {UNICODE_DATA} v{OLD_VERSION} ...")
    old = UnicodeData(OLD_VERSION, check_cjk=False)
    print(sum(1 for entry in old.table if entry), "characters")
    current.merge(old)

    with open(HEADER_PATH, "w") as header:
        write_header(current, header)

    with open(DB_PATH, "w") as db:
        write_db_prelude(db)
        write_database_records(current, db, trace)
        write_name_data(current, db, trace)
        write_type_data(current, db, trace)
        write_change_data(current, db, trace)
        write_string_literals(db, trace)
        write_db_coda(db)
2054
2055
if __name__ == "__main__":
    # Always trace when run as a script.
    write_db(trace=True)