vt: introduce gen_ucs_fallback_table.py to create ucs_fallback_table.h

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

The generated table maps complex characters to their simpler fallback
forms for a terminal display when corresponding glyphs are unavailable.
This includes diacritics, symbols as well as many drawing characters.
Fallback characters aren't perfect replacements, obviously. But they are
still far more useful than a bunch of squared question marks.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20250507141535.40655-5-nico@fluxnic.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

authored by

Nicolas Pitre and committed by

Greg Kroah-Hartman 10 months ago 5071ddc1 bb9a1516

+352

1 changed file

expand all

drivers

tty

gen_ucs_fallback_table.py

+352

drivers/tty/vt/gen_ucs_fallback_table.py

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unidecode module to generate ucs_fallback_table.h
#
# The generated table maps complex characters to their simpler fallback forms
# for a terminal display when corresponding glyphs are unavailable.
#
# Usage:
#   python3 gen_ucs_fallback_table.py         # Generate fallback tables
#   python3 gen_ucs_fallback_table.py -o FILE # Specify output file

import argparse
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path

try:
    from unidecode import unidecode
except ImportError:
    # Defer the failure to generate_fallback_map() so that --help and the
    # pure table helpers below stay usable without the third-party module.
    unidecode = None

# Try to get unidecode version
try:
    from importlib.metadata import version
    unidecode_version = version('unidecode')
except Exception:
    # Covers a missing importlib.metadata as well as a missing package;
    # the version is only informational in the generated header.
    unidecode_version = 'unknown'

# This script's file name
this_file = Path(__file__).name

# Default output file name
DEFAULT_OUT_FILE = "ucs_fallback_table.h"

# Define the range marker value
RANGE_MARKER = 0x00


def generate_fallback_map():
    """Generate a fallback map using unidecode for all relevant Unicode points.

    Returns a dict mapping each BMP code point in 0x0080-0xFFFF to the
    ordinal of its single-character fallback.  Code points for which
    unidecode yields zero or several characters are left out unless
    get_special_overrides() supplies a value; the overrides always win.

    Raises ImportError if the unidecode module is not installed.
    """
    if unidecode is None:
        raise ImportError("the 'unidecode' module is required to generate "
                          "the fallback table (pip install unidecode)")

    fallback_map = {}

    # Process BMP characters (0x0000 - 0xFFFF) to keep table size manageable
    for cp in range(0x0080, 0x10000):  # Skip ASCII range (0x00-0x7F)
        char = chr(cp)

        # Skip unassigned/control characters: with a default supplied,
        # unicodedata.name() returns '' for them instead of raising.
        if not unicodedata.name(char, ''):
            continue

        # Get the unidecode transliteration
        ascii_version = unidecode(char)

        # Only store if it results in a single character mapping
        if len(ascii_version) == 1:
            fallback_map[cp] = ord(ascii_version)

    # Apply manual overrides for special cases
    fallback_map.update(get_special_overrides())

    return fallback_map


def get_special_overrides():
    """Get special case overrides that need different handling than unidecode
    provides... or doesn't provide at all.

    Returns a dict mapping code points to fallback ordinals.  A value of 0
    marks a code point to be excluded; organize_by_pages() drops those.
    """

    overrides = {}

    # Multi-character unidecode output
    # These map to single chars instead of unidecode's multiple-char mappings
    # In a terminal fallback context, we need a single character rather than multiple
    overrides[0x00C6] = ord('E')  # Æ LATIN CAPITAL LETTER AE -> E (unidecode: "AE")
    overrides[0x00E6] = ord('e')  # æ LATIN SMALL LETTER AE -> e (unidecode: "ae")
    overrides[0x0152] = ord('E')  # Œ LATIN CAPITAL LIGATURE OE -> E (unidecode: "OE")
    overrides[0x0153] = ord('e')  # œ LATIN SMALL LETTER LIGATURE OE -> e (unidecode: "oe")
    overrides[0x00DF] = ord('s')  # ß LATIN SMALL LETTER SHARP S -> s (unidecode: "ss")

    # Comparison operators that unidecode renders as multiple characters
    overrides[0x2264] = ord('<')  # ≤ LESS-THAN OR EQUAL TO -> < (unidecode: "<=")
    overrides[0x2265] = ord('>')  # ≥ GREATER-THAN OR EQUAL TO -> > (unidecode: ">=")

    # Unidecode returns an empty string for these
    overrides[0x2260] = ord('#')  # ≠ NOT EQUAL TO -> # (unidecode: empty string)

    # Quadrant block characters that unidecode doesn't map
    for cp in range(0x2596, 0x259F+1):
        overrides[cp] = ord('#')  # ▖ ▗ ▘ ▙ etc. - map to # (unidecode: empty string)

    # Directional arrows
    # These provide better semantic meaning than unidecode's mappings
    overrides[0x2192] = ord('>')  # → RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0x2190] = ord('<')  # ← LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0x2191] = ord('^')  # ↑ UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0x2193] = ord('v')  # ↓ DOWNWARDS ARROW -> v (unidecode: "|")

    # Double arrows with their directional semantic mappings
    overrides[0x21D0] = ord('<')  # ⇐ LEFTWARDS DOUBLE ARROW -> <
    overrides[0x21D1] = ord('^')  # ⇑ UPWARDS DOUBLE ARROW -> ^
    overrides[0x21D2] = ord('>')  # ⇒ RIGHTWARDS DOUBLE ARROW -> >
    overrides[0x21D3] = ord('v')  # ⇓ DOWNWARDS DOUBLE ARROW -> v

    # Halfwidth arrows
    # These need the same treatment as their normal-width counterparts
    overrides[0xFFE9] = ord('<')  # ￩ HALFWIDTH LEFTWARDS ARROW -> < (unidecode: "-")
    overrides[0xFFEA] = ord('^')  # ￪ HALFWIDTH UPWARDS ARROW -> ^ (unidecode: "|")
    overrides[0xFFEB] = ord('>')  # ￫ HALFWIDTH RIGHTWARDS ARROW -> > (unidecode: "-")
    overrides[0xFFEC] = ord('v')  # ￬ HALFWIDTH DOWNWARDS ARROW -> v (unidecode: "|")

    # Currency symbols - each mapped to a representative letter
    overrides[0x00A2] = ord('c')  # ¢ CENT SIGN -> c
    overrides[0x00A3] = ord('L')  # £ POUND SIGN -> L
    overrides[0x00A5] = ord('Y')  # ¥ YEN SIGN -> Y
    overrides[0x20AC] = ord('E')  # € EURO SIGN -> E

    # Symbols mapped to letters
    overrides[0x00A7] = ord('S')  # § SECTION SIGN -> S
    overrides[0x00A9] = ord('C')  # © COPYRIGHT SIGN -> C
    overrides[0x00AE] = ord('R')  # ® REGISTERED SIGN -> R
    overrides[0x2122] = ord('T')  # ™ TRADE MARK SIGN -> T

    # Degree-related symbols
    overrides[0x00B0] = ord('o')  # ° DEGREE SIGN -> o
    overrides[0x2103] = ord('C')  # ℃ DEGREE CELSIUS -> C
    overrides[0x2109] = ord('F')  # ℉ DEGREE FAHRENHEIT -> F

    # Angle quotation marks
    overrides[0x00AB] = ord('<')  # « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK -> <
    overrides[0x00BB] = ord('>')  # » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK -> >

    # Operators with circular shape
    overrides[0x2218] = ord('o')  # ∘ RING OPERATOR -> o
    overrides[0x2219] = ord('.')  # ∙ BULLET OPERATOR -> .

    # Negated mathematical symbols (preserving the negation semantics)
    # Negated symbols mapped to exclamation mark (semantically "not")
    for cp in (0x2204, 0x2209, 0x220C, 0x2224, 0x2226, 0x226E, 0x226F, 0x2280, 0x2281, 0x2284, 0x2285):
        overrides[cp] = ord('!')  # Negated math symbols -> ! (not)

    # Negated symbols mapped to hash sign (semantically "not equal")
    for cp in (0x2241, 0x2244, 0x2249, 0x2262, 0x2268, 0x2269, 0x226D, 0x228A, 0x228B):
        overrides[cp] = ord('#')  # Negated equality symbols -> # (not equal)

    # Negated arrows - all mapped to exclamation mark
    for cp in (0x219A, 0x219B, 0x21AE, 0x21CD, 0x21CE, 0x21CF):
        overrides[cp] = ord('!')  # Negated arrows -> ! (not)

    # Dashes and hyphens
    for cp in (0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2043, 0x2052):
        overrides[cp] = ord('-')  # Dashes and hyphens -> -

    # Question mark punctuation
    for cp in (0x203D, 0x2047, 0x2048):
        overrides[cp] = ord('?')  # Question marks -> ?

    # Exclamation mark punctuation
    for cp in (0x203C, 0x2049):
        overrides[cp] = ord('!')  # Exclamation marks -> !

    # Asterisk-like symbols
    for cp in (0x2042, 0x2051, 0x2055):
        overrides[cp] = ord('*')

    # Other specific punctuation with unique mappings
    overrides[0x201E] = ord('"')  # „ DOUBLE LOW-9 QUOTATION MARK
    overrides[0x2023] = ord('>')  # ‣ TRIANGULAR BULLET
    overrides[0x2026] = ord('.')  # … HORIZONTAL ELLIPSIS
    overrides[0x2033] = ord('"')  # ″ DOUBLE PRIME
    overrides[0x204B] = ord('P')  # ⁋ REVERSED PILCROW SIGN
    overrides[0x204C] = ord('<')  # ⁌ BLACK LEFTWARDS BULLET
    overrides[0x204D] = ord('>')  # ⁍ BLACK RIGHTWARDS BULLET
    overrides[0x204F] = ord(';')  # ⁏ REVERSED SEMICOLON
    overrides[0x205B] = ord(':')  # ⁛ FOUR DOT MARK

    # Check marks
    overrides[0x2713] = ord('v')  # ✓ CHECK MARK
    overrides[0x2714] = ord('V')  # ✔ HEAVY CHECK MARK

    # X marks - lowercase for regular, uppercase for heavy
    for cp in (0x2715, 0x2717):
        overrides[cp] = ord('x')  # Regular X marks -> x
    for cp in (0x2716, 0x2718):
        overrides[cp] = ord('X')  # Heavy X marks -> X

    # Stars and asterisk-like symbols mapped to '*'
    for cp in (0x2605, 0x2606, 0x262A, 0x269D, 0x2698):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2721, 0x2746+1):
        overrides[cp] = ord('*')  # All star and asterisk symbols -> *
    for cp in range(0x2749, 0x274B+1):
        overrides[cp] = ord('*')  # Last set of asterisk symbols -> *
    for cp in (0x229B, 0x22C6, 0x235F, 0x2363):
        overrides[cp] = ord('*')  # Star operators -> *

    # Special exclusions with fallback value of 0
    # These will be filtered out in organize_by_pages()

    # Exclude U+2028 (LINE SEPARATOR)
    overrides[0x2028] = 0  # LINE SEPARATOR (unidecode: '\n')

    return overrides


def organize_by_pages(fallback_map):
    """Organize the fallback mappings by their high byte (page).

    Takes a dict of code point -> fallback ordinal and returns a
    defaultdict mapping each page (code >> 8) to a list of
    (offset, fallback) tuples sorted by offset (code & 0xFF).
    Entries whose fallback is 0 are treated as exclusions and dropped.
    """
    # Group by high byte (page)
    page_groups = defaultdict(list)
    for code, fallback in fallback_map.items():
        # Skip characters with fallback value of 0 (excluded characters)
        if fallback == 0:
            continue

        page = code >> 8      # Get the high byte (page)
        offset = code & 0xFF  # Get the low byte (offset within page)
        page_groups[page].append((offset, fallback))

    # Sort each page's entries by offset
    for entries in page_groups.values():
        entries.sort()

    return page_groups


def compress_ranges(page_groups):
    """Compress consecutive entries with the same fallback character into ranges.

    A range is only compressed if it contains 3 or more consecutive entries.
    Such a run is emitted as two entries: (first_offset, RANGE_MARKER)
    followed by (last_offset, fallback).  Shorter runs are copied unchanged.

    Takes a mapping of page -> sorted list of (offset, fallback) tuples and
    returns a plain dict of page -> compressed list of tuples.
    """

    compressed_pages = {}

    for page, entries in page_groups.items():
        compressed_entries = []
        i = 0
        while i < len(entries):
            start_offset, fallback = entries[i]

            # Look ahead to find consecutive entries with the same fallback
            j = i + 1
            while (j < len(entries) and
                   entries[j][0] == entries[j-1][0] + 1 and  # consecutive offsets
                   entries[j][1] == fallback):               # same fallback
                j += 1

            # Calculate the range end
            end_offset = entries[j-1][0]

            # If we found a range with 3 or more entries (worth compressing)
            if j - i >= 3:
                # Add a range entry
                compressed_entries.append((start_offset, RANGE_MARKER))
                compressed_entries.append((end_offset, fallback))
            else:
                # Add the individual entries as is
                compressed_entries.extend(entries[i:j])

            i = j

        compressed_pages[page] = compressed_entries

    return compressed_pages


def cp_name(cp):
    """Get the Unicode character name for a code point.

    Falls back to the "U+XXXX" notation when the code point has no name
    in the unicodedata database or is outside the range chr() accepts.
    """
    try:
        return unicodedata.name(chr(cp))
    except (ValueError, OverflowError):
        return f"U+{cp:04X}"


def generate_fallback_tables(out_file=DEFAULT_OUT_FILE):
    """Generate the fallback character tables.

    Builds the fallback map, groups it by page, compresses runs, and writes
    the resulting C header to out_file.  Progress counts are printed to
    stdout.  Raises ImportError if unidecode is unavailable and OSError if
    the output file cannot be written.
    """
    # Generate fallback map using unidecode
    fallback_map = generate_fallback_map()
    print(f"Generated {len(fallback_map)} total fallback mappings")

    # Organize by pages
    page_groups = organize_by_pages(fallback_map)
    print(f"Organized into {len(page_groups)} pages")

    # Compress ranges
    compressed_pages = compress_ranges(page_groups)
    total_compressed_entries = sum(len(entries) for entries in compressed_pages.values())
    print(f"Total compressed entries: {total_compressed_entries}")

    # Only the base name goes into the header comment so that the generated
    # file does not depend on the directory given with -o.
    out_name = Path(out_file).name

    # Create output file
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_name} - Unicode character fallback table
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 * Unidecode Version: {unidecode_version}
 *
 * This file contains optimized tables that map complex Unicode characters
 * to simpler fallback characters for terminal display when corresponding
 * glyphs are unavailable.
 */

static const struct ucs_page_desc ucs_fallback_pages[] = {{
""")

        # Convert compressed_pages to a sorted list of (page, entries) tuples
        sorted_pages = sorted(compressed_pages.items())

        # Track the start index for each page
        start_index = 0

        # Write page descriptors
        for page, entries in sorted_pages:
            count = len(entries)
            f.write(f"\t{{ 0x{page:02X}, {count}, {start_index} }},\n")
            start_index += count

        # Write entries array
        f.write("""\
};

/* Page entries array (referenced by page descriptors) */
static const struct ucs_page_entry ucs_fallback_entries[] = {
""")

        # Write all entries
        for page, entries in sorted_pages:
            page_hex = f"0x{page:02X}"
            f.write(f"\t/* Entries for page {page_hex} */\n")

            for offset, fallback in entries:
                # Rebuild the full code point to name it in the comment
                codepoint = (page << 8) | offset

                if fallback == RANGE_MARKER:
                    comment = f"{cp_name(codepoint)} -> ..."
                else:
                    comment = f"{cp_name(codepoint)} -> '{chr(fallback)}'"
                f.write(f"\t{{ 0x{offset:02X}, 0x{fallback:02X} }}, /* {comment} */\n")

        f.write(f"""\
}};

#define UCS_PAGE_ENTRY_RANGE_MARKER {RANGE_MARKER}
""")


def main():
    """Parse the command line and generate the fallback table header."""
    parser = argparse.ArgumentParser(description="Generate Unicode fallback character tables")
    parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
                        help=f"Output file name (default: {DEFAULT_OUT_FILE})")
    args = parser.parse_args()

    try:
        generate_fallback_tables(out_file=args.output_file)
    except ImportError as err:
        # Missing unidecode: report it without a traceback
        sys.exit(f"{this_file}: error: {err}")


if __name__ == "__main__":
    main()