Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Leverage Python's unicodedata module to generate ucs_width_table.h
5
6import unicodedata
7import sys
8import argparse
9
# Base name of this script; it is recorded in the banner of the generated file
from pathlib import Path
this_file = Path(__file__).name

# Name of the generated header when -o/--output is not given
DEFAULT_OUT_FILE = "ucs_width_table.h"
16
# --- Global Constants for Width Assignments ---

# Individual code points that are always assigned a width of zero
KNOWN_ZERO_WIDTH = (
    0x200B,  # ZERO WIDTH SPACE
    0x200C,  # ZERO WIDTH NON-JOINER
    0x200D,  # ZERO WIDTH JOINER
    0x2060,  # WORD JOINER
    0xFEFF,  # ZERO WIDTH NO-BREAK SPACE (BOM)
)
27
# Emoji modifiers and components that are forced to zero width.
# NOTE: Going by their East Asian Width property alone, some of these would
# be single-width.  They are overridden on purpose because they act as
# modifiers inside emoji sequences.
# Each entry is an inclusive (first, last) code point pair.
EMOJI_ZERO_WIDTH = [
    # Skin tone modifiers
    (0x1F3FB, 0x1F3FF),  # Emoji modifiers (skin tones)

    # Variation selectors (note: VS16 is treated specially in vt.c)
    (0xFE00, 0xFE0F),    # Variation Selectors 1-16

    # Gender and hair style modifiers.
    # Single-width by Unicode properties, but zero-width when they are part
    # of an emoji.
    (0x2640, 0x2640),    # Female sign
    (0x2642, 0x2642),    # Male sign
    (0x26A7, 0x26A7),    # Transgender symbol
    (0x1F9B0, 0x1F9B3),  # Hair components (red, curly, bald, white)

    # Tag characters
    (0xE0020, 0xE007E),  # Tags
]
50
# Regional indicator symbols A-Z (flag components), as an inclusive
# (first, last) code point pair
REGIONAL_INDICATORS = (0x1F1E6, 0x1F1FF)
53
# Emoji ranges that are forced to double width.
#
# Unicode Standard Annex #11 gives many emoji an East Asian Width of N
# (Neutral), which would make them single-width.  They are deliberately
# overridden to double-width here.  References:
# 1. Unicode Technical Standard #51: Unicode Emoji
#    (https://www.unicode.org/reports/tr51/)
# 2. Principle of "emoji presentation" in the W3C CSS Text specification
#    (https://drafts.csswg.org/css-text-3/#character-properties)
# 3. Terminal emulator implementations (iTerm2, Windows Terminal, etc.) which
#    universally render emoji as double-width characters regardless of their
#    Unicode EAW property
# 4. W3C Work Item: Requirements for Japanese Text Layout - Section 3.8.1
#    Emoji width (https://www.w3.org/TR/jlreq/)
#
# Each entry is an inclusive (first, last) code point pair.
EMOJI_RANGES = [
    (0x1F000, 0x1F02F),  # Mahjong Tiles (EAW: N, but displayed as double-width)
    (0x1F0A0, 0x1F0FF),  # Playing Cards (EAW: N, but displayed as double-width)
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x1F780, 0x1F7FF),  # Geometric Shapes Extended
    (0x1F800, 0x1F8FF),  # Supplemental Arrows-C
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1FA00, 0x1FA6F),  # Chess Symbols
    (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
]
81
def create_width_tables():
    """
    Creates Unicode character width tables and returns the data structures.

    Every code point from U+0000 through U+10FFFF is given a width of 0, 1
    or 2 by applying these steps in order:

    1. Code points covered by EMOJI_ZERO_WIDTH get width 0.
    2. Code points covered by REGIONAL_INDICATORS get width 1.
    3. Every code point not assigned by steps 1-2 is classified from its
       Unicode properties: combining marks (general category M*), format
       characters (Cf) and members of KNOWN_ZERO_WIDTH get width 0;
       East Asian Width F or W gets width 2; everything else gets width 1.
    4. Code points covered by EMOJI_RANGES are raised to width 2 unless
       their width is 0.

    Returns:
        tuple: (zero_width_ranges, double_width_ranges), each a list of
        inclusive (start, end) code point pairs in ascending order, with
        consecutive code points merged into a single pair.
    """

    # Maps code points to width (0, 1, 2)
    width_map = {}

    # Mark emoji modifiers as zero-width
    for start, end in EMOJI_ZERO_WIDTH:
        for cp in range(start, end + 1):
            width_map[cp] = 0

    # Mark all regional indicators as single-width as they are usually paired
    # providing a combined width of 2 when displayed together.
    start, end = REGIONAL_INDICATORS
    for cp in range(start, end + 1):
        width_map[cp] = 1

    # Walk the full Unicode range (BMP + supplementary planes).  chr() and
    # the unicodedata lookups accept every value in this range, so no
    # exception handling is needed here.
    for cp in range(0x110000):
        # The explicit overrides above take precedence
        if cp in width_map:
            continue

        char = chr(cp)
        category = unicodedata.category(char)

        # Combining marks and format characters occupy no cell.  Since we
        # have no support for bidirectional text, all format characters
        # (category Cf) can be treated as zero-width for simplicity.
        if category.startswith('M') or category == 'Cf' or cp in KNOWN_ZERO_WIDTH:
            width_map[cp] = 0
        # East Asian Width: Fullwidth or Wide
        elif unicodedata.east_asian_width(char) in ('F', 'W'):
            width_map[cp] = 2
        # Narrow, Halfwidth, Neutral, Ambiguous and anything unknown
        else:
            width_map[cp] = 1

    # Process Emoji - generally double-width, but never override zero-width
    for start, end in EMOJI_RANGES:
        for cp in range(start, end + 1):
            if width_map.get(cp) != 0:
                width_map[cp] = 2

    # Collapse all code points of one width into sorted inclusive ranges
    def ranges_optimize(width_data, target_width):
        points = sorted(cp for cp, width in width_data.items() if width == target_width)
        if not points:
            return []

        ranges = []
        start = prev = points[0]

        for cp in points[1:]:
            # A gap ends the current run
            if cp > prev + 1:
                ranges.append((start, prev))
                start = cp
            prev = cp

        # Add the last range
        ranges.append((start, prev))
        return ranges

    # Extract ranges for each width
    zero_width_ranges = ranges_optimize(width_map, 0)
    double_width_ranges = ranges_optimize(width_map, 2)

    return zero_width_ranges, double_width_ranges
188
def write_tables(zero_width_ranges, double_width_ranges, out_file=DEFAULT_OUT_FILE):
    """
    Write the generated tables to C header file.

    Each list of ranges is split at the BMP boundary: ranges ending at or
    below U+FFFF go into a table of struct ucs_interval16, ranges starting
    above U+FFFF go into a table of struct ucs_interval32, and a range that
    straddles the boundary is cut in two.  Four tables are emitted, in this
    order: zero-width BMP, zero-width non-BMP, double-width BMP,
    double-width non-BMP.

    Args:
        zero_width_ranges: List of (start, end) ranges for zero-width characters
        double_width_ranges: List of (start, end) ranges for double-width characters
        out_file: Output file name (default: DEFAULT_OUT_FILE).  The value is
            also copied verbatim into the banner comment of the file.

    Raises:
        OSError: If out_file cannot be opened or written.
    """

    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
    def split_ranges_by_size(ranges):
        bmp_ranges = []
        non_bmp_ranges = []

        for start, end in ranges:
            if end <= 0xFFFF:
                bmp_ranges.append((start, end))
            elif start > 0xFFFF:
                non_bmp_ranges.append((start, end))
            else:
                # Split the range at 0xFFFF
                bmp_ranges.append((start, 0xFFFF))
                non_bmp_ranges.append((0x10000, end))

        return bmp_ranges, non_bmp_ranges

    # Function to generate code point description comments
    def get_code_point_comment(start, end):
        try:
            start_char_desc = unicodedata.name(chr(start))
            if start == end:
                return f"/* {start_char_desc} */"
            end_char_desc = unicodedata.name(chr(end))
            return f"/* {start_char_desc} - {end_char_desc} */"
        except ValueError:
            # unicodedata.name() raises ValueError for code points without a
            # name; fall back to the numeric form for the whole range.
            if start == end:
                return f"/* U+{start:04X} */"
            return f"/* U+{start:04X} - U+{end:04X} */"

    # Render one C array definition; hex_digits is the minimum number of hex
    # digits used for each code point.
    def format_table(title, c_type, c_name, ranges, hex_digits):
        lines = [
            f"/* {title} */\n",
            f"static const struct {c_type} {c_name}[] = {{\n",
        ]
        for start, end in ranges:
            comment = get_code_point_comment(start, end)
            lines.append(
                f"\t{{ 0x{start:0{hex_digits}X}, 0x{end:0{hex_digits}X} }}, {comment}\n"
            )
        lines.append("};\n")
        return "".join(lines)

    # Split ranges into BMP and non-BMP
    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)

    header = f"""\
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * {out_file} - Unicode character width
 *
 * Auto-generated by {this_file}
 *
 * Unicode Version: {unicodedata.unidata_version}
 */
"""

    # (title, element type, array name, ranges, hex digits), in output order
    tables = [
        ("Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF)",
         "ucs_interval16", "ucs_zero_width_bmp_ranges", zero_width_bmp, 4),
        ("Zero-width character ranges (non-BMP, U+10000 and above)",
         "ucs_interval32", "ucs_zero_width_non_bmp_ranges", zero_width_non_bmp, 5),
        ("Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF)",
         "ucs_interval16", "ucs_double_width_bmp_ranges", double_width_bmp, 4),
        ("Double-width character ranges (non-BMP, U+10000 and above)",
         "ucs_interval32", "ucs_double_width_non_bmp_ranges", double_width_non_bmp, 5),
    ]

    # Build the complete text before touching the file so that a failure
    # while formatting does not leave a truncated header behind.  The banner
    # and the tables are each separated by one blank line.
    sections = [header]
    sections.extend(format_table(*table) for table in tables)
    content = "\n".join(sections)

    # Generate C tables
    with open(out_file, 'w', encoding="utf-8") as f:
        f.write(content)
289
def main():
    """
    Command-line entry point.

    Parses the -o/--output option, generates the width tables, writes them
    to the selected header file and prints a short summary to stdout.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Generate Unicode width tables")
    parser.add_argument(
        "-o", "--output",
        dest="output_file",
        default=DEFAULT_OUT_FILE,
        help=f"Output file name (default: {DEFAULT_OUT_FILE})",
    )
    args = parser.parse_args()

    # Write tables to header file
    zero_width_ranges, double_width_ranges = create_width_tables()
    write_tables(zero_width_ranges, double_width_ranges, out_file=args.output_file)

    # Ranges are inclusive, hence the +1 when counting code points
    zero_width_count = sum(last - first + 1 for first, last in zero_width_ranges)
    double_width_count = sum(last - first + 1 for first, last in double_width_ranges)

    # Print summary
    print(f"Generated {args.output_file} with:")
    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
    print(f"- Unicode Version: {unicodedata.unidata_version}")


if __name__ == "__main__":
    main()