Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Kernel utf-8 handling

This patch fixes dead keys and copy/paste of non-ASCII characters in UTF-8
mode on the Linux console. More details about the original patch are at:
http://chris.heathens.co.nz/linux/utf8.html

Previously posted at:
(Oldest) http://lkml.org/lkml/2003/5/31/148
http://lkml.org/lkml/2005/12/24/69
(Recent) http://lkml.org/lkml/2006/8/7/75

[bunk@stusta.de: make drivers/char/selection.c:store_utf8() static]
Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: Alexander E. Patrakov <patrakov@ums.usu.ru>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Jan Engelhardt and committed by Linus Torvalds
759448f4 aa0ac365

+134 -23
+72 -6
drivers/char/consolemap.c
··· 177 177 unsigned long refcount; 178 178 unsigned long sum; 179 179 unsigned char *inverse_translations[4]; 180 + u16 *inverse_trans_unicode; 180 181 int readonly; 181 182 }; 182 183 ··· 208 207 } 209 208 } 210 209 210 + static void set_inverse_trans_unicode(struct vc_data *conp, 211 + struct uni_pagedir *p) 212 + { 213 + int i, j, k, glyph; 214 + u16 **p1, *p2; 215 + u16 *q; 216 + 217 + if (!p) return; 218 + q = p->inverse_trans_unicode; 219 + if (!q) { 220 + q = p->inverse_trans_unicode = 221 + kmalloc(MAX_GLYPH * sizeof(u16), GFP_KERNEL); 222 + if (!q) 223 + return; 224 + } 225 + memset(q, 0, MAX_GLYPH * sizeof(u16)); 226 + 227 + for (i = 0; i < 32; i++) { 228 + p1 = p->uni_pgdir[i]; 229 + if (!p1) 230 + continue; 231 + for (j = 0; j < 32; j++) { 232 + p2 = p1[j]; 233 + if (!p2) 234 + continue; 235 + for (k = 0; k < 64; k++) { 236 + glyph = p2[k]; 237 + if (glyph >= 0 && glyph < MAX_GLYPH 238 + && q[glyph] < 32) 239 + q[glyph] = (i << 11) + (j << 6) + k; 240 + } 241 + } 242 + } 243 + } 244 + 211 245 unsigned short *set_translate(int m, struct vc_data *vc) 212 246 { 213 247 inv_translate[vc->vc_num] = m; ··· 253 217 * Inverse translation is impossible for several reasons: 254 218 * 1. The font<->character maps are not 1-1. 255 219 * 2. The text may have been written while a different translation map 256 - * was active, or using Unicode. 220 + * was active. 257 221 * Still, it is now possible to a certain extent to cut and paste non-ASCII. 
258 222 */ 259 - unsigned char inverse_translate(struct vc_data *conp, int glyph) 223 + u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode) 260 224 { 261 225 struct uni_pagedir *p; 226 + int m; 262 227 if (glyph < 0 || glyph >= MAX_GLYPH) 263 228 return 0; 264 - else if (!(p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc) || 265 - !p->inverse_translations[inv_translate[conp->vc_num]]) 229 + else if (!(p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc)) 266 230 return glyph; 267 - else 268 - return p->inverse_translations[inv_translate[conp->vc_num]][glyph]; 231 + else if (use_unicode) { 232 + if (!p->inverse_trans_unicode) 233 + return glyph; 234 + else 235 + return p->inverse_trans_unicode[glyph]; 236 + } else { 237 + m = inv_translate[conp->vc_num]; 238 + if (!p->inverse_translations[m]) 239 + return glyph; 240 + else 241 + return p->inverse_translations[m][glyph]; 242 + } 269 243 } 270 244 271 245 static void update_user_maps(void) ··· 289 243 p = (struct uni_pagedir *)*vc_cons[i].d->vc_uni_pagedir_loc; 290 244 if (p && p != q) { 291 245 set_inverse_transl(vc_cons[i].d, p, USER_MAP); 246 + set_inverse_trans_unicode(vc_cons[i].d, p); 292 247 q = p; 293 248 } 294 249 } ··· 399 352 for (i = 0; i < 4; i++) { 400 353 kfree(p->inverse_translations[i]); 401 354 p->inverse_translations[i] = NULL; 355 + } 356 + if (p->inverse_trans_unicode) { 357 + kfree(p->inverse_trans_unicode); 358 + p->inverse_trans_unicode = NULL; 402 359 } 403 360 } 404 361 ··· 562 511 563 512 for (i = 0; i <= 3; i++) 564 513 set_inverse_transl(vc, p, i); /* Update all inverse translations */ 514 + set_inverse_trans_unicode(vc, p); 565 515 566 516 return err; 567 517 } ··· 613 561 614 562 for (i = 0; i <= 3; i++) 615 563 set_inverse_transl(vc, p, i); /* Update all inverse translations */ 564 + set_inverse_trans_unicode(vc, p); 616 565 dflt = p; 617 566 return err; 618 567 } ··· 668 615 669 616 if (p) 670 617 p->readonly = rdonly; 618 + } 619 + 620 + /* may be called during 
an interrupt */ 621 + u32 conv_8bit_to_uni(unsigned char c) 622 + { 623 + /* 624 + * Always use USER_MAP. This function is used by the keyboard, 625 + * which shouldn't be affected by G0/G1 switching, etc. 626 + * If the user map still contains default values, i.e. the 627 + * direct-to-font mapping, then assume user is using Latin1. 628 + */ 629 + unsigned short uni = translations[USER_MAP][c]; 630 + return uni == (0xf000 | c) ? c : uni; 671 631 } 672 632 673 633 int
+18 -8
drivers/char/keyboard.c
··· 24 24 * 21-08-02: Converted to input API, major cleanup. (Vojtech Pavlik) 25 25 */ 26 26 27 + #include <linux/consolemap.h> 27 28 #include <linux/module.h> 28 29 #include <linux/sched.h> 29 30 #include <linux/tty.h> ··· 309 308 * Many other routines do put_queue, but I think either 310 309 * they produce ASCII, or they produce some user-assigned 311 310 * string, and in both cases we might assume that it is 312 - * in utf-8 already. UTF-8 is defined for words of up to 31 bits, 313 - * but we need only 16 bits here 311 + * in utf-8 already. 314 312 */ 315 - static void to_utf8(struct vc_data *vc, ushort c) 313 + static void to_utf8(struct vc_data *vc, uint c) 316 314 { 317 315 if (c < 0x80) 318 316 /* 0******* */ ··· 320 320 /* 110***** 10****** */ 321 321 put_queue(vc, 0xc0 | (c >> 6)); 322 322 put_queue(vc, 0x80 | (c & 0x3f)); 323 - } else { 323 + } else if (c < 0x10000) { 324 + if (c >= 0xD800 && c < 0xE000) 325 + return; 326 + if (c == 0xFFFF) 327 + return; 324 328 /* 1110**** 10****** 10****** */ 325 329 put_queue(vc, 0xe0 | (c >> 12)); 330 + put_queue(vc, 0x80 | ((c >> 6) & 0x3f)); 331 + put_queue(vc, 0x80 | (c & 0x3f)); 332 + } else if (c < 0x110000) { 333 + /* 11110*** 10****** 10****** 10****** */ 334 + put_queue(vc, 0xf0 | (c >> 18)); 335 + put_queue(vc, 0x80 | ((c >> 12) & 0x3f)); 326 336 put_queue(vc, 0x80 | ((c >> 6) & 0x3f)); 327 337 put_queue(vc, 0x80 | (c & 0x3f)); 328 338 } ··· 403 393 return d; 404 394 405 395 if (kbd->kbdmode == VC_UNICODE) 406 - to_utf8(vc, d); 396 + to_utf8(vc, conv_8bit_to_uni(d)); 407 397 else if (d < 0x100) 408 398 put_queue(vc, d); 409 399 ··· 417 407 { 418 408 if (diacr) { 419 409 if (kbd->kbdmode == VC_UNICODE) 420 - to_utf8(vc, diacr); 410 + to_utf8(vc, conv_8bit_to_uni(diacr)); 421 411 else if (diacr < 0x100) 422 412 put_queue(vc, diacr); 423 413 diacr = 0; ··· 627 617 return; 628 618 } 629 619 if (kbd->kbdmode == VC_UNICODE) 630 - to_utf8(vc, value); 620 + to_utf8(vc, conv_8bit_to_uni(value)); 631 621 else if (value 
< 0x100) 632 622 put_queue(vc, value); 633 623 } ··· 785 775 /* kludge */ 786 776 if (up_flag && shift_state != old_state && npadch != -1) { 787 777 if (kbd->kbdmode == VC_UNICODE) 788 - to_utf8(vc, npadch & 0xffff); 778 + to_utf8(vc, npadch); 789 779 else 790 780 put_queue(vc, npadch & 0xff); 791 781 npadch = -1;
+40 -8
drivers/char/selection.c
··· 20 20 21 21 #include <asm/uaccess.h> 22 22 23 + #include <linux/kbd_kern.h> 23 24 #include <linux/vt_kern.h> 24 25 #include <linux/consolemap.h> 25 26 #include <linux/selection.h> ··· 35 34 /* Variables for selection control. */ 36 35 /* Use a dynamic buffer, instead of static (Dec 1994) */ 37 36 struct vc_data *sel_cons; /* must not be deallocated */ 37 + static int use_unicode; 38 38 static volatile int sel_start = -1; /* cleared by clear_selection */ 39 39 static int sel_end; 40 40 static int sel_buffer_lth; ··· 56 54 complement_pos(sel_cons, where); 57 55 } 58 56 59 - static unsigned char 57 + static u16 60 58 sel_pos(int n) 61 59 { 62 - return inverse_translate(sel_cons, screen_glyph(sel_cons, n)); 60 + return inverse_translate(sel_cons, screen_glyph(sel_cons, n), 61 + use_unicode); 63 62 } 64 63 65 64 /* remove the current selection highlight, if any, ··· 89 86 0xFF7FFFFF /* latin-1 accented letters, not division sign */ 90 87 }; 91 88 92 - static inline int inword(const unsigned char c) { 93 - return ( inwordLut[c>>5] >> (c & 0x1F) ) & 1; 89 + static inline int inword(const u16 c) { 90 + return c > 0xff || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1); 94 91 } 95 92 96 93 /* set inwordLut contents. Invoked by ioctl(). */ ··· 111 108 return (v > u) ? u : v; 112 109 } 113 110 111 + /* stores the char in UTF8 and returns the number of bytes used (1-3) */ 112 + static int store_utf8(u16 c, char *p) 113 + { 114 + if (c < 0x80) { 115 + /* 0******* */ 116 + p[0] = c; 117 + return 1; 118 + } else if (c < 0x800) { 119 + /* 110***** 10****** */ 120 + p[0] = 0xc0 | (c >> 6); 121 + p[1] = 0x80 | (c & 0x3f); 122 + return 2; 123 + } else { 124 + /* 1110**** 10****** 10****** */ 125 + p[0] = 0xe0 | (c >> 12); 126 + p[1] = 0x80 | ((c >> 6) & 0x3f); 127 + p[2] = 0x80 | (c & 0x3f); 128 + return 3; 129 + } 130 + } 131 + 114 132 /* set the current selection. Invoked by ioctl() or by kernel code. 
*/ 115 133 int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *tty) 116 134 { 117 135 struct vc_data *vc = vc_cons[fg_console].d; 118 136 int sel_mode, new_sel_start, new_sel_end, spc; 119 137 char *bp, *obp; 120 - int i, ps, pe; 138 + int i, ps, pe, multiplier; 139 + u16 c; 140 + struct kbd_struct *kbd = kbd_table + fg_console; 121 141 122 142 poke_blanked_console(); 123 143 ··· 184 158 clear_selection(); 185 159 sel_cons = vc_cons[fg_console].d; 186 160 } 161 + use_unicode = kbd && kbd->kbdmode == VC_UNICODE; 187 162 188 163 switch (sel_mode) 189 164 { ··· 267 240 sel_end = new_sel_end; 268 241 269 242 /* Allocate a new buffer before freeing the old one ... */ 270 - bp = kmalloc((sel_end-sel_start)/2+1, GFP_KERNEL); 243 + multiplier = use_unicode ? 3 : 1; /* chars can take up to 3 bytes */ 244 + bp = kmalloc((sel_end-sel_start)/2*multiplier+1, GFP_KERNEL); 271 245 if (!bp) { 272 246 printk(KERN_WARNING "selection: kmalloc() failed\n"); 273 247 clear_selection(); ··· 279 251 280 252 obp = bp; 281 253 for (i = sel_start; i <= sel_end; i += 2) { 282 - *bp = sel_pos(i); 283 - if (!isspace(*bp++)) 254 + c = sel_pos(i); 255 + if (use_unicode) 256 + bp += store_utf8(c, bp); 257 + else 258 + *bp++ = c; 259 + if (!isspace(c)) 284 260 obp = bp; 285 261 if (! ((i + 2) % vc->vc_size_row)) { 286 262 /* strip trailing blanks from line and add newline,
+4 -1
include/linux/consolemap.h
··· 8 8 #define IBMPC_MAP 2 9 9 #define USER_MAP 3 10 10 11 + #include <linux/types.h> 12 + 11 13 struct vc_data; 12 14 13 - extern unsigned char inverse_translate(struct vc_data *conp, int glyph); 15 + extern u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode); 14 16 extern unsigned short *set_translate(int m, struct vc_data *vc); 15 17 extern int conv_uni_to_pc(struct vc_data *conp, long ucs); 18 + extern u32 conv_8bit_to_uni(unsigned char c); 16 19 void console_map_init(void);