Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/irq/64: Split the IRQ stack into its own pages

Currently, the IRQ stack is hardcoded as the first page of the percpu
area, and the stack canary lives on the IRQ stack. The former gets in
the way of adding an IRQ stack guard page, and the latter is a potential
weakness in the stack canary mechanism.

Split the IRQ stack into its own private percpu pages.

[ tglx: Make 64 and 32 bit share struct irq_stack ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Feng Tang <feng.tang@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jordan Borgner <mail@jordan-borgner.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Maran Wilson <maran.wilson@oracle.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Pu Wen <puwen@hygon.cn>
Cc: "Rafael Ávila de Espíndola" <rafael@espindo.la>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: x86-ml <x86@kernel.org>
Cc: xen-devel@lists.xenproject.org
Link: https://lkml.kernel.org/r/20190414160146.267376656@linutronix.de

Authored by Andy Lutomirski; committed by Borislav Petkov.

Commit: e6401c13 (parent: 0ac26104)

+39 -44
+2 -2
arch/x86/entry/entry_64.S
··· 298 298 299 299 #ifdef CONFIG_STACKPROTECTOR 300 300 movq TASK_stack_canary(%rsi), %rbx 301 - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset 301 + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset 302 302 #endif 303 303 304 304 #ifdef CONFIG_RETPOLINE ··· 430 430 * it before we actually move ourselves to the IRQ stack. 431 431 */ 432 432 433 - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) 433 + movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) 434 434 movq PER_CPU_VAR(hardirq_stack_ptr), %rsp 435 435 436 436 #ifdef CONFIG_DEBUG_ENTRY
+14 -18
arch/x86/include/asm/processor.h
··· 367 367 #define __KERNEL_TSS_LIMIT \ 368 368 (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) 369 369 370 + /* Per CPU interrupt stacks */ 371 + struct irq_stack { 372 + char stack[IRQ_STACK_SIZE]; 373 + } __aligned(IRQ_STACK_SIZE); 374 + 375 + DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); 376 + 370 377 #ifdef CONFIG_X86_32 371 378 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); 372 379 #else ··· 382 375 #endif 383 376 384 377 #ifdef CONFIG_X86_64 385 - union irq_stack_union { 386 - char irq_stack[IRQ_STACK_SIZE]; 378 + struct fixed_percpu_data { 387 379 /* 388 380 * GCC hardcodes the stack canary as %gs:40. Since the 389 381 * irq_stack is the object at %gs:0, we reserve the bottom 390 382 * 48 bytes of the irq stack for the canary. 391 383 */ 392 - struct { 393 - char gs_base[40]; 394 - unsigned long stack_canary; 395 - }; 384 + char gs_base[40]; 385 + unsigned long stack_canary; 396 386 }; 397 387 398 - DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; 399 - DECLARE_INIT_PER_CPU(irq_stack_union); 388 + DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; 389 + DECLARE_INIT_PER_CPU(fixed_percpu_data); 400 390 401 391 static inline unsigned long cpu_kernelmode_gs_base(int cpu) 402 392 { 403 - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); 393 + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); 404 394 } 405 395 406 - DECLARE_PER_CPU(char *, hardirq_stack_ptr); 407 396 DECLARE_PER_CPU(unsigned int, irq_count); 408 397 extern asmlinkage void ignore_sysret(void); 409 398 ··· 421 418 }; 422 419 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 423 420 #endif 424 - /* 425 - * per-CPU IRQ handling stacks 426 - */ 427 - struct irq_stack { 428 - char stack[IRQ_STACK_SIZE]; 429 - } __aligned(IRQ_STACK_SIZE); 430 - 431 - DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); 421 + /* Per CPU softirq stack pointer */ 432 422 DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); 433 423 #endif /* X86_64 */ 434 424
+3 -3
arch/x86/include/asm/stackprotector.h
··· 13 13 * On x86_64, %gs is shared by percpu area and stack canary. All 14 14 * percpu symbols are zero based and %gs points to the base of percpu 15 15 * area. The first occupant of the percpu area is always 16 - * irq_stack_union which contains stack_canary at offset 40. Userland 16 + * fixed_percpu_data which contains stack_canary at offset 40. Userland 17 17 * %gs is always saved and restored on kernel entry and exit using 18 18 * swapgs, so stack protector doesn't add any complexity there. 19 19 * ··· 64 64 u64 tsc; 65 65 66 66 #ifdef CONFIG_X86_64 67 - BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); 67 + BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); 68 68 #endif 69 69 /* 70 70 * We both use the random pool and the current TSC as a source ··· 79 79 80 80 current->stack_canary = canary; 81 81 #ifdef CONFIG_X86_64 82 - this_cpu_write(irq_stack_union.stack_canary, canary); 82 + this_cpu_write(fixed_percpu_data.stack_canary, canary); 83 83 #else 84 84 this_cpu_write(stack_canary.canary, canary); 85 85 #endif
+1 -1
arch/x86/kernel/asm-offsets_64.c
··· 73 73 BLANK(); 74 74 75 75 #ifdef CONFIG_STACKPROTECTOR 76 - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); 76 + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); 77 77 BLANK(); 78 78 #endif 79 79
+4 -4
arch/x86/kernel/cpu/common.c
··· 1498 1498 __setup("clearcpuid=", setup_clearcpuid); 1499 1499 1500 1500 #ifdef CONFIG_X86_64 1501 - DEFINE_PER_CPU_FIRST(union irq_stack_union, 1502 - irq_stack_union) __aligned(PAGE_SIZE) __visible; 1503 - EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); 1501 + DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, 1502 + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; 1503 + EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); 1504 1504 1505 1505 /* 1506 1506 * The following percpu variables are hot. Align current_task to ··· 1510 1510 &init_task; 1511 1511 EXPORT_PER_CPU_SYMBOL(current_task); 1512 1512 1513 - DEFINE_PER_CPU(char *, hardirq_stack_ptr); 1513 + DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); 1514 1514 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; 1515 1515 1516 1516 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+1 -1
arch/x86/kernel/head_64.S
··· 265 265 GLOBAL(initial_code) 266 266 .quad x86_64_start_kernel 267 267 GLOBAL(initial_gs) 268 - .quad INIT_PER_CPU_VAR(irq_stack_union) 268 + .quad INIT_PER_CPU_VAR(fixed_percpu_data) 269 269 GLOBAL(initial_stack) 270 270 /* 271 271 * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+4 -1
arch/x86/kernel/irq_64.c
··· 23 23 #include <asm/io_apic.h> 24 24 #include <asm/apic.h> 25 25 26 + DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; 27 + DECLARE_INIT_PER_CPU(irq_stack_backing_store); 28 + 26 29 int sysctl_panic_on_stackoverflow; 27 30 28 31 /* ··· 93 90 94 91 static int map_irq_stack(unsigned int cpu) 95 92 { 96 - void *va = per_cpu_ptr(irq_stack_union.irq_stack, cpu); 93 + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); 97 94 98 95 per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; 99 96 return 0;
-5
arch/x86/kernel/setup_percpu.c
··· 244 244 per_cpu(x86_cpu_to_logical_apicid, cpu) = 245 245 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); 246 246 #endif 247 - #ifdef CONFIG_X86_64 248 - per_cpu(hardirq_stack_ptr, cpu) = 249 - per_cpu(irq_stack_union.irq_stack, cpu) + 250 - IRQ_STACK_SIZE; 251 - #endif 252 247 #ifdef CONFIG_NUMA 253 248 per_cpu(x86_cpu_to_node_map, cpu) = 254 249 early_per_cpu_map(x86_cpu_to_node_map, cpu);
+4 -3
arch/x86/kernel/vmlinux.lds.S
··· 403 403 */ 404 404 #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load 405 405 INIT_PER_CPU(gdt_page); 406 - INIT_PER_CPU(irq_stack_union); 406 + INIT_PER_CPU(fixed_percpu_data); 407 + INIT_PER_CPU(irq_stack_backing_store); 407 408 408 409 /* 409 410 * Build-time check on the image size: ··· 413 412 "kernel image bigger than KERNEL_IMAGE_SIZE"); 414 413 415 414 #ifdef CONFIG_SMP 416 - . = ASSERT((irq_stack_union == 0), 417 - "irq_stack_union is not at start of per-cpu area"); 415 + . = ASSERT((fixed_percpu_data == 0), 416 + "fixed_percpu_data is not at start of per-cpu area"); 418 417 #endif 419 418 420 419 #endif /* CONFIG_X86_32 */
+1 -1
arch/x86/tools/relocs.c
··· 738 738 * __per_cpu_load 739 739 * 740 740 * The "gold" linker incorrectly associates: 741 - * init_per_cpu__irq_stack_union 741 + * init_per_cpu__fixed_percpu_data 742 742 * init_per_cpu__gdt_page 743 743 */ 744 744 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
+5 -5
arch/x86/xen/xen-head.S
··· 40 40 #ifdef CONFIG_X86_64 41 41 /* Set up %gs. 42 42 * 43 - * The base of %gs always points to the bottom of the irqstack 44 - * union. If the stack protector canary is enabled, it is 45 - * located at %gs:40. Note that, on SMP, the boot cpu uses 46 - * init data section till per cpu areas are set up. 43 + * The base of %gs always points to fixed_percpu_data. If the 44 + * stack protector canary is enabled, it is located at %gs:40. 45 + * Note that, on SMP, the boot cpu uses init data section until 46 + * the per cpu areas are set up. 47 47 */ 48 48 movl $MSR_GS_BASE,%ecx 49 - movq $INIT_PER_CPU_VAR(irq_stack_union),%rax 49 + movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax 50 50 cdq 51 51 wrmsr 52 52 #endif