Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
"Misc fixes and updates:

- a handful of MDS documentation/comment updates

- a cleanup related to hweight interfaces

- a SEV guest fix for large pages

- a kprobes LTO fix

- and a final cleanup commit for vDSO HPET support removal"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/speculation/mds: Improve CPU buffer clear documentation
x86/speculation/mds: Revert CPU buffer clear on double fault exit
x86/kconfig: Disable CONFIG_GENERIC_HWEIGHT and remove __HAVE_ARCH_SW_HWEIGHT
x86/mm: Do not use set_{pud, pmd}_safe() when splitting a large page
x86/kprobes: Make trampoline_handler() global and visible
x86/vdso: Remove hpet_page from vDSO

+121 -103
+6 -38
Documentation/x86/mds.rst
··· 142 142 mds_user_clear. 143 143 144 144 The mitigation is invoked in prepare_exit_to_usermode() which covers 145 - most of the kernel to user space transitions. There are a few exceptions 146 - which are not invoking prepare_exit_to_usermode() on return to user 147 - space. These exceptions use the paranoid exit code. 145 + all but one of the kernel to user space transitions. The exception 146 + is when we return from a Non Maskable Interrupt (NMI), which is 147 + handled directly in do_nmi(). 148 148 149 - - Non Maskable Interrupt (NMI): 150 - 151 - Access to sensible data like keys, credentials in the NMI context is 152 - mostly theoretical: The CPU can do prefetching or execute a 153 - misspeculated code path and thereby fetching data which might end up 154 - leaking through a buffer. 155 - 156 - But for mounting other attacks the kernel stack address of the task is 157 - already valuable information. So in full mitigation mode, the NMI is 158 - mitigated on the return from do_nmi() to provide almost complete 159 - coverage. 160 - 161 - - Double fault (#DF): 162 - 163 - A double fault is usually fatal, but the ESPFIX workaround, which can 164 - be triggered from user space through modify_ldt(2) is a recoverable 165 - double fault. #DF uses the paranoid exit path, so explicit mitigation 166 - in the double fault handler is required. 167 - 168 - - Machine Check Exception (#MC): 169 - 170 - Another corner case is a #MC which hits between the CPU buffer clear 171 - invocation and the actual return to user. As this still is in kernel 172 - space it takes the paranoid exit path which does not clear the CPU 173 - buffers. So the #MC handler repopulates the buffers to some 174 - extent. Machine checks are not reliably controllable and the window is 175 - extremly small so mitigation would just tick a checkbox that this 176 - theoretical corner case is covered. To keep the amount of special 177 - cases small, ignore #MC. 
178 - 179 - - Debug Exception (#DB): 180 - 181 - This takes the paranoid exit path only when the INT1 breakpoint is in 182 - kernel space. #DB on a user space address takes the regular exit path, 183 - so no extra mitigation required. 149 + (The reason that NMI is special is that prepare_exit_to_usermode() can 150 + enable IRQs. In NMI context, NMIs are blocked, and we don't want to 151 + enable IRQs with NMIs blocked.) 184 152 185 153 186 154 2. C-State transition
-3
arch/x86/Kconfig
··· 270 270 config GENERIC_BUG_RELATIVE_POINTERS 271 271 bool 272 272 273 - config GENERIC_HWEIGHT 274 - def_bool y 275 - 276 273 config ARCH_MAY_HAVE_PC_FDC 277 274 def_bool y 278 275 depends on ISA_DMA_API
-3
arch/x86/entry/vdso/vdso2c.c
··· 73 73 enum { 74 74 sym_vvar_start, 75 75 sym_vvar_page, 76 - sym_hpet_page, 77 76 sym_pvclock_page, 78 77 sym_hvclock_page, 79 78 }; 80 79 81 80 const int special_pages[] = { 82 81 sym_vvar_page, 83 - sym_hpet_page, 84 82 sym_pvclock_page, 85 83 sym_hvclock_page, 86 84 }; ··· 91 93 struct vdso_sym required_syms[] = { 92 94 [sym_vvar_start] = {"vvar_start", true}, 93 95 [sym_vvar_page] = {"vvar_page", true}, 94 - [sym_hpet_page] = {"hpet_page", true}, 95 96 [sym_pvclock_page] = {"pvclock_page", true}, 96 97 [sym_hvclock_page] = {"hvclock_page", true}, 97 98 {"VDSO32_NOTE_MASK", true},
-2
arch/x86/include/asm/arch_hweight.h
··· 12 12 #define REG_OUT "a" 13 13 #endif 14 14 15 - #define __HAVE_ARCH_SW_HWEIGHT 16 - 17 15 static __always_inline unsigned int __arch_hweight32(unsigned int w) 18 16 { 19 17 unsigned int res;
-1
arch/x86/include/asm/vdso.h
··· 19 19 long sym_vvar_start; /* Negative offset to the vvar area */ 20 20 21 21 long sym_vvar_page; 22 - long sym_hpet_page; 23 22 long sym_pvclock_page; 24 23 long sym_hvclock_page; 25 24 long sym_VDSO32_NOTE_MASK;
+1 -1
arch/x86/kernel/kprobes/core.c
··· 768 768 /* 769 769 * Called from kretprobe_trampoline 770 770 */ 771 - static __used void *trampoline_handler(struct pt_regs *regs) 771 + __used __visible void *trampoline_handler(struct pt_regs *regs) 772 772 { 773 773 struct kprobe_ctlblk *kcb; 774 774 struct kretprobe_instance *ri = NULL;
-8
arch/x86/kernel/traps.c
··· 58 58 #include <asm/alternative.h> 59 59 #include <asm/fpu/xstate.h> 60 60 #include <asm/trace/mpx.h> 61 - #include <asm/nospec-branch.h> 62 61 #include <asm/mpx.h> 63 62 #include <asm/vm86.h> 64 63 #include <asm/umip.h> ··· 367 368 regs->ip = (unsigned long)general_protection; 368 369 regs->sp = (unsigned long)&gpregs->orig_ax; 369 370 370 - /* 371 - * This situation can be triggered by userspace via 372 - * modify_ldt(2) and the return does not take the regular 373 - * user space exit, so a CPU buffer clear is required when 374 - * MDS mitigation is enabled. 375 - */ 376 - mds_user_clear_cpu_buffers(); 377 371 return; 378 372 } 379 373 #endif
+104 -40
arch/x86/mm/init_64.c
··· 58 58 59 59 #include "ident_map.c" 60 60 61 + #define DEFINE_POPULATE(fname, type1, type2, init) \ 62 + static inline void fname##_init(struct mm_struct *mm, \ 63 + type1##_t *arg1, type2##_t *arg2, bool init) \ 64 + { \ 65 + if (init) \ 66 + fname##_safe(mm, arg1, arg2); \ 67 + else \ 68 + fname(mm, arg1, arg2); \ 69 + } 70 + 71 + DEFINE_POPULATE(p4d_populate, p4d, pud, init) 72 + DEFINE_POPULATE(pgd_populate, pgd, p4d, init) 73 + DEFINE_POPULATE(pud_populate, pud, pmd, init) 74 + DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init) 75 + 76 + #define DEFINE_ENTRY(type1, type2, init) \ 77 + static inline void set_##type1##_init(type1##_t *arg1, \ 78 + type2##_t arg2, bool init) \ 79 + { \ 80 + if (init) \ 81 + set_##type1##_safe(arg1, arg2); \ 82 + else \ 83 + set_##type1(arg1, arg2); \ 84 + } 85 + 86 + DEFINE_ENTRY(p4d, p4d, init) 87 + DEFINE_ENTRY(pud, pud, init) 88 + DEFINE_ENTRY(pmd, pmd, init) 89 + DEFINE_ENTRY(pte, pte, init) 90 + 91 + 61 92 /* 62 93 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the 63 94 * physical space so we can cache the place of the first one and move ··· 445 414 */ 446 415 static unsigned long __meminit 447 416 phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, 448 - pgprot_t prot) 417 + pgprot_t prot, bool init) 449 418 { 450 419 unsigned long pages = 0, paddr_next; 451 420 unsigned long paddr_last = paddr_end; ··· 463 432 E820_TYPE_RAM) && 464 433 !e820__mapped_any(paddr & PAGE_MASK, paddr_next, 465 434 E820_TYPE_RESERVED_KERN)) 466 - set_pte_safe(pte, __pte(0)); 435 + set_pte_init(pte, __pte(0), init); 467 436 continue; 468 437 } 469 438 ··· 483 452 pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, 484 453 pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); 485 454 pages++; 486 - set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); 455 + set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); 487 456 paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; 488 457 } 489 458 ··· 499 468 
*/ 500 469 static unsigned long __meminit 501 470 phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, 502 - unsigned long page_size_mask, pgprot_t prot) 471 + unsigned long page_size_mask, pgprot_t prot, bool init) 503 472 { 504 473 unsigned long pages = 0, paddr_next; 505 474 unsigned long paddr_last = paddr_end; ··· 518 487 E820_TYPE_RAM) && 519 488 !e820__mapped_any(paddr & PMD_MASK, paddr_next, 520 489 E820_TYPE_RESERVED_KERN)) 521 - set_pmd_safe(pmd, __pmd(0)); 490 + set_pmd_init(pmd, __pmd(0), init); 522 491 continue; 523 492 } 524 493 ··· 527 496 spin_lock(&init_mm.page_table_lock); 528 497 pte = (pte_t *)pmd_page_vaddr(*pmd); 529 498 paddr_last = phys_pte_init(pte, paddr, 530 - paddr_end, prot); 499 + paddr_end, prot, 500 + init); 531 501 spin_unlock(&init_mm.page_table_lock); 532 502 continue; 533 503 } ··· 556 524 if (page_size_mask & (1<<PG_LEVEL_2M)) { 557 525 pages++; 558 526 spin_lock(&init_mm.page_table_lock); 559 - set_pte_safe((pte_t *)pmd, 560 - pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, 561 - __pgprot(pgprot_val(prot) | _PAGE_PSE))); 527 + set_pte_init((pte_t *)pmd, 528 + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, 529 + __pgprot(pgprot_val(prot) | _PAGE_PSE)), 530 + init); 562 531 spin_unlock(&init_mm.page_table_lock); 563 532 paddr_last = paddr_next; 564 533 continue; 565 534 } 566 535 567 536 pte = alloc_low_page(); 568 - paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); 537 + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init); 569 538 570 539 spin_lock(&init_mm.page_table_lock); 571 - pmd_populate_kernel_safe(&init_mm, pmd, pte); 540 + pmd_populate_kernel_init(&init_mm, pmd, pte, init); 572 541 spin_unlock(&init_mm.page_table_lock); 573 542 } 574 543 update_page_count(PG_LEVEL_2M, pages); ··· 584 551 */ 585 552 static unsigned long __meminit 586 553 phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, 587 - unsigned long page_size_mask) 554 + unsigned long page_size_mask, 
bool init) 588 555 { 589 556 unsigned long pages = 0, paddr_next; 590 557 unsigned long paddr_last = paddr_end; ··· 606 573 E820_TYPE_RAM) && 607 574 !e820__mapped_any(paddr & PUD_MASK, paddr_next, 608 575 E820_TYPE_RESERVED_KERN)) 609 - set_pud_safe(pud, __pud(0)); 576 + set_pud_init(pud, __pud(0), init); 610 577 continue; 611 578 } 612 579 ··· 616 583 paddr_last = phys_pmd_init(pmd, paddr, 617 584 paddr_end, 618 585 page_size_mask, 619 - prot); 586 + prot, init); 620 587 continue; 621 588 } 622 589 /* ··· 643 610 if (page_size_mask & (1<<PG_LEVEL_1G)) { 644 611 pages++; 645 612 spin_lock(&init_mm.page_table_lock); 646 - set_pte_safe((pte_t *)pud, 647 - pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, 648 - PAGE_KERNEL_LARGE)); 613 + set_pte_init((pte_t *)pud, 614 + pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, 615 + PAGE_KERNEL_LARGE), 616 + init); 649 617 spin_unlock(&init_mm.page_table_lock); 650 618 paddr_last = paddr_next; 651 619 continue; ··· 654 620 655 621 pmd = alloc_low_page(); 656 622 paddr_last = phys_pmd_init(pmd, paddr, paddr_end, 657 - page_size_mask, prot); 623 + page_size_mask, prot, init); 658 624 659 625 spin_lock(&init_mm.page_table_lock); 660 - pud_populate_safe(&init_mm, pud, pmd); 626 + pud_populate_init(&init_mm, pud, pmd, init); 661 627 spin_unlock(&init_mm.page_table_lock); 662 628 } 663 629 ··· 668 634 669 635 static unsigned long __meminit 670 636 phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, 671 - unsigned long page_size_mask) 637 + unsigned long page_size_mask, bool init) 672 638 { 673 639 unsigned long paddr_next, paddr_last = paddr_end; 674 640 unsigned long vaddr = (unsigned long)__va(paddr); 675 641 int i = p4d_index(vaddr); 676 642 677 643 if (!pgtable_l5_enabled()) 678 - return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); 644 + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, 645 + page_size_mask, init); 679 646 680 647 for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { 681 
648 p4d_t *p4d; ··· 692 657 E820_TYPE_RAM) && 693 658 !e820__mapped_any(paddr & P4D_MASK, paddr_next, 694 659 E820_TYPE_RESERVED_KERN)) 695 - set_p4d_safe(p4d, __p4d(0)); 660 + set_p4d_init(p4d, __p4d(0), init); 696 661 continue; 697 662 } 698 663 699 664 if (!p4d_none(*p4d)) { 700 665 pud = pud_offset(p4d, 0); 701 - paddr_last = phys_pud_init(pud, paddr, 702 - paddr_end, 703 - page_size_mask); 666 + paddr_last = phys_pud_init(pud, paddr, paddr_end, 667 + page_size_mask, init); 704 668 continue; 705 669 } 706 670 707 671 pud = alloc_low_page(); 708 672 paddr_last = phys_pud_init(pud, paddr, paddr_end, 709 - page_size_mask); 673 + page_size_mask, init); 710 674 711 675 spin_lock(&init_mm.page_table_lock); 712 - p4d_populate_safe(&init_mm, p4d, pud); 676 + p4d_populate_init(&init_mm, p4d, pud, init); 713 677 spin_unlock(&init_mm.page_table_lock); 714 678 } 715 679 716 680 return paddr_last; 717 681 } 718 682 719 - /* 720 - * Create page table mapping for the physical memory for specific physical 721 - * addresses. The virtual and physical addresses have to be aligned on PMD level 722 - * down. It returns the last physical address mapped. 
723 - */ 724 - unsigned long __meminit 725 - kernel_physical_mapping_init(unsigned long paddr_start, 726 - unsigned long paddr_end, 727 - unsigned long page_size_mask) 683 + static unsigned long __meminit 684 + __kernel_physical_mapping_init(unsigned long paddr_start, 685 + unsigned long paddr_end, 686 + unsigned long page_size_mask, 687 + bool init) 728 688 { 729 689 bool pgd_changed = false; 730 690 unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; ··· 739 709 p4d = (p4d_t *)pgd_page_vaddr(*pgd); 740 710 paddr_last = phys_p4d_init(p4d, __pa(vaddr), 741 711 __pa(vaddr_end), 742 - page_size_mask); 712 + page_size_mask, 713 + init); 743 714 continue; 744 715 } 745 716 746 717 p4d = alloc_low_page(); 747 718 paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), 748 - page_size_mask); 719 + page_size_mask, init); 749 720 750 721 spin_lock(&init_mm.page_table_lock); 751 722 if (pgtable_l5_enabled()) 752 - pgd_populate_safe(&init_mm, pgd, p4d); 723 + pgd_populate_init(&init_mm, pgd, p4d, init); 753 724 else 754 - p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); 725 + p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr), 726 + (pud_t *) p4d, init); 727 + 755 728 spin_unlock(&init_mm.page_table_lock); 756 729 pgd_changed = true; 757 730 } ··· 763 730 sync_global_pgds(vaddr_start, vaddr_end - 1); 764 731 765 732 return paddr_last; 733 + } 734 + 735 + 736 + /* 737 + * Create page table mapping for the physical memory for specific physical 738 + * addresses. Note that it can only be used to populate non-present entries. 739 + * The virtual and physical addresses have to be aligned on PMD level 740 + * down. It returns the last physical address mapped. 
741 + */ 742 + unsigned long __meminit 743 + kernel_physical_mapping_init(unsigned long paddr_start, 744 + unsigned long paddr_end, 745 + unsigned long page_size_mask) 746 + { 747 + return __kernel_physical_mapping_init(paddr_start, paddr_end, 748 + page_size_mask, true); 749 + } 750 + 751 + /* 752 + * This function is similar to kernel_physical_mapping_init() above with the 753 + * exception that it uses set_{pud,pmd}() instead of the set_{pud,pmd}_safe() 754 + * when updating the mapping. The caller is responsible to flush the TLBs after 755 + * the function returns. 756 + */ 757 + unsigned long __meminit 758 + kernel_physical_mapping_change(unsigned long paddr_start, 759 + unsigned long paddr_end, 760 + unsigned long page_size_mask) 761 + { 762 + return __kernel_physical_mapping_init(paddr_start, paddr_end, 763 + page_size_mask, false); 764 + } 765 + 766 + 767 765 768 766 #ifndef CONFIG_NUMA
+7 -3
arch/x86/mm/mem_encrypt.c
··· 301 301 else 302 302 split_page_size_mask = 1 << PG_LEVEL_2M; 303 303 304 - kernel_physical_mapping_init(__pa(vaddr & pmask), 305 - __pa((vaddr_end & pmask) + psize), 306 - split_page_size_mask); 304 + /* 305 + * kernel_physical_mapping_change() does not flush the TLBs, so 306 + * a TLB flush is required after we exit from the for loop. 307 + */ 308 + kernel_physical_mapping_change(__pa(vaddr & pmask), 309 + __pa((vaddr_end & pmask) + psize), 310 + split_page_size_mask); 307 311 } 308 312 309 313 ret = 0;
+3
arch/x86/mm/mm_internal.h
··· 13 13 unsigned long kernel_physical_mapping_init(unsigned long start, 14 14 unsigned long end, 15 15 unsigned long page_size_mask); 16 + unsigned long kernel_physical_mapping_change(unsigned long start, 17 + unsigned long end, 18 + unsigned long page_size_mask); 16 19 void zone_sizes_init(void); 17 20 18 21 extern int after_bootmem;
-4
lib/hweight.c
··· 10 10 * The Hamming Weight of a number is the total number of bits set in it. 11 11 */ 12 12 13 - #ifndef __HAVE_ARCH_SW_HWEIGHT 14 13 unsigned int __sw_hweight32(unsigned int w) 15 14 { 16 15 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ··· 26 27 #endif 27 28 } 28 29 EXPORT_SYMBOL(__sw_hweight32); 29 - #endif 30 30 31 31 unsigned int __sw_hweight16(unsigned int w) 32 32 { ··· 44 46 } 45 47 EXPORT_SYMBOL(__sw_hweight8); 46 48 47 - #ifndef __HAVE_ARCH_SW_HWEIGHT 48 49 unsigned long __sw_hweight64(__u64 w) 49 50 { 50 51 #if BITS_PER_LONG == 32 ··· 66 69 #endif 67 70 } 68 71 EXPORT_SYMBOL(__sw_hweight64); 69 - #endif