Merge tag 'powerpc-4.18-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:

- a fix for hugetlb with 4K pages, broken by our recent changes for
split PMD PTL.

- set the correct assembler machine type on e500mc, needed since
binutils 2.26 introduced two forms for the "wait" instruction.

- a fix for potential missed TLB flushes with MADV_[FREE|DONTNEED] etc.
and THP on Power9 Radix.

- three fixes to try and make our panic handling more robust by hard
disabling interrupts, and not marking stopped CPUs as offline because
they haven't been properly offlined.

- three other minor fixes.

Thanks to: Aneesh Kumar K.V, Michael Jeanson, Nicholas Piggin.

* tag 'powerpc-4.18-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
powerpc/mm/hash/4k: Free hugetlb page table caches correctly.
powerpc/64s/radix: Fix radix_kvm_prefetch_workaround paca access of not possible CPU
powerpc/64s: Fix build failures with CONFIG_NMI_IPI=n
powerpc/64: hard disable irqs on the panic()ing CPU
powerpc: smp_send_stop do not offline stopped CPUs
powerpc/64: hard disable irqs in panic_smp_self_stop
powerpc/64s: Fix DT CPU features Power9 DD2.1 logic
powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP
powerpc/e500mc: Set assembler machine type to e500mc

16 files changed: +153 -34
arch/powerpc/Makefile | +1

···
 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec)
 cpu-as-$(CONFIG_E200) += -Wa,-me200
 cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4
+cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc)
 
 KBUILD_AFLAGS += $(cpu-as-y)
 KBUILD_CFLAGS += $(cpu-as-y)

arch/powerpc/include/asm/book3s/32/pgalloc.h | +1

···
 }
 
 #define check_pgt_cache()	do { } while (0)
+#define get_hugepd_cache_index(x)	(x)
 
 #ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,

arch/powerpc/include/asm/book3s/64/pgtable-4k.h | +21

···
 }
 #define is_hugepd(hpd)		(hugepd_ok(hpd))
 
+/*
+ * 16M and 16G huge page directory tables are allocated from slab cache
+ *
+ */
+#define H_16M_CACHE_INDEX	(PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE - 24)
+#define H_16G_CACHE_INDEX	\
+	(PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + H_PUD_INDEX_SIZE - 34)
+
+static inline int get_hugepd_cache_index(int index)
+{
+	switch (index) {
+	case H_16M_CACHE_INDEX:
+		return HTLB_16M_INDEX;
+	case H_16G_CACHE_INDEX:
+		return HTLB_16G_INDEX;
+	default:
+		BUG();
+	}
+	/* should not reach */
+}
+
 #else /* !CONFIG_HUGETLB_PAGE */
 static inline int pmd_huge(pmd_t pmd) { return 0; }
 static inline int pud_huge(pud_t pud) { return 0; }

arch/powerpc/include/asm/book3s/64/pgtable-64k.h | +9

···
 {
 	return 0;
 }
+
 #define is_hugepd(pdep)			0
+
+/*
+ * This should never get called
+ */
+static inline int get_hugepd_cache_index(int index)
+{
+	BUG();
+}
 
 #else /* !CONFIG_HUGETLB_PAGE */
 static inline int pmd_huge(pmd_t pmd) { return 0; }

arch/powerpc/include/asm/book3s/64/pgtable.h | +5

···
 	PMD_INDEX,
 	PUD_INDEX,
 	PGD_INDEX,
+	/*
+	 * Below are used with 4k page size and hugetlb
+	 */
+	HTLB_16M_INDEX,
+	HTLB_16G_INDEX,
 };
 
 extern unsigned long __vmalloc_start;

arch/powerpc/include/asm/nmi.h | +1 -1

···
 static inline void arch_touch_nmi_watchdog(void) {}
 #endif
 
-#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_STACKTRACE)
+#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE)
 extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
 					   bool exclude_self);
 #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace

arch/powerpc/include/asm/nohash/32/pgalloc.h | +1

···
 }
 
 #define check_pgt_cache()	do { } while (0)
+#define get_hugepd_cache_index(x)	(x)
 
 #ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,

arch/powerpc/include/asm/nohash/64/pgalloc.h | +1

···
 	}
 }
 
+#define get_hugepd_cache_index(x)	(x)
 #ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 {

arch/powerpc/kernel/dt_cpu_ftrs.c | +2 -1

···
 		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST;
 		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG;
 		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
-	} else /* DD2.1 and up have DD2_1 */
+	} else if ((version & 0xffff0000) == 0x004e0000)
+		/* DD2.1 and up have DD2_1 */
 		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
 
 	if ((version & 0xffff0000) == 0x004e0000) {

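The bug here was that the bare "else" applied CPU_FTR_POWER9_DD2_1 to any PVR that missed the earlier revision checks, not just POWER9 parts; the fix only takes that branch when the PVR is in the POWER9 family (0x004e). Below is a minimal standalone C sketch of the corrected test, not kernel code: the pvr_is_power9() helper and the sample PVR values are illustrative assumptions.

#include <stdbool.h>
#include <stdio.h>

#define PVR_FAMILY_MASK	0xffff0000u
#define PVR_POWER9	0x004e0000u	/* POWER9 family, as tested in the hunk above */

/* hypothetical helper mirroring (version & 0xffff0000) == 0x004e0000 */
static bool pvr_is_power9(unsigned int pvr)
{
	return (pvr & PVR_FAMILY_MASK) == PVR_POWER9;
}

int main(void)
{
	/* sample PVR values, purely illustrative */
	unsigned int pvrs[] = { 0x004e1201u, 0x004d0200u };

	for (int i = 0; i < 2; i++)
		printf("PVR %#010x: set CPU_FTR_POWER9_DD2_1? %s\n",
		       pvrs[i], pvr_is_power9(pvrs[i]) ? "yes" : "no");
	return 0;
}
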
arch/powerpc/kernel/setup-common.c | +10 -2

···
 			  unsigned long event, void *ptr)
 {
 	/*
+	 * panic does a local_irq_disable, but we really
+	 * want interrupts to be hard disabled.
+	 */
+	hard_irq_disable();
+
+	/*
 	 * If firmware-assisted dump has been registered then trigger
 	 * firmware-assisted dump and let firmware handle everything else.
 	 */
 	crash_fadump(NULL, ptr);
-	ppc_md.panic(ptr);		/* May not return */
+	if (ppc_md.panic)
+		ppc_md.panic(ptr);	/* May not return */
 	return NOTIFY_DONE;
 }
···
 
 void __init setup_panic(void)
 {
-	if (!ppc_md.panic)
+	/* PPC64 always does a hard irq disable in its panic handler */
+	if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic)
 		return;
 	atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block);
 }

arch/powerpc/kernel/setup_64.c | +8

···
 
 #endif /* CONFIG_SMP */
 
+void panic_smp_self_stop(void)
+{
+	hard_irq_disable();
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
 #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 static bool use_spinloop(void)
 {

arch/powerpc/kernel/smp.c | -6

···
 	nmi_ipi_busy_count--;
 	nmi_ipi_unlock();
 
-	/* Remove this CPU */
-	set_cpu_online(smp_processor_id(), false);
-
 	spin_begin();
 	while (1)
 		spin_cpu_relax();
···
 
 static void stop_this_cpu(void *dummy)
 {
-	/* Remove this CPU */
-	set_cpu_online(smp_processor_id(), false);
-
 	hard_irq_disable();
 	spin_begin();
 	while (1)

arch/powerpc/kernel/stacktrace.c | +2 -2

···
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable);
 #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
 
-#ifdef CONFIG_PPC_BOOK3S_64
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI)
 static void handle_backtrace_ipi(struct pt_regs *regs)
 {
 	nmi_cpu_backtrace(regs);
···
 {
 	nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi);
 }
-#endif /* CONFIG_PPC64 */
+#endif /* defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) */

arch/powerpc/mm/hugetlbpage.c | +2 -1

···
 	if (shift >= pdshift)
 		hugepd_free(tlb, hugepte);
 	else
-		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+		pgtable_free_tlb(tlb, hugepte,
+				 get_hugepd_cache_index(pdshift - shift));
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,

arch/powerpc/mm/pgtable-book3s64.c | +12

···
 	case PUD_INDEX:
 		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
 		break;
+#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
+		/* 16M hugepd directory at pud level */
+	case HTLB_16M_INDEX:
+		BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
+		kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
+		break;
+		/* 16G hugepd directory at the pgd level */
+	case HTLB_16G_INDEX:
+		BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
+		kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
+		break;
+#endif
 		/* We don't free pgd table via RCU callback */
 	default:
 		BUG();

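Taken together with the hugetlbpage.c and pgtable-4k.h hunks, the flow is: hugetlb teardown passes get_hugepd_cache_index(pdshift - shift) to pgtable_free_tlb(), the 4K header maps that raw slab-cache index onto the dedicated HTLB_16M_INDEX/HTLB_16G_INDEX slots, and the deferred free above switches on the slot to pick the right kmem cache. Previously the raw pdshift-based index could land on one of the regular PMD/PUD/PGD slots, so the table could be freed into the wrong cache. A minimal userspace model of that encode/decode pattern, not kernel code; the numeric cache indexes are placeholders, not the real H_16M/H_16G values:

#include <stdio.h>
#include <stdlib.h>

/* mirrors the enum extended in book3s/64/pgtable.h */
enum table_index { PMD_INDEX, PUD_INDEX, PGD_INDEX, HTLB_16M_INDEX, HTLB_16G_INDEX };

#define FAKE_16M_CACHE_INDEX	4	/* placeholder for H_16M_CACHE_INDEX */
#define FAKE_16G_CACHE_INDEX	3	/* placeholder for H_16G_CACHE_INDEX */

/* encode side: raw slab index -> dedicated enum slot (cf. get_hugepd_cache_index) */
static enum table_index get_hugepd_cache_index(int index)
{
	switch (index) {
	case FAKE_16M_CACHE_INDEX: return HTLB_16M_INDEX;
	case FAKE_16G_CACHE_INDEX: return HTLB_16G_INDEX;
	default: abort();	/* mirrors the kernel's BUG() */
	}
}

/* decode side: the deferred free picks the cache from the slot (cf. pgtable_free) */
static void pgtable_free(enum table_index idx)
{
	switch (idx) {
	case HTLB_16M_INDEX: puts("free into the 16M hugepd slab cache"); break;
	case HTLB_16G_INDEX: puts("free into the 16G hugepd slab cache"); break;
	default:	     puts("free into a regular page-table cache"); break;
	}
}

int main(void)
{
	pgtable_free(get_hugepd_cache_index(FAKE_16M_CACHE_INDEX));
	pgtable_free(get_hugepd_cache_index(FAKE_16G_CACHE_INDEX));
	return 0;
}
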
arch/powerpc/mm/tlb-radix.c | +77 -21

···
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
 
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end)
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+					unsigned long start, unsigned long end,
+					bool flush_all_sizes)
 
 {
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
 	unsigned long page_size = 1UL << page_shift;
 	unsigned long nr_pages = (end - start) >> page_shift;
 	bool local, full;
-
-#ifdef CONFIG_HUGETLB_PAGE
-	if (is_vm_hugetlb_page(vma))
-		return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
···
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
 		}
 	} else {
-		bool hflush = false;
+		bool hflush = flush_all_sizes;
+		bool gflush = flush_all_sizes;
 		unsigned long hstart, hend;
+		unsigned long gstart, gend;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
-		hend = end >> HPAGE_PMD_SHIFT;
-		if (hstart < hend) {
-			hstart <<= HPAGE_PMD_SHIFT;
-			hend <<= HPAGE_PMD_SHIFT;
+		if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 			hflush = true;
+
+		if (hflush) {
+			hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+			hend = end & PMD_MASK;
+			if (hstart == hend)
+				hflush = false;
 		}
-#endif
+
+		if (gflush) {
+			gstart = (start + PUD_SIZE - 1) & PUD_MASK;
+			gend = end & PUD_MASK;
+			if (gstart == gend)
+				gflush = false;
+		}
 
 		asm volatile("ptesync": : :"memory");
 		if (local) {
 			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbiel_va_range(hstart, hend, pid,
-						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+						PMD_SIZE, MMU_PAGE_2M);
+			if (gflush)
+				__tlbiel_va_range(gstart, gend, pid,
+						PUD_SIZE, MMU_PAGE_1G);
 			asm volatile("ptesync": : :"memory");
 		} else {
 			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
 			if (hflush)
 				__tlbie_va_range(hstart, hend, pid,
-						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+						PMD_SIZE, MMU_PAGE_2M);
+			if (gflush)
+				__tlbie_va_range(gstart, gend, pid,
+						PUD_SIZE, MMU_PAGE_1G);
 			fixup_tlbie();
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 		}
 	}
 	preempt_enable();
+}
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+	__radix__flush_tlb_range(vma->vm_mm, start, end, false);
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
···
 	int psize = 0;
 	struct mm_struct *mm = tlb->mm;
 	int page_size = tlb->page_size;
+	unsigned long start = tlb->start;
+	unsigned long end = tlb->end;
 
 	/*
 	 * if page size is not something we understand, do a full mm flush
···
 	 */
 	if (tlb->fullmm) {
 		__flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+	} else if (mm_tlb_flush_nested(mm)) {
+		/*
+		 * If there is a concurrent invalidation that is clearing ptes,
+		 * then it's possible this invalidation will miss one of those
+		 * cleared ptes and miss flushing the TLB. If this invalidate
+		 * returns before the other one flushes TLBs, that can result
+		 * in it returning while there are still valid TLBs inside the
+		 * range to be invalidated.
+		 *
+		 * See mm/memory.c:tlb_finish_mmu() for more details.
+		 *
+		 * The solution to this is ensure the entire range is always
+		 * flushed here. The problem for powerpc is that the flushes
+		 * are page size specific, so this "forced flush" would not
+		 * do the right thing if there are a mix of page sizes in
+		 * the range to be invalidated. So use __flush_tlb_range
+		 * which invalidates all possible page sizes in the range.
+		 *
+		 * PWC flush probably is not be required because the core code
+		 * shouldn't free page tables in this path, but accounting
+		 * for the possibility makes us a bit more robust.
+		 *
+		 * need_flush_all is an uncommon case because page table
+		 * teardown should be done with exclusive locks held (but
+		 * after locks are dropped another invalidate could come
+		 * in), it could be optimized further if necessary.
+		 */
+		if (!tlb->need_flush_all)
+			__radix__flush_tlb_range(mm, start, end, true);
+		else
+			radix__flush_all_mm(mm);
+#endif
 	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
 		if (!tlb->need_flush_all)
 			radix__flush_tlb_mm(mm);
 		else
 			radix__flush_all_mm(mm);
 	} else {
-		unsigned long start = tlb->start;
-		unsigned long end = tlb->end;
-
 		if (!tlb->need_flush_all)
 			radix__flush_tlb_range_psize(mm, start, end, psize);
 		else
···
 
 	for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
 		if (sib == cpu)
+			continue;
+		if (!cpu_possible(sib))
 			continue;
 		if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
 			flush = true;
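
The PMD/PUD rounding introduced above is what decides whether the 2M (and 1G) flushes can be skipped: round the start up and the end down to the boundary, and if the two meet there is no fully aligned block of that size inside the range. A standalone illustration with made-up addresses, not kernel code; PMD_SIZE is hard-coded to 2M purely for the example:

#include <stdio.h>

#define PMD_SIZE	(1UL << 21)		/* 2M, for illustration only */
#define PMD_MASK	(~(PMD_SIZE - 1))

static void check_range(unsigned long start, unsigned long end)
{
	/* same rounding as __radix__flush_tlb_range(): start up, end down */
	unsigned long hstart = (start + PMD_SIZE - 1) & PMD_MASK;
	unsigned long hend = end & PMD_MASK;

	if (hstart == hend)
		printf("[%#lx, %#lx): no fully covered 2M block, skip 2M flush\n",
		       start, end);
	else
		printf("[%#lx, %#lx): flush 2M entries in [%#lx, %#lx)\n",
		       start, end, hstart, hend);
}

int main(void)
{
	check_range(0x10000000UL, 0x10600000UL);	/* 6M, 2M-aligned range */
	check_range(0x10001000UL, 0x10200000UL);	/* no aligned 2M block inside */
	return 0;
}

The same pattern repeats with PUD_SIZE for 1G entries, and radix__tlb_flush() now calls __radix__flush_tlb_range(mm, start, end, true) in the mm_tlb_flush_nested() case, so every page size in the range is flushed and the MADV_[FREE|DONTNEED]/THP race described in the comment above is closed.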