Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/64s/radix: introduce options to disable use of the tlbie instruction

Introduce two options to control the use of the tlbie instruction. The
first is a boot-time option which completely disables the kernel's use
of the instruction; this is currently incompatible with the HASH MMU,
KVM, and coherent accelerators.

The second is a debugfs option which can be switched at runtime and
avoids using tlbie for invalidating CPU TLBs for normal process and
kernel address mappings. Coherent accelerators are still managed with
tlbie, as are KVM partition-scope translations.

Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a
basic implementation which does not attempt to make any optimisation
beyond the tlbie implementation.

This is useful for performance testing among other things. For example
in certain situations on large systems, using IPIs may be faster than
tlbie as they can be directed rather than broadcast. Later we may also
take advantage of the IPIs to do more interesting things such as trim
the mm cpumask more aggressively.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190902152931.17840-7-npiggin@gmail.com

authored by

Nicholas Piggin and committed by
Michael Ellerman
2275d7b5 7d805acc

+246 -18
+4
Documentation/admin-guide/kernel-parameters.txt
··· 860 860 disable_radix [PPC] 861 861 Disable RADIX MMU mode on POWER9 862 862 863 + disable_tlbie [PPC] 864 + Disable TLBIE instruction. Currently does not work 865 + with KVM, with HASH MMU, or with coherent accelerators. 866 + 863 867 disable_cpu_apicid= [X86,APIC,SMP] 864 868 Format: <int> 865 869 The number of initial APIC ID for the
+9
arch/powerpc/include/asm/book3s/64/tlbflush.h
··· 162 162 163 163 radix__flush_tlb_pwc(tlb, address); 164 164 } 165 + 166 + extern bool tlbie_capable; 167 + extern bool tlbie_enabled; 168 + 169 + static inline bool cputlb_use_tlbie(void) 170 + { 171 + return tlbie_enabled; 172 + } 173 + 165 174 #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
+6
arch/powerpc/kvm/book3s_hv.c
··· 5462 5462 static int kvmppc_book3s_init_hv(void) 5463 5463 { 5464 5464 int r; 5465 + 5466 + if (!tlbie_capable) { 5467 + pr_err("KVM-HV: Host does not support TLBIE\n"); 5468 + return -ENODEV; 5469 + } 5470 + 5465 5471 /* 5466 5472 * FIXME!! Do we need to check on all cpus ? 5467 5473 */
+47
arch/powerpc/mm/book3s64/pgtable.c
··· 8 8 #include <linux/memblock.h> 9 9 #include <misc/cxl-base.h> 10 10 11 + #include <asm/debugfs.h> 11 12 #include <asm/pgalloc.h> 12 13 #include <asm/tlb.h> 13 14 #include <asm/trace.h> ··· 470 469 471 470 return true; 472 471 } 472 + 473 + /* 474 + * Does the CPU support tlbie? 475 + */ 476 + bool tlbie_capable __read_mostly = true; 477 + EXPORT_SYMBOL(tlbie_capable); 478 + 479 + /* 480 + * Should tlbie be used for management of CPU TLBs, for kernel and process 481 + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM 482 + * guest address spaces. 483 + */ 484 + bool tlbie_enabled __read_mostly = true; 485 + 486 + static int __init setup_disable_tlbie(char *str) 487 + { 488 + if (!radix_enabled()) { 489 + pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n"); 490 + return 1; 491 + } 492 + 493 + tlbie_capable = false; 494 + tlbie_enabled = false; 495 + 496 + return 1; 497 + } 498 + __setup("disable_tlbie", setup_disable_tlbie); 499 + 500 + static int __init pgtable_debugfs_setup(void) 501 + { 502 + if (!tlbie_capable) 503 + return 0; 504 + 505 + /* 506 + * There is no locking vs tlb flushing when changing this value. 507 + * The tlb flushers will see one value or another, and use either 508 + * tlbie or tlbiel with IPIs. In both cases the TLBs will be 509 + * invalidated as expected. 510 + */ 511 + debugfs_create_bool("tlbie_enabled", 0600, 512 + powerpc_debugfs_root, 513 + &tlbie_enabled); 514 + 515 + return 0; 516 + } 517 + arch_initcall(pgtable_debugfs_setup);
+172 -18
arch/powerpc/mm/book3s64/radix_tlb.c
··· 270 270 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 271 271 } 272 272 273 + struct tlbiel_pid { 274 + unsigned long pid; 275 + unsigned long ric; 276 + }; 277 + 278 + static void do_tlbiel_pid(void *info) 279 + { 280 + struct tlbiel_pid *t = info; 281 + 282 + if (t->ric == RIC_FLUSH_TLB) 283 + _tlbiel_pid(t->pid, RIC_FLUSH_TLB); 284 + else if (t->ric == RIC_FLUSH_PWC) 285 + _tlbiel_pid(t->pid, RIC_FLUSH_PWC); 286 + else 287 + _tlbiel_pid(t->pid, RIC_FLUSH_ALL); 288 + } 289 + 290 + static inline void _tlbiel_pid_multicast(struct mm_struct *mm, 291 + unsigned long pid, unsigned long ric) 292 + { 293 + struct cpumask *cpus = mm_cpumask(mm); 294 + struct tlbiel_pid t = { .pid = pid, .ric = ric }; 295 + 296 + on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1); 297 + /* 298 + * Always want the CPU translations to be invalidated with tlbiel in 299 + * these paths, so while coprocessors must use tlbie, we can not 300 + * optimise away the tlbiel component. 301 + */ 302 + if (atomic_read(&mm->context.copros) > 0) 303 + _tlbie_pid(pid, RIC_FLUSH_ALL); 304 + } 305 + 273 306 static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) 274 307 { 275 308 asm volatile("ptesync": : :"memory"); ··· 403 370 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 404 371 } 405 372 373 + struct tlbiel_va { 374 + unsigned long pid; 375 + unsigned long va; 376 + unsigned long psize; 377 + unsigned long ric; 378 + }; 379 + 380 + static void do_tlbiel_va(void *info) 381 + { 382 + struct tlbiel_va *t = info; 383 + 384 + if (t->ric == RIC_FLUSH_TLB) 385 + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB); 386 + else if (t->ric == RIC_FLUSH_PWC) 387 + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC); 388 + else 389 + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL); 390 + } 391 + 392 + static inline void _tlbiel_va_multicast(struct mm_struct *mm, 393 + unsigned long va, unsigned long pid, 394 + unsigned long psize, unsigned long ric) 395 + { 396 + struct cpumask *cpus = 
mm_cpumask(mm); 397 + struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric }; 398 + on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1); 399 + if (atomic_read(&mm->context.copros) > 0) 400 + _tlbie_va(va, pid, psize, RIC_FLUSH_TLB); 401 + } 402 + 403 + struct tlbiel_va_range { 404 + unsigned long pid; 405 + unsigned long start; 406 + unsigned long end; 407 + unsigned long page_size; 408 + unsigned long psize; 409 + bool also_pwc; 410 + }; 411 + 412 + static void do_tlbiel_va_range(void *info) 413 + { 414 + struct tlbiel_va_range *t = info; 415 + 416 + _tlbiel_va_range(t->start, t->end, t->pid, t->page_size, 417 + t->psize, t->also_pwc); 418 + } 419 + 406 420 static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, 407 421 unsigned long psize, unsigned long ric) 408 422 { ··· 471 391 __tlbie_va_range(start, end, pid, page_size, psize); 472 392 fixup_tlbie(); 473 393 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 394 + } 395 + 396 + static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, 397 + unsigned long start, unsigned long end, 398 + unsigned long pid, unsigned long page_size, 399 + unsigned long psize, bool also_pwc) 400 + { 401 + struct cpumask *cpus = mm_cpumask(mm); 402 + struct tlbiel_va_range t = { .start = start, .end = end, 403 + .pid = pid, .page_size = page_size, 404 + .psize = psize, .also_pwc = also_pwc }; 405 + 406 + on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1); 407 + if (atomic_read(&mm->context.copros) > 0) 408 + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); 474 409 } 475 410 476 411 /* ··· 625 530 goto local; 626 531 } 627 532 628 - if (mm_needs_flush_escalation(mm)) 629 - _tlbie_pid(pid, RIC_FLUSH_ALL); 630 - else 631 - _tlbie_pid(pid, RIC_FLUSH_TLB); 533 + if (cputlb_use_tlbie()) { 534 + if (mm_needs_flush_escalation(mm)) 535 + _tlbie_pid(pid, RIC_FLUSH_ALL); 536 + else 537 + _tlbie_pid(pid, RIC_FLUSH_TLB); 538 + } else { 539 + _tlbiel_pid_multicast(mm, pid, 
RIC_FLUSH_TLB); 540 + } 632 541 } else { 633 542 local: 634 543 _tlbiel_pid(pid, RIC_FLUSH_TLB); ··· 658 559 goto local; 659 560 } 660 561 } 661 - _tlbie_pid(pid, RIC_FLUSH_ALL); 562 + if (cputlb_use_tlbie()) 563 + _tlbie_pid(pid, RIC_FLUSH_ALL); 564 + else 565 + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); 662 566 } else { 663 567 local: 664 568 _tlbiel_pid(pid, RIC_FLUSH_ALL); ··· 696 594 exit_flush_lazy_tlbs(mm); 697 595 goto local; 698 596 } 699 - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); 597 + if (cputlb_use_tlbie()) 598 + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); 599 + else 600 + _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); 700 601 } else { 701 602 local: 702 603 _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); ··· 721 616 #define radix__flush_all_mm radix__local_flush_all_mm 722 617 #endif /* CONFIG_SMP */ 723 618 619 + static void do_tlbiel_kernel(void *info) 620 + { 621 + _tlbiel_pid(0, RIC_FLUSH_ALL); 622 + } 623 + 624 + static inline void _tlbiel_kernel_broadcast(void) 625 + { 626 + on_each_cpu(do_tlbiel_kernel, NULL, 1); 627 + if (tlbie_capable) { 628 + /* 629 + * Coherent accelerators don't refcount kernel memory mappings, 630 + * so have to always issue a tlbie for them. This is quite a 631 + * slow path anyway. 
632 + */ 633 + _tlbie_pid(0, RIC_FLUSH_ALL); 634 + } 635 + } 636 + 724 637 /* 725 638 * If kernel TLBIs ever become local rather than global, then 726 639 * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it ··· 746 623 */ 747 624 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) 748 625 { 749 - _tlbie_pid(0, RIC_FLUSH_ALL); 626 + if (cputlb_use_tlbie()) 627 + _tlbie_pid(0, RIC_FLUSH_ALL); 628 + else 629 + _tlbiel_kernel_broadcast(); 750 630 } 751 631 EXPORT_SYMBOL(radix__flush_tlb_kernel_range); 752 632 ··· 805 679 if (local) { 806 680 _tlbiel_pid(pid, RIC_FLUSH_TLB); 807 681 } else { 808 - if (mm_needs_flush_escalation(mm)) 809 - _tlbie_pid(pid, RIC_FLUSH_ALL); 810 - else 811 - _tlbie_pid(pid, RIC_FLUSH_TLB); 682 + if (cputlb_use_tlbie()) { 683 + if (mm_needs_flush_escalation(mm)) 684 + _tlbie_pid(pid, RIC_FLUSH_ALL); 685 + else 686 + _tlbie_pid(pid, RIC_FLUSH_TLB); 687 + } else { 688 + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); 689 + } 812 690 } 813 691 } else { 814 692 bool hflush = flush_all_sizes; ··· 837 707 gflush = false; 838 708 } 839 709 840 - asm volatile("ptesync": : :"memory"); 841 710 if (local) { 711 + asm volatile("ptesync": : :"memory"); 842 712 __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); 843 713 if (hflush) 844 714 __tlbiel_va_range(hstart, hend, pid, ··· 847 717 __tlbiel_va_range(gstart, gend, pid, 848 718 PUD_SIZE, MMU_PAGE_1G); 849 719 asm volatile("ptesync": : :"memory"); 850 - } else { 720 + } else if (cputlb_use_tlbie()) { 721 + asm volatile("ptesync": : :"memory"); 851 722 __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); 852 723 if (hflush) 853 724 __tlbie_va_range(hstart, hend, pid, ··· 858 727 PUD_SIZE, MMU_PAGE_1G); 859 728 fixup_tlbie(); 860 729 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 730 + } else { 731 + _tlbiel_va_range_multicast(mm, 732 + start, end, pid, page_size, mmu_virtual_psize, false); 733 + if (hflush) 734 + 
_tlbiel_va_range_multicast(mm, 735 + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); 736 + if (gflush) 737 + _tlbiel_va_range_multicast(mm, 738 + gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false); 861 739 } 862 740 } 863 741 preempt_enable(); ··· 1043 903 if (local) { 1044 904 _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); 1045 905 } else { 1046 - if (mm_needs_flush_escalation(mm)) 1047 - also_pwc = true; 906 + if (cputlb_use_tlbie()) { 907 + if (mm_needs_flush_escalation(mm)) 908 + also_pwc = true; 1048 909 1049 - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); 910 + _tlbie_pid(pid, 911 + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); 912 + } else { 913 + _tlbiel_pid_multicast(mm, pid, 914 + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); 915 + } 916 + 1050 917 } 1051 918 } else { 1052 919 if (local) 1053 920 _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); 1054 - else 921 + else if (cputlb_use_tlbie()) 1055 922 _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); 923 + else 924 + _tlbiel_va_range_multicast(mm, 925 + start, end, pid, page_size, psize, also_pwc); 1056 926 } 1057 927 preempt_enable(); 1058 928 } ··· 1104 954 exit_flush_lazy_tlbs(mm); 1105 955 goto local; 1106 956 } 1107 - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); 957 + if (cputlb_use_tlbie()) 958 + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); 959 + else 960 + _tlbiel_va_range_multicast(mm, 961 + addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); 1108 962 } else { 1109 963 local: 1110 964 _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+4
drivers/misc/cxl/main.c
··· 18 18 #include <linux/sched/task.h> 19 19 20 20 #include <asm/cputable.h> 21 + #include <asm/mmu.h> 21 22 #include <misc/cxl-base.h> 22 23 23 24 #include "cxl.h" ··· 315 314 static int __init init_cxl(void) 316 315 { 317 316 int rc = 0; 317 + 318 + if (!tlbie_capable) 319 + return -EINVAL; 318 320 319 321 if ((rc = cxl_file_init())) 320 322 return rc;
+4
drivers/misc/ocxl/main.c
··· 2 2 // Copyright 2017 IBM Corp. 3 3 #include <linux/module.h> 4 4 #include <linux/pci.h> 5 + #include <asm/mmu.h> 5 6 #include "ocxl_internal.h" 6 7 7 8 static int __init init_ocxl(void) 8 9 { 9 10 int rc = 0; 11 + 12 + if (!tlbie_capable) 13 + return -EINVAL; 10 14 11 15 rc = ocxl_file_init(); 12 16 if (rc)