Merge tag 'powerpc-5.18-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman:

- Fix KVM "lost kick" race, where an attempt to pull a vcpu out of the
guest could be lost (or delayed until the next guest exit).

- Disable SCV (system call vectored) when PR KVM guests could be run.

- Fix KVM PR guests that use SCV, by disallowing AIL != 0 for PR guests.

- Add a new KVM CAP to indicate whether AIL == 3 is supported.

- Fix a regression when hotplugging a CPU to a memoryless/cpuless node.

- Make virt_addr_valid() stricter for 64-bit Book3E & 32-bit, which
fixes crashes seen due to hardened usercopy.

- Revert a change to max_mapnr which broke HIGHMEM.

Thanks to Christophe Leroy, Fabiano Rosas, Kefeng Wang, Nicholas Piggin,
and Srikar Dronamraju.

* tag 'powerpc-5.18-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
Revert "powerpc: Set max_mapnr correctly"
powerpc: Fix virt_addr_valid() for 64-bit Book3E & 32-bit
KVM: PPC: Move kvmhv_on_pseries() into kvm_ppc.h
powerpc/numa: Handle partially initialized numa nodes
powerpc/64: Fix build failure with allyesconfig in book3s_64_entry.S
KVM: PPC: Use KVM_CAP_PPC_AIL_MODE_3
KVM: PPC: Book3S PR: Disallow AIL != 0
KVM: PPC: Book3S PR: Disable SCV when AIL could be disabled
KVM: PPC: Book3S HV P9: Fix "lost kick" race

arch/powerpc/include/asm/kvm_book3s_64.h (-12)
···
 #include <asm/ppc-opcode.h>
 #include <asm/pte-walk.h>
 
-#ifdef CONFIG_PPC_PSERIES
-static inline bool kvmhv_on_pseries(void)
-{
-	return !cpu_has_feature(CPU_FTR_HVMODE);
-}
-#else
-static inline bool kvmhv_on_pseries(void)
-{
-	return false;
-}
-#endif
-
 /*
  * Structure for a nested guest, that is, for a guest that is managed by
  * one of our guests.
arch/powerpc/include/asm/kvm_ppc.h (+12)
···
 
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+static inline bool kvmhv_on_pseries(void)
+{
+	return !cpu_has_feature(CPU_FTR_HVMODE);
+}
+#else
+static inline bool kvmhv_on_pseries(void)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_KVM_XICS
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
arch/powerpc/include/asm/page.h (+5 -1)
···
 #define virt_to_page(kaddr)	pfn_to_page(virt_to_pfn(kaddr))
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 
-#define virt_addr_valid(kaddr)	pfn_valid(virt_to_pfn(kaddr))
+#define virt_addr_valid(vaddr)	({					\
+	unsigned long _addr = (unsigned long)vaddr;			\
+	_addr >= PAGE_OFFSET && _addr < (unsigned long)high_memory &&	\
+	pfn_valid(virt_to_pfn(_addr));					\
+})
 
 /*
  * On Book-E parts we need __va to parse the device tree and we can't
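For context on the crashes being fixed: hardened usercopy trusts virt_addr_valid() before translating a pointer with virt_to_page(). A minimal sketch of that caller pattern (linear_map_page() is a hypothetical helper, simplified from the shape of mm/usercopy.c's check_heap_object(); it is not part of this merge):

#include <linux/mm.h>

/*
 * Sketch: translate a kernel pointer to its struct page only after
 * virt_addr_valid() vouches for it. With the old one-line macro, a
 * vmalloc/ioremap address whose virt_to_pfn() happened to alias a
 * valid pfn passed the check, and virt_to_page() then returned a
 * bogus struct page. The new macro additionally requires
 * PAGE_OFFSET <= addr < high_memory, so only linear-map addresses
 * get this far.
 */
static struct page *linear_map_page(const void *ptr)
{
	if (!virt_addr_valid(ptr))
		return NULL;

	return virt_to_page(ptr);
}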
arch/powerpc/include/asm/setup.h (+2)
···
 #define ARCH_PANIC_TIMEOUT 180
 
 #ifdef CONFIG_PPC_PSERIES
+extern bool pseries_reloc_on_exception(void);
 extern bool pseries_enable_reloc_on_exc(void);
 extern void pseries_disable_reloc_on_exc(void);
 extern void pseries_big_endian_exceptions(void);
 void __init pseries_little_endian_exceptions(void);
 #else
+static inline bool pseries_reloc_on_exception(void) { return false; }
 static inline bool pseries_enable_reloc_on_exc(void) { return false; }
 static inline void pseries_disable_reloc_on_exc(void) {}
 static inline void pseries_big_endian_exceptions(void) {}
arch/powerpc/kernel/exceptions-64s.S (+4)
···
  * - MSR_EE|MSR_RI is clear (no reentrant exceptions)
  * - Standard kernel environment is set up (stack, paca, etc)
  *
+ * KVM:
+ * These interrupts do not elevate HV 0->1, so HV is not involved. PR KVM
+ * ensures that FSCR[SCV] is disabled whenever it has to force AIL off.
+ *
  * Call convention:
  *
  * syscall register convention is in Documentation/powerpc/syscall64-abi.rst
arch/powerpc/kernel/setup_64.c (+28)
···
 
 	/* Under a PAPR hypervisor, we need hypercalls */
 	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		/*
+		 * - PR KVM does not support AIL mode interrupts in the host
+		 *   while a PR guest is running.
+		 *
+		 * - SCV system call interrupt vectors are only implemented for
+		 *   AIL mode interrupts.
+		 *
+		 * - On pseries, AIL mode can only be enabled and disabled
+		 *   system-wide so when a PR VM is created on a pseries host,
+		 *   all CPUs of the host are set to AIL=0 mode.
+		 *
+		 * - Therefore host CPUs must not execute scv while a PR VM
+		 *   exists.
+		 *
+		 * - SCV support can not be disabled dynamically because the
+		 *   feature is advertised to host userspace. Disabling the
+		 *   facility and emulating it would be possible but is not
+		 *   implemented.
+		 *
+		 * - So SCV support is blanket disabled if PR KVM could possibly
+		 *   run. That is, PR support compiled in, booting on pseries
+		 *   with hash MMU.
+		 */
+		if (IS_ENABLED(CONFIG_KVM_BOOK3S_PR_POSSIBLE) && !radix_enabled()) {
+			init_task.thread.fscr &= ~FSCR_SCV;
+			cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV;
+		}
+
 		/* Enable AIL if possible */
 		if (!pseries_enable_reloc_on_exc()) {
 			init_task.thread.fscr &= ~FSCR_SCV;
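The user-visible effect of clearing PPC_FEATURE2_SCV is that correctly written userspace falls back to the sc instruction. Per Documentation/powerpc/syscall64-abi.rst, scv may only be used when the kernel advertises it in AT_HWCAP2. A minimal userspace check (the fallback #define value is an assumption for illustration; verify against your local <asm/cputable.h>):

#include <stdbool.h>
#include <sys/auxv.h>

#ifndef PPC_FEATURE2_SCV
#define PPC_FEATURE2_SCV 0x00100000	/* assumed value; check <asm/cputable.h> */
#endif

/* scv 0 is usable only when the kernel advertises it in AT_HWCAP2. */
static bool can_use_scv(void)
{
	return (getauxval(AT_HWCAP2) & PPC_FEATURE2_SCV) != 0;
}

On kernels with the change above, this returns false whenever PR KVM could run (PR support compiled in, pseries, hash MMU), even if no PR VM exists yet.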
arch/powerpc/kvm/Kconfig (+9)
···
 	  guest in user mode (problem state) and emulating all
 	  privileged instructions and registers.
 
+	  This is only available for hash MMU mode and only supports
+	  guests that use hash MMU mode.
+
 	  This is not as fast as using hypervisor mode, but works on
 	  machines where hypervisor mode is not available or not usable,
 	  and can emulate processors that are different from the host
 	  processor, including emulating 32-bit processors on a 64-bit
 	  host.
+
+	  Selecting this option will cause the SCV facility to be
+	  disabled when the kernel is booted on the pseries platform in
+	  hash MMU mode (regardless of PR VMs running). When any PR VMs
+	  are running, "AIL" mode is disabled which may slow interrupts
+	  and system calls on the host.
 
 config KVM_BOOK3S_HV_EXIT_TIMING
 	bool "Detailed timing for hypervisor real-mode code"
arch/powerpc/kvm/book3s_64_entry.S (+8 -2)
···
 	 */
 	ld	r10,HSTATE_SCRATCH0(r13)
 	cmpwi	r10,BOOK3S_INTERRUPT_MACHINE_CHECK
-	beq	machine_check_common
+	beq	.Lcall_machine_check_common
 
 	cmpwi	r10,BOOK3S_INTERRUPT_SYSTEM_RESET
-	beq	system_reset_common
+	beq	.Lcall_system_reset_common
 
 	b	.
+
+.Lcall_machine_check_common:
+	b	machine_check_common
+
+.Lcall_system_reset_common:
+	b	system_reset_common
 #endif
arch/powerpc/kvm/book3s_hv.c (+33 -8)
···
 	int cpu;
 	struct rcuwait *waitp;
 
+	/*
+	 * rcuwait_wake_up contains smp_mb() which orders prior stores that
+	 * create pending work vs below loads of cpu fields. The other side
+	 * is the barrier in vcpu run that orders setting the cpu fields vs
+	 * testing for pending work.
+	 */
+
 	waitp = kvm_arch_vcpu_get_wait(vcpu);
 	if (rcuwait_wake_up(waitp))
 		++vcpu->stat.generic.halt_wakeup;
···
 			break;
 		}
 		tvcpu->arch.prodded = 1;
-		smp_mb();
+		smp_mb(); /* This orders prodded store vs ceded load */
 		if (tvcpu->arch.ceded)
 			kvmppc_fast_vcpu_kick_hv(tvcpu);
 		break;
···
 		pvc = core_info.vc[sub];
 		pvc->pcpu = pcpu + thr;
 		for_each_runnable_thread(i, vcpu, pvc) {
+			/*
+			 * XXX: is kvmppc_start_thread called too late here?
+			 * It updates vcpu->cpu and vcpu->arch.thread_cpu
+			 * which are used by kvmppc_fast_vcpu_kick_hv(), but
+			 * kick is called after new exceptions become available
+			 * and exceptions are checked earlier than here, by
+			 * kvmppc_core_prepare_to_enter.
+			 */
 			kvmppc_start_thread(vcpu, pvc);
 			kvmppc_create_dtl_entry(vcpu, pvc);
 			trace_kvm_guest_enter(vcpu);
···
 	if (need_resched() || !kvm->arch.mmu_ready)
 		goto out;
 
+	vcpu->cpu = pcpu;
+	vcpu->arch.thread_cpu = pcpu;
+	vc->pcpu = pcpu;
+	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+	local_paca->kvm_hstate.ptid = 0;
+	local_paca->kvm_hstate.fake_suspend = 0;
+
+	/*
+	 * Orders set cpu/thread_cpu vs testing for pending interrupts and
+	 * doorbells below. The other side is when these fields are set vs
+	 * kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to
+	 * kick a vCPU to notice the pending interrupt.
+	 */
+	smp_mb();
+
 	if (!nested) {
 		kvmppc_core_prepare_to_enter(vcpu);
 		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
···
 	}
 
 	tb = mftb();
-
-	vcpu->cpu = pcpu;
-	vcpu->arch.thread_cpu = pcpu;
-	vc->pcpu = pcpu;
-	local_paca->kvm_hstate.kvm_vcpu = vcpu;
-	local_paca->kvm_hstate.ptid = 0;
-	local_paca->kvm_hstate.fake_suspend = 0;
 
 	__kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0);
···
 	run->exit_reason = KVM_EXIT_INTR;
 	vcpu->arch.ret = -EINTR;
  out:
+	vcpu->cpu = -1;
+	vcpu->arch.thread_cpu = -1;
 	powerpc_local_irq_pmu_restore(flags);
 	preempt_enable();
 	goto done;
arch/powerpc/kvm/book3s_pr.c (+17 -9)
···
 	svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
 	svcpu->in_use = 0;
 	svcpu_put(svcpu);
-#endif
 
 	/* Disable AIL if supported */
-	if (cpu_has_feature(CPU_FTR_HVMODE) &&
-	    cpu_has_feature(CPU_FTR_ARCH_207S))
-		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL);
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (cpu_has_feature(CPU_FTR_ARCH_207S))
+			mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL);
+		if (cpu_has_feature(CPU_FTR_ARCH_300) && (current->thread.fscr & FSCR_SCV))
+			mtspr(SPRN_FSCR, mfspr(SPRN_FSCR) & ~FSCR_SCV);
+	}
+#endif
 
 	vcpu->cpu = smp_processor_id();
 #ifdef CONFIG_PPC_BOOK3S_32
···
 	memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
 	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
 	svcpu_put(svcpu);
+
+	/* Enable AIL if supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (cpu_has_feature(CPU_FTR_ARCH_207S))
+			mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3);
+		if (cpu_has_feature(CPU_FTR_ARCH_300) && (current->thread.fscr & FSCR_SCV))
+			mtspr(SPRN_FSCR, mfspr(SPRN_FSCR) | FSCR_SCV);
+	}
 #endif
 
 	if (kvmppc_is_split_real(vcpu))
···
 	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
 	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
 	kvmppc_save_tm_pr(vcpu);
-
-	/* Enable AIL if supported */
-	if (cpu_has_feature(CPU_FTR_HVMODE) &&
-	    cpu_has_feature(CPU_FTR_ARCH_207S))
-		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3);
 
 	vcpu->cpu = -1;
 }
···
 
 void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr)
 {
+	if (fscr & FSCR_SCV)
+		fscr &= ~FSCR_SCV; /* SCV must not be enabled */
 	if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) {
 		/* TAR got dropped, drop it in shadow too */
 		kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
arch/powerpc/kvm/book3s_pr_papr.c (+20)
···
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_set_mode(struct kvm_vcpu *vcpu)
+{
+	unsigned long mflags = kvmppc_get_gpr(vcpu, 4);
+	unsigned long resource = kvmppc_get_gpr(vcpu, 5);
+
+	if (resource == H_SET_MODE_RESOURCE_ADDR_TRANS_MODE) {
+		/* KVM PR does not provide AIL!=0 to guests */
+		if (mflags == 0)
+			kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+		else
+			kvmppc_set_gpr(vcpu, 3, H_UNSUPPORTED_FLAG_START - 63);
+		return EMULATE_DONE;
+	}
+	return EMULATE_FAIL;
+}
+
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 {
···
 		return kvmppc_h_pr_logical_ci_load(vcpu);
 	case H_LOGICAL_CI_STORE:
 		return kvmppc_h_pr_logical_ci_store(vcpu);
+	case H_SET_MODE:
+		return kvmppc_h_pr_set_mode(vcpu);
 	case H_XIRR:
 	case H_CPPR:
 	case H_EOI:
···
 	case H_CEDE:
 	case H_LOGICAL_CI_LOAD:
 	case H_LOGICAL_CI_STORE:
+	case H_SET_MODE:
 #ifdef CONFIG_KVM_XICS
 	case H_XIRR:
 	case H_CPPR:
···
 	H_BULK_REMOVE,
 	H_PUT_TCE,
 	H_CEDE,
+	H_SET_MODE,
 #ifdef CONFIG_KVM_XICS
 	H_XIRR,
 	H_CPPR,
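From the guest side, the new handler gives H_SET_MODE(ADDR_TRANS_MODE) a defined answer instead of EMULATE_FAIL, so a pseries guest's attempt to enable AIL=3 now fails cleanly and the guest keeps real-mode vectors. A simplified sketch of the guest-side call (try_enable_ail3() is illustrative and modeled on pseries_enable_reloc_on_exc(); plpar_set_mode() is the real pseries wrapper, but the real code also retries on H_LONG_BUSY, which is omitted here):

#include <asm/hvcall.h>
#include <asm/plpar_wrappers.h>

/* Simplified guest-side sketch; not the exact pseries code. */
static bool try_enable_ail3(void)
{
	/* mflags = 3: relocation-on (AIL=3) exception vectors */
	long rc = plpar_set_mode(3, H_SET_MODE_RESOURCE_ADDR_TRANS_MODE, 0, 0);

	/*
	 * Under PR KVM the handler above rejects any mflags != 0, so the
	 * guest falls back to AIL=0 (real-mode) vectors, which also keeps
	 * it from using scv.
	 */
	return rc == H_SUCCESS;
}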
arch/powerpc/kvm/powerpc.c (+17)
···
 		r = 1;
 		break;
 #endif
+	case KVM_CAP_PPC_AIL_MODE_3:
+		r = 0;
+		/*
+		 * KVM PR, POWER7, and some POWER9s don't support AIL=3 mode.
+		 * The POWER9s can support it if the guest runs in hash mode,
+		 * but QEMU doesn't necessarily query the capability in time.
+		 */
+		if (hv_enabled) {
+			if (kvmhv_on_pseries()) {
+				if (pseries_reloc_on_exception())
+					r = 1;
+			} else if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+				   !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
+				r = 1;
+			}
+		}
+		break;
 	default:
 		r = 0;
 		break;
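Userspace (e.g. QEMU) consumes the new capability with a plain KVM_CHECK_EXTENSION on the system fd. A minimal sketch, assuming kernel headers new enough to define KVM_CAP_PPC_AIL_MODE_3:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int r;

	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}
	/* Returns > 0 when the host can give guests AIL=3. */
	r = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_AIL_MODE_3);
	printf("AIL=3 for guests: %s\n", r > 0 ? "supported" : "unsupported");
	return 0;
}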
arch/powerpc/mm/mem.c (+1 -1)
···
 #endif
 
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-	set_max_mapnr(max_low_pfn);
+	set_max_mapnr(max_pfn);
 
 	kasan_late_init();
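The revert matters because with CONFIG_FLATMEM, pfn_valid() reduces to a max_mapnr bounds check, so clamping max_mapnr to max_low_pfn made every HIGHMEM pfn look invalid. Roughly (flatmem_pfn_valid() is a renamed, simplified shape of the generic FLATMEM definition, for illustration only):

/* Simplified shape of the generic CONFIG_FLATMEM pfn_valid(). */
static inline int flatmem_pfn_valid(unsigned long pfn)
{
	unsigned long pfn_offset = ARCH_PFN_OFFSET;

	/* With max_mapnr clamped to max_low_pfn, highmem pfns fail here. */
	return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
}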
arch/powerpc/mm/numa.c (+1 -1)
···
 	if (new_nid < 0 || !node_possible(new_nid))
 		new_nid = first_online_node;
 
-	if (NODE_DATA(new_nid) == NULL) {
+	if (!node_online(new_nid)) {
 #ifdef CONFIG_MEMORY_HOTPLUG
 		/*
 		 * Need to ensure that NODE_DATA is initialized for a node from
arch/powerpc/platforms/pseries/setup.c (+12 -1)
···
 	pseries_idle_epilog();
 }
 
+static bool pseries_reloc_on_exception_enabled;
+
+bool pseries_reloc_on_exception(void)
+{
+	return pseries_reloc_on_exception_enabled;
+}
+EXPORT_SYMBOL_GPL(pseries_reloc_on_exception);
+
 /*
  * Enable relocation on during exceptions. This has partition wide scope and
  * may take a while to complete, if it takes longer than one second we will
···
 			" on exceptions: %ld\n", rc);
 		return false;
 	}
+	pseries_reloc_on_exception_enabled = true;
 	return true;
 }
···
 			break;
 		mdelay(get_longbusy_msecs(rc));
 	}
-	if (rc != H_SUCCESS)
+	if (rc == H_SUCCESS)
+		pseries_reloc_on_exception_enabled = false;
+	else
 		pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n",
 			rc);
 }