Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD

- Better machine check handling for HV KVM
- Ability to support guests with threads=2, 4 or 8 on POWER9
- Fix for a race that could cause delayed recognition of signals
- Fix for a bug where POWER9 guests could sleep with interrupts pending.

+736 -211
+37
Documentation/virtual/kvm/api.txt
··· 4131 4131 Allow use of adapter-interruption suppression. 4132 4132 Returns: 0 on success; -EBUSY if a VCPU has already been created. 4133 4133 4134 + 7.11 KVM_CAP_PPC_SMT 4135 + 4136 + Architectures: ppc 4137 + Parameters: vsmt_mode, flags 4138 + 4139 + Enabling this capability on a VM provides userspace with a way to set 4140 + the desired virtual SMT mode (i.e. the number of virtual CPUs per 4141 + virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 4142 + between 1 and 8. On POWER8, vsmt_mode must also be no greater than 4143 + the number of threads per subcore for the host. Currently flags must 4144 + be 0. A successful call to enable this capability will result in 4145 + vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is 4146 + subsequently queried for the VM. This capability is only supported by 4147 + HV KVM, and can only be set before any VCPUs have been created. 4148 + The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT 4149 + modes are available. 4150 + 4151 + 7.12 KVM_CAP_PPC_FWNMI 4152 + 4153 + Architectures: ppc 4154 + Parameters: none 4155 + 4156 + With this capability a machine check exception in the guest address 4157 + space will cause KVM to exit the guest with an NMI exit reason. This 4158 + enables QEMU to build an error log and branch to the guest kernel's registered 4159 + machine check handling routine. Without this capability KVM will 4160 + branch to the guest's 0x200 interrupt vector. 4161 + 4134 4162 8. Other capabilities. 4135 4163 ---------------------- 4136 4164 ··· 4320 4292 Future versions of kvm may implement additional events. These will get 4321 4293 indicated by returning a higher number from KVM_CHECK_EXTENSION and will be 4322 4294 listed above. 4295 + 4296 + 8.10 KVM_CAP_PPC_SMT_POSSIBLE 4297 + 4298 + Architectures: ppc 4299 + 4300 + Querying this capability returns a bitmap indicating the possible 4301 + virtual SMT modes that can be set using KVM_CAP_PPC_SMT. 
If bit N 4302 + (counting from the right) is set, then a virtual SMT mode of 2^N is 4303 + available.
-1
arch/powerpc/include/asm/kvm_book3s.h
··· 86 86 u16 last_cpu; 87 87 u8 vcore_state; 88 88 u8 in_guest; 89 - struct kvmppc_vcore *master_vcore; 90 89 struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; 91 90 struct list_head preempt_list; 92 91 spinlock_t lock;
+1 -1
arch/powerpc/include/asm/kvm_book3s_asm.h
··· 81 81 u8 subcore_size; 82 82 u8 do_nap; 83 83 u8 napped[MAX_SMT_THREADS]; 84 - struct kvmppc_vcore *master_vcs[MAX_SUBCORES]; 84 + struct kvmppc_vcore *vc[MAX_SUBCORES]; 85 85 }; 86 86 87 87 /*
+8 -1
arch/powerpc/include/asm/kvm_host.h
··· 35 35 #include <asm/page.h> 36 36 #include <asm/cacheflush.h> 37 37 #include <asm/hvcall.h> 38 + #include <asm/mce.h> 38 39 39 40 #define KVM_MAX_VCPUS NR_CPUS 40 41 #define KVM_MAX_VCORES NR_CPUS ··· 268 267 269 268 struct kvm_arch { 270 269 unsigned int lpid; 270 + unsigned int smt_mode; /* # vcpus per virtual core */ 271 + unsigned int emul_smt_mode; /* emulated SMT mode, on P9 */ 271 272 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 272 273 unsigned int tlb_sets; 273 274 struct kvm_hpt_info hpt; ··· 288 285 cpumask_t need_tlb_flush; 289 286 cpumask_t cpu_in_guest; 290 287 u8 radix; 288 + u8 fwnmi_enabled; 291 289 pgd_t *pgtable; 292 290 u64 process_table; 293 291 struct dentry *debugfs_dir; ··· 570 566 ulong wort; 571 567 ulong tid; 572 568 ulong psscr; 569 + ulong hfscr; 573 570 ulong shadow_srr1; 574 571 #endif 575 572 u32 vrsave; /* also USPRG0 */ ··· 584 579 ulong mcsrr0; 585 580 ulong mcsrr1; 586 581 ulong mcsr; 587 - u32 dec; 582 + ulong dec; 588 583 #ifdef CONFIG_BOOKE 589 584 u32 decar; 590 585 #endif ··· 715 710 unsigned long pending_exceptions; 716 711 u8 ceded; 717 712 u8 prodded; 713 + u8 doorbell_request; 718 714 u32 last_inst; 719 715 720 716 struct swait_queue_head *wqp; ··· 728 722 int prev_cpu; 729 723 bool timer_running; 730 724 wait_queue_head_t cpu_run; 725 + struct machine_check_event mce_evt; /* Valid if trap == 0x200 */ 731 726 732 727 struct kvm_vcpu_arch_shared *shared; 733 728 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+2
arch/powerpc/include/asm/kvm_ppc.h
··· 315 315 struct irq_bypass_producer *); 316 316 int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); 317 317 int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); 318 + int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, 319 + unsigned long flags); 318 320 }; 319 321 320 322 extern struct kvmppc_ops *kvmppc_hv_ops;
+2
arch/powerpc/include/asm/ppc-opcode.h
··· 103 103 #define OP_31_XOP_STBUX 247 104 104 #define OP_31_XOP_LHZX 279 105 105 #define OP_31_XOP_LHZUX 311 106 + #define OP_31_XOP_MSGSNDP 142 107 + #define OP_31_XOP_MSGCLRP 174 106 108 #define OP_31_XOP_MFSPR 339 107 109 #define OP_31_XOP_LWAX 341 108 110 #define OP_31_XOP_LHAX 343
+6
arch/powerpc/include/uapi/asm/kvm.h
··· 60 60 61 61 #define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ 62 62 63 + /* flags for kvm_run.flags */ 64 + #define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0) 65 + #define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0) 66 + #define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0) 67 + #define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0) 68 + 63 69 /* 64 70 * Feature bits indicate which sections of the sregs struct are valid, 65 71 * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers
+3
arch/powerpc/kernel/asm-offsets.c
··· 485 485 OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); 486 486 OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); 487 487 OFFSET(KVM_RADIX, kvm, arch.radix); 488 + OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); 488 489 OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); 489 490 OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); 490 491 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); ··· 514 513 OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); 515 514 OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); 516 515 OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); 516 + OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); 517 517 OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); 518 518 OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); 519 519 OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); ··· 544 542 OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); 545 543 OFFSET(VCPU_TID, kvm_vcpu, arch.tid); 546 544 OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); 545 + OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); 547 546 OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); 548 547 OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); 549 548 OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
+1
arch/powerpc/kernel/mce.c
··· 405 405 break; 406 406 } 407 407 } 408 + EXPORT_SYMBOL_GPL(machine_check_print_event_info); 408 409 409 410 uint64_t get_mce_fault_addr(struct machine_check_event *evt) 410 411 {
+450 -112
arch/powerpc/kvm/book3s_hv.c
··· 46 46 #include <linux/of.h> 47 47 48 48 #include <asm/reg.h> 49 + #include <asm/ppc-opcode.h> 50 + #include <asm/disassemble.h> 49 51 #include <asm/cputable.h> 50 52 #include <asm/cacheflush.h> 51 53 #include <asm/tlbflush.h> ··· 647 645 unsigned long stolen; 648 646 unsigned long core_stolen; 649 647 u64 now; 648 + unsigned long flags; 650 649 651 650 dt = vcpu->arch.dtl_ptr; 652 651 vpa = vcpu->arch.vpa.pinned_addr; ··· 655 652 core_stolen = vcore_stolen_time(vc, now); 656 653 stolen = core_stolen - vcpu->arch.stolen_logged; 657 654 vcpu->arch.stolen_logged = core_stolen; 658 - spin_lock_irq(&vcpu->arch.tbacct_lock); 655 + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); 659 656 stolen += vcpu->arch.busy_stolen; 660 657 vcpu->arch.busy_stolen = 0; 661 - spin_unlock_irq(&vcpu->arch.tbacct_lock); 658 + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); 662 659 if (!dt || !vpa) 663 660 return; 664 661 memset(dt, 0, sizeof(struct dtl_entry)); ··· 676 673 smp_wmb(); 677 674 vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index); 678 675 vcpu->arch.dtl.dirty = true; 676 + } 677 + 678 + /* See if there is a doorbell interrupt pending for a vcpu */ 679 + static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) 680 + { 681 + int thr; 682 + struct kvmppc_vcore *vc; 683 + 684 + if (vcpu->arch.doorbell_request) 685 + return true; 686 + /* 687 + * Ensure that the read of vcore->dpdes comes after the read 688 + * of vcpu->doorbell_request. This barrier matches the 689 + * lwsync in book3s_hv_rmhandlers.S just before the 690 + * fast_guest_return label. 
691 + */ 692 + smp_rmb(); 693 + vc = vcpu->arch.vcore; 694 + thr = vcpu->vcpu_id - vc->first_vcpuid; 695 + return !!(vc->dpdes & (1 << thr)); 679 696 } 680 697 681 698 static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) ··· 949 926 } 950 927 } 951 928 929 + static void do_nothing(void *x) 930 + { 931 + } 932 + 933 + static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu) 934 + { 935 + int thr, cpu, pcpu, nthreads; 936 + struct kvm_vcpu *v; 937 + unsigned long dpdes; 938 + 939 + nthreads = vcpu->kvm->arch.emul_smt_mode; 940 + dpdes = 0; 941 + cpu = vcpu->vcpu_id & ~(nthreads - 1); 942 + for (thr = 0; thr < nthreads; ++thr, ++cpu) { 943 + v = kvmppc_find_vcpu(vcpu->kvm, cpu); 944 + if (!v) 945 + continue; 946 + /* 947 + * If the vcpu is currently running on a physical cpu thread, 948 + * interrupt it in order to pull it out of the guest briefly, 949 + * which will update its vcore->dpdes value. 950 + */ 951 + pcpu = READ_ONCE(v->cpu); 952 + if (pcpu >= 0) 953 + smp_call_function_single(pcpu, do_nothing, NULL, 1); 954 + if (kvmppc_doorbell_pending(v)) 955 + dpdes |= 1 << thr; 956 + } 957 + return dpdes; 958 + } 959 + 960 + /* 961 + * On POWER9, emulate doorbell-related instructions in order to 962 + * give the guest the illusion of running on a multi-threaded core. 963 + * The instructions emulated are msgsndp, msgclrp, mfspr TIR, 964 + * and mfspr DPDES. 
965 + */ 966 + static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) 967 + { 968 + u32 inst, rb, thr; 969 + unsigned long arg; 970 + struct kvm *kvm = vcpu->kvm; 971 + struct kvm_vcpu *tvcpu; 972 + 973 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) 974 + return EMULATE_FAIL; 975 + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE) 976 + return RESUME_GUEST; 977 + if (get_op(inst) != 31) 978 + return EMULATE_FAIL; 979 + rb = get_rb(inst); 980 + thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1); 981 + switch (get_xop(inst)) { 982 + case OP_31_XOP_MSGSNDP: 983 + arg = kvmppc_get_gpr(vcpu, rb); 984 + if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) 985 + break; 986 + arg &= 0x3f; 987 + if (arg >= kvm->arch.emul_smt_mode) 988 + break; 989 + tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg); 990 + if (!tvcpu) 991 + break; 992 + if (!tvcpu->arch.doorbell_request) { 993 + tvcpu->arch.doorbell_request = 1; 994 + kvmppc_fast_vcpu_kick_hv(tvcpu); 995 + } 996 + break; 997 + case OP_31_XOP_MSGCLRP: 998 + arg = kvmppc_get_gpr(vcpu, rb); 999 + if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) 1000 + break; 1001 + vcpu->arch.vcore->dpdes = 0; 1002 + vcpu->arch.doorbell_request = 0; 1003 + break; 1004 + case OP_31_XOP_MFSPR: 1005 + switch (get_sprn(inst)) { 1006 + case SPRN_TIR: 1007 + arg = thr; 1008 + break; 1009 + case SPRN_DPDES: 1010 + arg = kvmppc_read_dpdes(vcpu); 1011 + break; 1012 + default: 1013 + return EMULATE_FAIL; 1014 + } 1015 + kvmppc_set_gpr(vcpu, get_rt(inst), arg); 1016 + break; 1017 + default: 1018 + return EMULATE_FAIL; 1019 + } 1020 + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); 1021 + return RESUME_GUEST; 1022 + } 1023 + 952 1024 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 953 1025 struct task_struct *tsk) 954 1026 { ··· 1089 971 r = RESUME_GUEST; 1090 972 break; 1091 973 case BOOK3S_INTERRUPT_MACHINE_CHECK: 1092 - /* 1093 - * Deliver a machine check interrupt to the guest. 
1094 - * We have to do this, even if the host has handled the 1095 - * machine check, because machine checks use SRR0/1 and 1096 - * the interrupt might have trashed guest state in them. 1097 - */ 1098 - kvmppc_book3s_queue_irqprio(vcpu, 1099 - BOOK3S_INTERRUPT_MACHINE_CHECK); 1100 - r = RESUME_GUEST; 974 + /* Exit to guest with KVM_EXIT_NMI as exit reason */ 975 + run->exit_reason = KVM_EXIT_NMI; 976 + run->hw.hardware_exit_reason = vcpu->arch.trap; 977 + /* Clear out the old NMI status from run->flags */ 978 + run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK; 979 + /* Now set the NMI status */ 980 + if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED) 981 + run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; 982 + else 983 + run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; 984 + 985 + r = RESUME_HOST; 986 + /* Print the MCE event to host console. */ 987 + machine_check_print_event_info(&vcpu->arch.mce_evt, false); 1101 988 break; 1102 989 case BOOK3S_INTERRUPT_PROGRAM: 1103 990 { ··· 1171 1048 break; 1172 1049 /* 1173 1050 * This occurs if the guest (kernel or userspace), does something that 1174 - * is prohibited by HFSCR. We just generate a program interrupt to 1175 - * the guest. 1051 + * is prohibited by HFSCR. 1052 + * On POWER9, this could be a doorbell instruction that we need 1053 + * to emulate. 1054 + * Otherwise, we just generate a program interrupt to the guest. 
1176 1055 */ 1177 1056 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: 1178 - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1179 - r = RESUME_GUEST; 1057 + r = EMULATE_FAIL; 1058 + if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) 1059 + r = kvmppc_emulate_doorbell_instr(vcpu); 1060 + if (r == EMULATE_FAIL) { 1061 + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1062 + r = RESUME_GUEST; 1063 + } 1180 1064 break; 1181 1065 case BOOK3S_INTERRUPT_HV_RM_HARD: 1182 1066 r = RESUME_PASSTHROUGH; ··· 1273 1143 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; 1274 1144 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 1275 1145 mask |= LPCR_AIL; 1146 + /* 1147 + * On POWER9, allow userspace to enable large decrementer for the 1148 + * guest, whether or not the host has it enabled. 1149 + */ 1150 + if (cpu_has_feature(CPU_FTR_ARCH_300)) 1151 + mask |= LPCR_LD; 1276 1152 1277 1153 /* Broken 32-bit version of LPCR must not clear top bits */ 1278 1154 if (preserve_top32) ··· 1622 1486 r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); 1623 1487 break; 1624 1488 case KVM_REG_PPC_TB_OFFSET: 1489 + /* 1490 + * POWER9 DD1 has an erratum where writing TBU40 causes 1491 + * the timebase to lose ticks. So we don't let the 1492 + * timebase offset be changed on P9 DD1. (It is 1493 + * initialized to zero.) 
1494 + */ 1495 + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) 1496 + break; 1625 1497 /* round up to multiple of 2^24 */ 1626 1498 vcpu->arch.vcore->tb_offset = 1627 1499 ALIGN(set_reg_val(id, *val), 1UL << 24); ··· 1747 1603 init_swait_queue_head(&vcore->wq); 1748 1604 vcore->preempt_tb = TB_NIL; 1749 1605 vcore->lpcr = kvm->arch.lpcr; 1750 - vcore->first_vcpuid = core * threads_per_vcore(); 1606 + vcore->first_vcpuid = core * kvm->arch.smt_mode; 1751 1607 vcore->kvm = kvm; 1752 1608 INIT_LIST_HEAD(&vcore->preempt_list); 1753 1609 ··· 1906 1762 unsigned int id) 1907 1763 { 1908 1764 struct kvm_vcpu *vcpu; 1909 - int err = -EINVAL; 1765 + int err; 1910 1766 int core; 1911 1767 struct kvmppc_vcore *vcore; 1912 - 1913 - core = id / threads_per_vcore(); 1914 - if (core >= KVM_MAX_VCORES) 1915 - goto out; 1916 1768 1917 1769 err = -ENOMEM; 1918 1770 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); ··· 1940 1800 vcpu->arch.busy_preempt = TB_NIL; 1941 1801 vcpu->arch.intr_msr = MSR_SF | MSR_ME; 1942 1802 1803 + /* 1804 + * Set the default HFSCR for the guest from the host value. 1805 + * This value is only used on POWER9. 1806 + * On POWER9 DD1, TM doesn't work, so we make sure to 1807 + * prevent the guest from using it. 1808 + * On POWER9, we want to virtualize the doorbell facility, so we 1809 + * turn off the HFSCR bit, which causes those instructions to trap. 
1810 + */ 1811 + vcpu->arch.hfscr = mfspr(SPRN_HFSCR); 1812 + if (!cpu_has_feature(CPU_FTR_TM)) 1813 + vcpu->arch.hfscr &= ~HFSCR_TM; 1814 + if (cpu_has_feature(CPU_FTR_ARCH_300)) 1815 + vcpu->arch.hfscr &= ~HFSCR_MSGP; 1816 + 1943 1817 kvmppc_mmu_book3s_hv_init(vcpu); 1944 1818 1945 1819 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; ··· 1961 1807 init_waitqueue_head(&vcpu->arch.cpu_run); 1962 1808 1963 1809 mutex_lock(&kvm->lock); 1964 - vcore = kvm->arch.vcores[core]; 1965 - if (!vcore) { 1966 - vcore = kvmppc_vcore_create(kvm, core); 1967 - kvm->arch.vcores[core] = vcore; 1968 - kvm->arch.online_vcores++; 1810 + vcore = NULL; 1811 + err = -EINVAL; 1812 + core = id / kvm->arch.smt_mode; 1813 + if (core < KVM_MAX_VCORES) { 1814 + vcore = kvm->arch.vcores[core]; 1815 + if (!vcore) { 1816 + err = -ENOMEM; 1817 + vcore = kvmppc_vcore_create(kvm, core); 1818 + kvm->arch.vcores[core] = vcore; 1819 + kvm->arch.online_vcores++; 1820 + } 1969 1821 } 1970 1822 mutex_unlock(&kvm->lock); 1971 1823 ··· 1997 1837 kmem_cache_free(kvm_vcpu_cache, vcpu); 1998 1838 out: 1999 1839 return ERR_PTR(err); 1840 + } 1841 + 1842 + static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, 1843 + unsigned long flags) 1844 + { 1845 + int err; 1846 + int esmt = 0; 1847 + 1848 + if (flags) 1849 + return -EINVAL; 1850 + if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode)) 1851 + return -EINVAL; 1852 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 1853 + /* 1854 + * On POWER8 (or POWER7), the threading mode is "strict", 1855 + * so we pack smt_mode vcpus per vcore. 1856 + */ 1857 + if (smt_mode > threads_per_subcore) 1858 + return -EINVAL; 1859 + } else { 1860 + /* 1861 + * On POWER9, the threading mode is "loose", 1862 + * so each vcpu gets its own vcore. 
1863 + */ 1864 + esmt = smt_mode; 1865 + smt_mode = 1; 1866 + } 1867 + mutex_lock(&kvm->lock); 1868 + err = -EBUSY; 1869 + if (!kvm->arch.online_vcores) { 1870 + kvm->arch.smt_mode = smt_mode; 1871 + kvm->arch.emul_smt_mode = esmt; 1872 + err = 0; 1873 + } 1874 + mutex_unlock(&kvm->lock); 1875 + 1876 + return err; 2000 1877 } 2001 1878 2002 1879 static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) ··· 2086 1889 } 2087 1890 } 2088 1891 2089 - extern void __kvmppc_vcore_entry(void); 1892 + extern int __kvmppc_vcore_entry(void); 2090 1893 2091 1894 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 2092 1895 struct kvm_vcpu *vcpu) ··· 2151 1954 tpaca->kvm_hstate.kvm_split_mode = NULL; 2152 1955 } 2153 1956 2154 - static void do_nothing(void *x) 2155 - { 2156 - } 2157 - 2158 1957 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) 2159 1958 { 2160 1959 int i; ··· 2168 1975 smp_call_function_single(cpu + i, do_nothing, NULL, 1); 2169 1976 } 2170 1977 1978 + static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) 1979 + { 1980 + struct kvm *kvm = vcpu->kvm; 1981 + 1982 + /* 1983 + * With radix, the guest can do TLB invalidations itself, 1984 + * and it could choose to use the local form (tlbiel) if 1985 + * it is invalidating a translation that has only ever been 1986 + * used on one vcpu. However, that doesn't mean it has 1987 + * only ever been used on one physical cpu, since vcpus 1988 + * can move around between pcpus. To cope with this, when 1989 + * a vcpu moves from one pcpu to another, we need to tell 1990 + * any vcpus running on the same core as this vcpu previously 1991 + * ran to flush the TLB. The TLB is shared between threads, 1992 + * so we use a single bit in .need_tlb_flush for all 4 threads. 
1993 + */ 1994 + if (vcpu->arch.prev_cpu != pcpu) { 1995 + if (vcpu->arch.prev_cpu >= 0 && 1996 + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != 1997 + cpu_first_thread_sibling(pcpu)) 1998 + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); 1999 + vcpu->arch.prev_cpu = pcpu; 2000 + } 2001 + } 2002 + 2171 2003 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 2172 2004 { 2173 2005 int cpu; 2174 2006 struct paca_struct *tpaca; 2175 - struct kvmppc_vcore *mvc = vc->master_vcore; 2176 2007 struct kvm *kvm = vc->kvm; 2177 2008 2178 2009 cpu = vc->pcpu; ··· 2206 1989 vcpu->arch.timer_running = 0; 2207 1990 } 2208 1991 cpu += vcpu->arch.ptid; 2209 - vcpu->cpu = mvc->pcpu; 1992 + vcpu->cpu = vc->pcpu; 2210 1993 vcpu->arch.thread_cpu = cpu; 2211 - 2212 - /* 2213 - * With radix, the guest can do TLB invalidations itself, 2214 - * and it could choose to use the local form (tlbiel) if 2215 - * it is invalidating a translation that has only ever been 2216 - * used on one vcpu. However, that doesn't mean it has 2217 - * only ever been used on one physical cpu, since vcpus 2218 - * can move around between pcpus. To cope with this, when 2219 - * a vcpu moves from one pcpu to another, we need to tell 2220 - * any vcpus running on the same core as this vcpu previously 2221 - * ran to flush the TLB. The TLB is shared between threads, 2222 - * so we use a single bit in .need_tlb_flush for all 4 threads. 
2223 - */ 2224 - if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { 2225 - if (vcpu->arch.prev_cpu >= 0 && 2226 - cpu_first_thread_sibling(vcpu->arch.prev_cpu) != 2227 - cpu_first_thread_sibling(cpu)) 2228 - radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); 2229 - vcpu->arch.prev_cpu = cpu; 2230 - } 2231 1994 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); 2232 1995 } 2233 1996 tpaca = &paca[cpu]; 2234 1997 tpaca->kvm_hstate.kvm_vcpu = vcpu; 2235 - tpaca->kvm_hstate.ptid = cpu - mvc->pcpu; 1998 + tpaca->kvm_hstate.ptid = cpu - vc->pcpu; 2236 1999 /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ 2237 2000 smp_wmb(); 2238 - tpaca->kvm_hstate.kvm_vcore = mvc; 2001 + tpaca->kvm_hstate.kvm_vcore = vc; 2239 2002 if (cpu != smp_processor_id()) 2240 2003 kvmppc_ipi_thread(cpu); 2241 2004 } ··· 2344 2147 int max_subcore_threads; 2345 2148 int total_threads; 2346 2149 int subcore_threads[MAX_SUBCORES]; 2347 - struct kvm *subcore_vm[MAX_SUBCORES]; 2348 - struct list_head vcs[MAX_SUBCORES]; 2150 + struct kvmppc_vcore *vc[MAX_SUBCORES]; 2349 2151 }; 2350 2152 2351 2153 /* ··· 2355 2159 2356 2160 static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) 2357 2161 { 2358 - int sub; 2359 - 2360 2162 memset(cip, 0, sizeof(*cip)); 2361 2163 cip->n_subcores = 1; 2362 2164 cip->max_subcore_threads = vc->num_threads; 2363 2165 cip->total_threads = vc->num_threads; 2364 2166 cip->subcore_threads[0] = vc->num_threads; 2365 - cip->subcore_vm[0] = vc->kvm; 2366 - for (sub = 0; sub < MAX_SUBCORES; ++sub) 2367 - INIT_LIST_HEAD(&cip->vcs[sub]); 2368 - list_add_tail(&vc->preempt_list, &cip->vcs[0]); 2167 + cip->vc[0] = vc; 2369 2168 } 2370 2169 2371 2170 static bool subcore_config_ok(int n_subcores, int n_threads) ··· 2380 2189 return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; 2381 2190 } 2382 2191 2383 - static void init_master_vcore(struct kvmppc_vcore *vc) 2192 + static void init_vcore_to_run(struct kvmppc_vcore *vc) 2384 2193 { 
2385 - vc->master_vcore = vc; 2386 2194 vc->entry_exit_map = 0; 2387 2195 vc->in_guest = 0; 2388 2196 vc->napping_threads = 0; ··· 2406 2216 ++cip->n_subcores; 2407 2217 cip->total_threads += vc->num_threads; 2408 2218 cip->subcore_threads[sub] = vc->num_threads; 2409 - cip->subcore_vm[sub] = vc->kvm; 2410 - init_master_vcore(vc); 2411 - list_move_tail(&vc->preempt_list, &cip->vcs[sub]); 2219 + cip->vc[sub] = vc; 2220 + init_vcore_to_run(vc); 2221 + list_del_init(&vc->preempt_list); 2412 2222 2413 2223 return true; 2414 2224 } ··· 2476 2286 spin_unlock(&lp->lock); 2477 2287 } 2478 2288 2289 + static bool recheck_signals(struct core_info *cip) 2290 + { 2291 + int sub, i; 2292 + struct kvm_vcpu *vcpu; 2293 + 2294 + for (sub = 0; sub < cip->n_subcores; ++sub) 2295 + for_each_runnable_thread(i, vcpu, cip->vc[sub]) 2296 + if (signal_pending(vcpu->arch.run_task)) 2297 + return true; 2298 + return false; 2299 + } 2300 + 2479 2301 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) 2480 2302 { 2481 2303 int still_running = 0, i; ··· 2525 2323 wake_up(&vcpu->arch.cpu_run); 2526 2324 } 2527 2325 } 2528 - list_del_init(&vc->preempt_list); 2529 2326 if (!is_master) { 2530 2327 if (still_running > 0) { 2531 2328 kvmppc_vcore_preempt(vc); ··· 2586 2385 return 0; 2587 2386 } 2588 2387 2388 + static void set_irq_happened(int trap) 2389 + { 2390 + switch (trap) { 2391 + case BOOK3S_INTERRUPT_EXTERNAL: 2392 + local_paca->irq_happened |= PACA_IRQ_EE; 2393 + break; 2394 + case BOOK3S_INTERRUPT_H_DOORBELL: 2395 + local_paca->irq_happened |= PACA_IRQ_DBELL; 2396 + break; 2397 + case BOOK3S_INTERRUPT_HMI: 2398 + local_paca->irq_happened |= PACA_IRQ_HMI; 2399 + break; 2400 + } 2401 + } 2402 + 2589 2403 /* 2590 2404 * Run a set of guest threads on a physical core. 2591 2405 * Called with vc->lock held. 
··· 2611 2395 int i; 2612 2396 int srcu_idx; 2613 2397 struct core_info core_info; 2614 - struct kvmppc_vcore *pvc, *vcnext; 2398 + struct kvmppc_vcore *pvc; 2615 2399 struct kvm_split_mode split_info, *sip; 2616 2400 int split, subcore_size, active; 2617 2401 int sub; ··· 2620 2404 int pcpu, thr; 2621 2405 int target_threads; 2622 2406 int controlled_threads; 2407 + int trap; 2623 2408 2624 2409 /* 2625 2410 * Remove from the list any threads that have a signal pending ··· 2635 2418 /* 2636 2419 * Initialize *vc. 2637 2420 */ 2638 - init_master_vcore(vc); 2421 + init_vcore_to_run(vc); 2639 2422 vc->preempt_tb = TB_NIL; 2640 2423 2641 2424 /* ··· 2672 2455 if (vc->num_threads < target_threads) 2673 2456 collect_piggybacks(&core_info, target_threads); 2674 2457 2458 + /* 2459 + * On radix, arrange for TLB flushing if necessary. 2460 + * This has to be done before disabling interrupts since 2461 + * it uses smp_call_function(). 2462 + */ 2463 + pcpu = smp_processor_id(); 2464 + if (kvm_is_radix(vc->kvm)) { 2465 + for (sub = 0; sub < core_info.n_subcores; ++sub) 2466 + for_each_runnable_thread(i, vcpu, core_info.vc[sub]) 2467 + kvmppc_prepare_radix_vcpu(vcpu, pcpu); 2468 + } 2469 + 2470 + /* 2471 + * Hard-disable interrupts, and check resched flag and signals. 2472 + * If we need to reschedule or deliver a signal, clean up 2473 + * and return without going into the guest(s). 
2474 + */ 2475 + local_irq_disable(); 2476 + hard_irq_disable(); 2477 + if (lazy_irq_pending() || need_resched() || 2478 + recheck_signals(&core_info)) { 2479 + local_irq_enable(); 2480 + vc->vcore_state = VCORE_INACTIVE; 2481 + /* Unlock all except the primary vcore */ 2482 + for (sub = 1; sub < core_info.n_subcores; ++sub) { 2483 + pvc = core_info.vc[sub]; 2484 + /* Put back on to the preempted vcores list */ 2485 + kvmppc_vcore_preempt(pvc); 2486 + spin_unlock(&pvc->lock); 2487 + } 2488 + for (i = 0; i < controlled_threads; ++i) 2489 + kvmppc_release_hwthread(pcpu + i); 2490 + return; 2491 + } 2492 + 2493 + kvmppc_clear_host_core(pcpu); 2494 + 2675 2495 /* Decide on micro-threading (split-core) mode */ 2676 2496 subcore_size = threads_per_subcore; 2677 2497 cmd_bit = stat_bit = 0; ··· 2732 2478 split_info.ldbar = mfspr(SPRN_LDBAR); 2733 2479 split_info.subcore_size = subcore_size; 2734 2480 for (sub = 0; sub < core_info.n_subcores; ++sub) 2735 - split_info.master_vcs[sub] = 2736 - list_first_entry(&core_info.vcs[sub], 2737 - struct kvmppc_vcore, preempt_list); 2481 + split_info.vc[sub] = core_info.vc[sub]; 2738 2482 /* order writes to split_info before kvm_split_mode pointer */ 2739 2483 smp_wmb(); 2740 2484 } 2741 - pcpu = smp_processor_id(); 2742 2485 for (thr = 0; thr < controlled_threads; ++thr) 2743 2486 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2744 2487 ··· 2755 2504 } 2756 2505 } 2757 2506 2758 - kvmppc_clear_host_core(pcpu); 2759 - 2760 2507 /* Start all the threads */ 2761 2508 active = 0; 2762 2509 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2763 2510 thr = subcore_thread_map[sub]; 2764 2511 thr0_done = false; 2765 2512 active |= 1 << thr; 2766 - list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { 2767 - pvc->pcpu = pcpu + thr; 2768 - for_each_runnable_thread(i, vcpu, pvc) { 2769 - kvmppc_start_thread(vcpu, pvc); 2770 - kvmppc_create_dtl_entry(vcpu, pvc); 2771 - trace_kvm_guest_enter(vcpu); 2772 - if (!vcpu->arch.ptid) 2773 - 
thr0_done = true; 2774 - active |= 1 << (thr + vcpu->arch.ptid); 2775 - } 2776 - /* 2777 - * We need to start the first thread of each subcore 2778 - * even if it doesn't have a vcpu. 2779 - */ 2780 - if (pvc->master_vcore == pvc && !thr0_done) 2781 - kvmppc_start_thread(NULL, pvc); 2782 - thr += pvc->num_threads; 2513 + pvc = core_info.vc[sub]; 2514 + pvc->pcpu = pcpu + thr; 2515 + for_each_runnable_thread(i, vcpu, pvc) { 2516 + kvmppc_start_thread(vcpu, pvc); 2517 + kvmppc_create_dtl_entry(vcpu, pvc); 2518 + trace_kvm_guest_enter(vcpu); 2519 + if (!vcpu->arch.ptid) 2520 + thr0_done = true; 2521 + active |= 1 << (thr + vcpu->arch.ptid); 2783 2522 } 2523 + /* 2524 + * We need to start the first thread of each subcore 2525 + * even if it doesn't have a vcpu. 2526 + */ 2527 + if (!thr0_done) 2528 + kvmppc_start_thread(NULL, pvc); 2529 + thr += pvc->num_threads; 2784 2530 } 2785 2531 2786 2532 /* ··· 2804 2556 trace_kvmppc_run_core(vc, 0); 2805 2557 2806 2558 for (sub = 0; sub < core_info.n_subcores; ++sub) 2807 - list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) 2808 - spin_unlock(&pvc->lock); 2559 + spin_unlock(&core_info.vc[sub]->lock); 2560 + 2561 + /* 2562 + * Interrupts will be enabled once we get into the guest, 2563 + * so tell lockdep that we're about to enable interrupts. 
2564 + */ 2565 + trace_hardirqs_on(); 2809 2566 2810 2567 guest_enter(); 2811 2568 2812 2569 srcu_idx = srcu_read_lock(&vc->kvm->srcu); 2813 2570 2814 - __kvmppc_vcore_entry(); 2571 + trap = __kvmppc_vcore_entry(); 2815 2572 2816 2573 srcu_read_unlock(&vc->kvm->srcu, srcu_idx); 2574 + 2575 + guest_exit(); 2576 + 2577 + trace_hardirqs_off(); 2578 + set_irq_happened(trap); 2817 2579 2818 2580 spin_lock(&vc->lock); 2819 2581 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ ··· 2852 2594 split_info.do_nap = 0; 2853 2595 } 2854 2596 2597 + kvmppc_set_host_core(pcpu); 2598 + 2599 + local_irq_enable(); 2600 + 2855 2601 /* Let secondaries go back to the offline loop */ 2856 2602 for (i = 0; i < controlled_threads; ++i) { 2857 2603 kvmppc_release_hwthread(pcpu + i); ··· 2864 2602 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); 2865 2603 } 2866 2604 2867 - kvmppc_set_host_core(pcpu); 2868 - 2869 2605 spin_unlock(&vc->lock); 2870 2606 2871 2607 /* make sure updates to secondary vcpu structs are visible now */ 2872 2608 smp_mb(); 2873 - guest_exit(); 2874 2609 2875 - for (sub = 0; sub < core_info.n_subcores; ++sub) 2876 - list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], 2877 - preempt_list) 2878 - post_guest_process(pvc, pvc == vc); 2610 + for (sub = 0; sub < core_info.n_subcores; ++sub) { 2611 + pvc = core_info.vc[sub]; 2612 + post_guest_process(pvc, pvc == vc); 2613 + } 2879 2614 2880 2615 spin_lock(&vc->lock); 2881 2616 preempt_enable(); ··· 2917 2658 vc->halt_poll_ns /= halt_poll_ns_shrink; 2918 2659 } 2919 2660 2661 + #ifdef CONFIG_KVM_XICS 2662 + static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) 2663 + { 2664 + if (!xive_enabled()) 2665 + return false; 2666 + return vcpu->arch.xive_saved_state.pipr < 2667 + vcpu->arch.xive_saved_state.cppr; 2668 + } 2669 + #else 2670 + static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) 2671 + { 2672 + return false; 2673 + } 2674 + #endif /* CONFIG_KVM_XICS */ 2675 + 
2676 + static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu) 2677 + { 2678 + if (vcpu->arch.pending_exceptions || vcpu->arch.prodded || 2679 + kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu)) 2680 + return true; 2681 + 2682 + return false; 2683 + } 2684 + 2920 2685 /* 2921 2686 * Check to see if any of the runnable vcpus on the vcore have pending 2922 2687 * exceptions or are no longer ceded ··· 2951 2668 int i; 2952 2669 2953 2670 for_each_runnable_thread(i, vcpu, vc) { 2954 - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || 2955 - vcpu->arch.prodded) 2671 + if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu)) 2956 2672 return 1; 2957 2673 } 2958 2674 ··· 3093 2811 */ 3094 2812 if (!signal_pending(current)) { 3095 2813 if (vc->vcore_state == VCORE_PIGGYBACK) { 3096 - struct kvmppc_vcore *mvc = vc->master_vcore; 3097 - if (spin_trylock(&mvc->lock)) { 3098 - if (mvc->vcore_state == VCORE_RUNNING && 3099 - !VCORE_IS_EXITING(mvc)) { 2814 + if (spin_trylock(&vc->lock)) { 2815 + if (vc->vcore_state == VCORE_RUNNING && 2816 + !VCORE_IS_EXITING(vc)) { 3100 2817 kvmppc_create_dtl_entry(vcpu, vc); 3101 2818 kvmppc_start_thread(vcpu, vc); 3102 2819 trace_kvm_guest_enter(vcpu); 3103 2820 } 3104 - spin_unlock(&mvc->lock); 2821 + spin_unlock(&vc->lock); 3105 2822 } 3106 2823 } else if (vc->vcore_state == VCORE_RUNNING && 3107 2824 !VCORE_IS_EXITING(vc)) { ··· 3136 2855 break; 3137 2856 n_ceded = 0; 3138 2857 for_each_runnable_thread(i, v, vc) { 3139 - if (!v->arch.pending_exceptions && !v->arch.prodded) 2858 + if (!kvmppc_vcpu_woken(v)) 3140 2859 n_ceded += v->arch.ceded; 3141 2860 else 3142 2861 v->arch.ceded = 0; ··· 3188 2907 { 3189 2908 int r; 3190 2909 int srcu_idx; 2910 + unsigned long ebb_regs[3] = {}; /* shut up GCC */ 2911 + unsigned long user_tar = 0; 2912 + unsigned int user_vrsave; 3191 2913 3192 2914 if (!vcpu->arch.sane) { 3193 2915 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3194 2916 return -EINVAL; 3195 2917 } 2918 + 2919 + /* 2920 + * Don't 
allow entry with a suspended transaction, because 2921 + * the guest entry/exit code will lose it. 2922 + * If the guest has TM enabled, save away their TM-related SPRs 2923 + * (they will get restored by the TM unavailable interrupt). 2924 + */ 2925 + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2926 + if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs && 2927 + (current->thread.regs->msr & MSR_TM)) { 2928 + if (MSR_TM_ACTIVE(current->thread.regs->msr)) { 2929 + run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2930 + run->fail_entry.hardware_entry_failure_reason = 0; 2931 + return -EINVAL; 2932 + } 2933 + current->thread.tm_tfhar = mfspr(SPRN_TFHAR); 2934 + current->thread.tm_tfiar = mfspr(SPRN_TFIAR); 2935 + current->thread.tm_texasr = mfspr(SPRN_TEXASR); 2936 + current->thread.regs->msr &= ~MSR_TM; 2937 + } 2938 + #endif 3196 2939 3197 2940 kvmppc_core_prepare_to_enter(vcpu); 3198 2941 ··· 3238 2933 } 3239 2934 3240 2935 flush_all_to_thread(current); 2936 + 2937 + /* Save userspace EBB and other register values */ 2938 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 2939 + ebb_regs[0] = mfspr(SPRN_EBBHR); 2940 + ebb_regs[1] = mfspr(SPRN_EBBRR); 2941 + ebb_regs[2] = mfspr(SPRN_BESCR); 2942 + user_tar = mfspr(SPRN_TAR); 2943 + } 2944 + user_vrsave = mfspr(SPRN_VRSAVE); 3241 2945 3242 2946 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 3243 2947 vcpu->arch.pgdir = current->mm->pgd; ··· 3273 2959 r = kvmppc_xics_rm_complete(vcpu, 0); 3274 2960 } 3275 2961 } while (is_kvmppc_resume_guest(r)); 2962 + 2963 + /* Restore userspace EBB and other register values */ 2964 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 2965 + mtspr(SPRN_EBBHR, ebb_regs[0]); 2966 + mtspr(SPRN_EBBRR, ebb_regs[1]); 2967 + mtspr(SPRN_BESCR, ebb_regs[2]); 2968 + mtspr(SPRN_TAR, user_tar); 2969 + mtspr(SPRN_FSCR, current->thread.fscr); 2970 + } 2971 + mtspr(SPRN_VRSAVE, user_vrsave); 3276 2972 3277 2973 out: 3278 2974 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; ··· 3792 3468 kvm_hv_vm_activated(); 3793 3469 3794 3470 /* 3471 + 
* Initialize smt_mode depending on processor. 3472 + * POWER8 and earlier have to use "strict" threading, where 3473 + * all vCPUs in a vcore have to run on the same (sub)core, 3474 + * whereas on POWER9 the threads can each run a different 3475 + * guest. 3476 + */ 3477 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3478 + kvm->arch.smt_mode = threads_per_subcore; 3479 + else 3480 + kvm->arch.smt_mode = 1; 3481 + kvm->arch.emul_smt_mode = 1; 3482 + 3483 + /* 3795 3484 * Create a debugfs directory for the VM 3796 3485 */ 3797 3486 snprintf(buf, sizeof(buf), "vm%d", current->pid); ··· 4233 3896 #endif 4234 3897 .configure_mmu = kvmhv_configure_mmu, 4235 3898 .get_rmmu_info = kvmhv_get_rmmu_info, 3899 + .set_smt_mode = kvmhv_set_smt_mode, 4236 3900 }; 4237 3901 4238 3902 static int kvm_init_subcore_bitmap(void)
+1 -1
arch/powerpc/kvm/book3s_hv_builtin.c
··· 307 307 return; 308 308 309 309 for (i = 0; i < MAX_SUBCORES; ++i) { 310 - vc = sip->master_vcs[i]; 310 + vc = sip->vc[i]; 311 311 if (!vc) 312 312 break; 313 313 do {
+12 -8
arch/powerpc/kvm/book3s_hv_interrupts.S
··· 61 61 std r3, HSTATE_DABR(r13) 62 62 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) 63 63 64 - /* Hard-disable interrupts */ 65 - mfmsr r10 66 - std r10, HSTATE_HOST_MSR(r13) 67 - rldicl r10,r10,48,1 68 - rotldi r10,r10,16 69 - mtmsrd r10,1 70 - 71 64 /* Save host PMU registers */ 72 65 BEGIN_FTR_SECTION 73 66 /* Work around P8 PMAE bug */ ··· 114 121 * Put whatever is in the decrementer into the 115 122 * hypervisor decrementer. 116 123 */ 124 + BEGIN_FTR_SECTION 125 + ld r5, HSTATE_KVM_VCORE(r13) 126 + ld r6, VCORE_KVM(r5) 127 + ld r9, KVM_HOST_LPCR(r6) 128 + andis. r9, r9, LPCR_LD@h 129 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 117 130 mfspr r8,SPRN_DEC 118 131 mftb r7 119 - mtspr SPRN_HDEC,r8 132 + BEGIN_FTR_SECTION 133 + /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */ 134 + bne 32f 135 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 120 136 extsw r8,r8 137 + 32: mtspr SPRN_HDEC,r8 121 138 add r8,r8,r7 122 139 std r8,HSTATE_DECEXP(r13) 123 140 ··· 146 143 * 147 144 * R1 = host R1 148 145 * R2 = host R2 146 + * R3 = trap number on this thread 149 147 * R12 = exit handler id 150 148 * R13 = PACA 151 149 */
+17 -1
arch/powerpc/kvm/book3s_hv_ras.c
··· 130 130 131 131 out: 132 132 /* 133 + * For guest that supports FWNMI capability, hook the MCE event into 134 + * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI 135 + * exit reason. On our way to exit we will pull this event from vcpu 136 + * structure and print it from thread 0 of the core/subcore. 137 + * 138 + * For guest that does not support FWNMI capability (old QEMU): 133 139 * We are now going to enter guest either through machine check 134 140 * interrupt (for unhandled errors) or will continue from 135 141 * current HSRR0 (for handled errors) in guest. Hence 136 142 * queue up the event so that we can log it from host console later. 137 143 */ 138 - machine_check_queue_event(); 144 + if (vcpu->kvm->arch.fwnmi_enabled) { 145 + /* 146 + * Hook up the mce event on to vcpu structure. 147 + * First clear the old event. 148 + */ 149 + memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); 150 + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { 151 + vcpu->arch.mce_evt = mce_evt; 152 + } 153 + } else 154 + machine_check_queue_event(); 139 155 140 156 return handled; 141 157 }
+153 -83
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 32 32 #include <asm/opal.h> 33 33 #include <asm/xive-regs.h> 34 34 35 + /* Sign-extend HDEC if not on POWER9 */ 36 + #define EXTEND_HDEC(reg) \ 37 + BEGIN_FTR_SECTION; \ 38 + extsw reg, reg; \ 39 + END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) 40 + 35 41 #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) 36 42 37 43 /* Values in HSTATE_NAPPING(r13) */ 38 44 #define NAPPING_CEDE 1 39 45 #define NAPPING_NOVCPU 2 46 + 47 + /* Stack frame offsets for kvmppc_hv_entry */ 48 + #define SFS 160 49 + #define STACK_SLOT_TRAP (SFS-4) 50 + #define STACK_SLOT_TID (SFS-16) 51 + #define STACK_SLOT_PSSCR (SFS-24) 52 + #define STACK_SLOT_PID (SFS-32) 53 + #define STACK_SLOT_IAMR (SFS-40) 54 + #define STACK_SLOT_CIABR (SFS-48) 55 + #define STACK_SLOT_DAWR (SFS-56) 56 + #define STACK_SLOT_DAWRX (SFS-64) 57 + #define STACK_SLOT_HFSCR (SFS-72) 40 58 41 59 /* 42 60 * Call kvmppc_hv_entry in real mode. ··· 69 51 std r0, PPC_LR_STKOFF(r1) 70 52 stdu r1, -112(r1) 71 53 mfmsr r10 54 + std r10, HSTATE_HOST_MSR(r13) 72 55 LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) 73 56 li r0,MSR_RI 74 57 andc r0,r10,r0 ··· 154 135 stb r0, HSTATE_HWTHREAD_REQ(r13) 155 136 156 137 /* 157 - * For external and machine check interrupts, we need 158 - * to call the Linux handler to process the interrupt. 159 - * We do that by jumping to absolute address 0x500 for 160 - * external interrupts, or the machine_check_fwnmi label 161 - * for machine checks (since firmware might have patched 162 - * the vector area at 0x200). The [h]rfid at the end of the 163 - * handler will return to the book3s_hv_interrupts.S code. 164 - * For other interrupts we do the rfid to get back 165 - * to the book3s_hv_interrupts.S code here. 138 + * For external interrupts we need to call the Linux 139 + * handler to process the interrupt. We do that by jumping 140 + * to absolute address 0x500 for external interrupts. 141 + * The [h]rfid at the end of the handler will return to 142 + * the book3s_hv_interrupts.S code. 
For other interrupts 143 + * we do the rfid to get back to the book3s_hv_interrupts.S 144 + * code here. 166 145 */ 167 146 ld r8, 112+PPC_LR_STKOFF(r1) 168 147 addi r1, r1, 112 169 148 ld r7, HSTATE_HOST_MSR(r13) 149 + 150 + /* Return the trap number on this thread as the return value */ 151 + mr r3, r12 170 152 171 153 /* 172 154 * If we came back from the guest via a relocation-on interrupt, ··· 178 158 andi. r0, r0, MSR_IR /* in real mode? */ 179 159 bne .Lvirt_return 180 160 181 - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK 182 - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 183 - beq 11f 184 - cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL 185 - beq 15f /* Invoke the H_DOORBELL handler */ 186 - cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI 187 - beq cr2, 14f /* HMI check */ 188 - 189 - /* RFI into the highmem handler, or branch to interrupt handler */ 161 + /* RFI into the highmem handler */ 190 162 mfmsr r6 191 163 li r0, MSR_RI 192 164 andc r6, r6, r0 193 165 mtmsrd r6, 1 /* Clear RI in MSR */ 194 166 mtsrr0 r8 195 167 mtsrr1 r7 196 - beq cr1, 13f /* machine check */ 197 168 RFI 198 169 199 - /* On POWER7, we have external interrupts set to use HSRR0/1 */ 200 - 11: mtspr SPRN_HSRR0, r8 201 - mtspr SPRN_HSRR1, r7 202 - ba 0x500 203 - 204 - 13: b machine_check_fwnmi 205 - 206 - 14: mtspr SPRN_HSRR0, r8 207 - mtspr SPRN_HSRR1, r7 208 - b hmi_exception_after_realmode 209 - 210 - 15: mtspr SPRN_HSRR0, r8 211 - mtspr SPRN_HSRR1, r7 212 - ba 0xe80 213 - 214 - /* Virtual-mode return - can't get here for HMI or machine check */ 170 + /* Virtual-mode return */ 215 171 .Lvirt_return: 216 - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 217 - beq 16f 218 - cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL 219 - beq 17f 220 - andi. r0, r7, MSR_EE /* were interrupts hard-enabled? 
*/ 221 - beq 18f 222 - mtmsrd r7, 1 /* if so then re-enable them */ 223 - 18: mtlr r8 172 + mtlr r8 224 173 blr 225 - 226 - 16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ 227 - mtspr SPRN_HSRR1, r7 228 - b exc_virt_0x4500_hardware_interrupt 229 - 230 - 17: mtspr SPRN_HSRR0, r8 231 - mtspr SPRN_HSRR1, r7 232 - b exc_virt_0x4e80_h_doorbell 233 174 234 175 kvmppc_primary_no_guest: 235 176 /* We handle this much like a ceded vcpu */ 236 177 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 178 + /* HDEC may be larger than DEC for arch >= v3.00, but since the */ 179 + /* HDEC value came from DEC in the first place, it will fit */ 237 180 mfspr r3, SPRN_HDEC 238 181 mtspr SPRN_DEC, r3 239 182 /* ··· 278 295 279 296 /* See if our timeslice has expired (HDEC is negative) */ 280 297 mfspr r0, SPRN_HDEC 298 + EXTEND_HDEC(r0) 281 299 li r12, BOOK3S_INTERRUPT_HV_DECREMENTER 282 - cmpwi r0, 0 300 + cmpdi r0, 0 283 301 blt kvm_novcpu_exit 284 302 285 303 /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ ··· 303 319 bl kvmhv_accumulate_time 304 320 #endif 305 321 13: mr r3, r12 306 - stw r12, 112-4(r1) 322 + stw r12, STACK_SLOT_TRAP(r1) 307 323 bl kvmhv_commence_exit 308 324 nop 309 - lwz r12, 112-4(r1) 325 + lwz r12, STACK_SLOT_TRAP(r1) 310 326 b kvmhv_switch_to_host 311 327 312 328 /* ··· 374 390 lbz r4, HSTATE_PTID(r13) 375 391 cmpwi r4, 0 376 392 bne 63f 377 - lis r6, 0x7fff 378 - ori r6, r6, 0xffff 393 + LOAD_REG_ADDR(r6, decrementer_max) 394 + ld r6, 0(r6) 379 395 mtspr SPRN_HDEC, r6 380 396 /* and set per-LPAR registers, if doing dynamic micro-threading */ 381 397 ld r6, HSTATE_SPLIT_MODE(r13) ··· 529 545 * * 530 546 *****************************************************************************/ 531 547 532 - /* Stack frame offsets */ 533 - #define STACK_SLOT_TID (112-16) 534 - #define STACK_SLOT_PSSCR (112-24) 535 - #define STACK_SLOT_PID (112-32) 536 - 537 548 .global kvmppc_hv_entry 538 549 kvmppc_hv_entry: 539 
550 ··· 544 565 */ 545 566 mflr r0 546 567 std r0, PPC_LR_STKOFF(r1) 547 - stdu r1, -112(r1) 568 + stdu r1, -SFS(r1) 548 569 549 570 /* Save R1 in the PACA */ 550 571 std r1, HSTATE_HOST_R1(r13) ··· 728 749 mfspr r5, SPRN_TIDR 729 750 mfspr r6, SPRN_PSSCR 730 751 mfspr r7, SPRN_PID 752 + mfspr r8, SPRN_IAMR 731 753 std r5, STACK_SLOT_TID(r1) 732 754 std r6, STACK_SLOT_PSSCR(r1) 733 755 std r7, STACK_SLOT_PID(r1) 756 + std r8, STACK_SLOT_IAMR(r1) 757 + mfspr r5, SPRN_HFSCR 758 + std r5, STACK_SLOT_HFSCR(r1) 734 759 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 760 + BEGIN_FTR_SECTION 761 + mfspr r5, SPRN_CIABR 762 + mfspr r6, SPRN_DAWR 763 + mfspr r7, SPRN_DAWRX 764 + std r5, STACK_SLOT_CIABR(r1) 765 + std r6, STACK_SLOT_DAWR(r1) 766 + std r7, STACK_SLOT_DAWRX(r1) 767 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 735 768 736 769 BEGIN_FTR_SECTION 737 770 /* Set partition DABR */ ··· 886 895 ld r5, VCPU_TID(r4) 887 896 ld r6, VCPU_PSSCR(r4) 888 897 oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */ 898 + ld r7, VCPU_HFSCR(r4) 889 899 mtspr SPRN_TIDR, r5 890 900 mtspr SPRN_PSSCR, r6 901 + mtspr SPRN_HFSCR, r7 891 902 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 892 903 8: 893 904 ··· 904 911 mftb r7 905 912 subf r3,r7,r8 906 913 mtspr SPRN_DEC,r3 907 - stw r3,VCPU_DEC(r4) 914 + std r3,VCPU_DEC(r4) 908 915 909 916 ld r5, VCPU_SPRG0(r4) 910 917 ld r6, VCPU_SPRG1(r4) ··· 961 968 962 969 /* Check if HDEC expires soon */ 963 970 mfspr r3, SPRN_HDEC 964 - cmpwi r3, 512 /* 1 microsecond */ 971 + EXTEND_HDEC(r3) 972 + cmpdi r3, 512 /* 1 microsecond */ 965 973 blt hdec_soon 966 974 967 975 #ifdef CONFIG_KVM_XICS ··· 1016 1022 li r0, BOOK3S_INTERRUPT_EXTERNAL 1017 1023 bne cr1, 12f 1018 1024 mfspr r0, SPRN_DEC 1019 - cmpwi r0, 0 1025 + BEGIN_FTR_SECTION 1026 + /* On POWER9 check whether the guest has large decrementer enabled */ 1027 + andis. 
r8, r8, LPCR_LD@h 1028 + bne 15f 1029 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1030 + extsw r0, r0 1031 + 15: cmpdi r0, 0 1020 1032 li r0, BOOK3S_INTERRUPT_DECREMENTER 1021 1033 bge 5f 1022 1034 ··· 1032 1032 mr r9, r4 1033 1033 bl kvmppc_msr_interrupt 1034 1034 5: 1035 + BEGIN_FTR_SECTION 1036 + b fast_guest_return 1037 + END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) 1038 + /* On POWER9, check for pending doorbell requests */ 1039 + lbz r0, VCPU_DBELL_REQ(r4) 1040 + cmpwi r0, 0 1041 + beq fast_guest_return 1042 + ld r5, HSTATE_KVM_VCORE(r13) 1043 + /* Set DPDES register so the CPU will take a doorbell interrupt */ 1044 + li r0, 1 1045 + mtspr SPRN_DPDES, r0 1046 + std r0, VCORE_DPDES(r5) 1047 + /* Make sure other cpus see vcore->dpdes set before dbell req clear */ 1048 + lwsync 1049 + /* Clear the pending doorbell request */ 1050 + li r0, 0 1051 + stb r0, VCPU_DBELL_REQ(r4) 1035 1052 1036 1053 /* 1037 1054 * Required state: ··· 1223 1206 1224 1207 stw r12,VCPU_TRAP(r9) 1225 1208 1209 + /* 1210 + * Now that we have saved away SRR0/1 and HSRR0/1, 1211 + * interrupts are recoverable in principle, so set MSR_RI. 1212 + * This becomes important for relocation-on interrupts from 1213 + * the guest, which we can get in radix mode on POWER9. 1214 + */ 1215 + li r0, MSR_RI 1216 + mtmsrd r0, 1 1217 + 1226 1218 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1227 1219 addi r3, r9, VCPU_TB_RMINTR 1228 1220 mr r4, r9 ··· 1288 1262 beq 4f 1289 1263 b guest_exit_cont 1290 1264 3: 1265 + /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ 1266 + cmpwi r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL 1267 + bne 14f 1268 + mfspr r3, SPRN_HFSCR 1269 + std r3, VCPU_HFSCR(r9) 1270 + b guest_exit_cont 1271 + 14: 1291 1272 /* External interrupt ? 
*/ 1292 1273 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1293 1274 bne+ guest_exit_cont ··· 1482 1449 mtspr SPRN_SPURR,r4 1483 1450 1484 1451 /* Save DEC */ 1452 + ld r3, HSTATE_KVM_VCORE(r13) 1485 1453 mfspr r5,SPRN_DEC 1486 1454 mftb r6 1455 + /* On P9, if the guest has large decr enabled, don't sign extend */ 1456 + BEGIN_FTR_SECTION 1457 + ld r4, VCORE_LPCR(r3) 1458 + andis. r4, r4, LPCR_LD@h 1459 + bne 16f 1460 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1487 1461 extsw r5,r5 1488 - add r5,r5,r6 1462 + 16: add r5,r5,r6 1489 1463 /* r5 is a guest timebase value here, convert to host TB */ 1490 - ld r3,HSTATE_KVM_VCORE(r13) 1491 1464 ld r4,VCORE_TB_OFFSET(r3) 1492 1465 subf r5,r4,r5 1493 1466 std r5,VCPU_DEC_EXPIRES(r9) ··· 1538 1499 rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */ 1539 1500 rotldi r6, r6, 60 1540 1501 std r6, VCPU_PSSCR(r9) 1502 + /* Restore host HFSCR value */ 1503 + ld r7, STACK_SLOT_HFSCR(r1) 1504 + mtspr SPRN_HFSCR, r7 1541 1505 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 1542 1506 /* 1543 1507 * Restore various registers to 0, where non-zero values 1544 1508 * set by the guest could disrupt the host. 
1545 1509 */ 1546 1510 li r0, 0 1547 - mtspr SPRN_IAMR, r0 1548 - mtspr SPRN_CIABR, r0 1549 - mtspr SPRN_DAWRX, r0 1511 + mtspr SPRN_PSPB, r0 1550 1512 mtspr SPRN_WORT, r0 1551 1513 BEGIN_FTR_SECTION 1514 + mtspr SPRN_IAMR, r0 1552 1515 mtspr SPRN_TCSCR, r0 1553 1516 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ 1554 1517 li r0, 1 ··· 1566 1525 std r6,VCPU_UAMOR(r9) 1567 1526 li r6,0 1568 1527 mtspr SPRN_AMR,r6 1528 + mtspr SPRN_UAMOR, r6 1569 1529 1570 1530 /* Switch DSCR back to host value */ 1571 1531 mfspr r8, SPRN_DSCR ··· 1712 1670 1713 1671 /* Restore host values of some registers */ 1714 1672 BEGIN_FTR_SECTION 1673 + ld r5, STACK_SLOT_CIABR(r1) 1674 + ld r6, STACK_SLOT_DAWR(r1) 1675 + ld r7, STACK_SLOT_DAWRX(r1) 1676 + mtspr SPRN_CIABR, r5 1677 + mtspr SPRN_DAWR, r6 1678 + mtspr SPRN_DAWRX, r7 1679 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 1680 + BEGIN_FTR_SECTION 1715 1681 ld r5, STACK_SLOT_TID(r1) 1716 1682 ld r6, STACK_SLOT_PSSCR(r1) 1717 1683 ld r7, STACK_SLOT_PID(r1) 1684 + ld r8, STACK_SLOT_IAMR(r1) 1718 1685 mtspr SPRN_TIDR, r5 1719 1686 mtspr SPRN_PSSCR, r6 1720 1687 mtspr SPRN_PID, r7 1688 + mtspr SPRN_IAMR, r8 1721 1689 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1722 1690 BEGIN_FTR_SECTION 1723 1691 PPC_INVALIDATE_ERAT ··· 1871 1819 li r0, KVM_GUEST_MODE_NONE 1872 1820 stb r0, HSTATE_IN_GUEST(r13) 1873 1821 1874 - ld r0, 112+PPC_LR_STKOFF(r1) 1875 - addi r1, r1, 112 1822 + ld r0, SFS+PPC_LR_STKOFF(r1) 1823 + addi r1, r1, SFS 1876 1824 mtlr r0 1877 1825 blr 1878 1826 ··· 2418 2366 mfspr r3, SPRN_DEC 2419 2367 mfspr r4, SPRN_HDEC 2420 2368 mftb r5 2421 - cmpw r3, r4 2369 + BEGIN_FTR_SECTION 2370 + /* On P9 check whether the guest has large decrementer mode enabled */ 2371 + ld r6, HSTATE_KVM_VCORE(r13) 2372 + ld r6, VCORE_LPCR(r6) 2373 + andis. 
r6, r6, LPCR_LD@h 2374 + bne 68f 2375 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 2376 + extsw r3, r3 2377 + 68: EXTEND_HDEC(r4) 2378 + cmpd r3, r4 2422 2379 ble 67f 2423 2380 mtspr SPRN_DEC, r4 2424 2381 67: 2425 2382 /* save expiry time of guest decrementer */ 2426 - extsw r3, r3 2427 2383 add r3, r3, r5 2428 2384 ld r4, HSTATE_KVM_VCPU(r13) 2429 2385 ld r5, HSTATE_KVM_VCORE(r13) ··· 2612 2552 ld r9, HSTATE_KVM_VCPU(r13) 2613 2553 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK 2614 2554 /* 2615 - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through 2616 - * machine check interrupt (set HSRR0 to 0x200). And for handled 2617 - * errors (no-fatal), just go back to guest execution with current 2618 - * HSRR0 instead of exiting guest. This new approach will inject 2619 - * machine check to guest for fatal error causing guest to crash. 2555 + * For the guest that is FWNMI capable, deliver all the MCE errors 2556 + * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit 2557 + * reason. This new approach injects machine check errors in guest 2558 + * address space to guest with additional information in the form 2559 + * of RTAS event, thus enabling guest kernel to suitably handle 2560 + * such errors. 2620 2561 * 2621 - * The old code used to return to host for unhandled errors which 2622 - * was causing guest to hang with soft lockups inside guest and 2623 - * makes it difficult to recover guest instance. 2624 - * 2562 + * For the guest that is not FWNMI capable (old QEMU) fallback 2563 + * to old behaviour for backward compatibility: 2564 + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either 2565 + * through machine check interrupt (set HSRR0 to 0x200). 2566 + * For handled errors (no-fatal), just go back to guest execution 2567 + * with current HSRR0. 2625 2568 * if we receive machine check with MSR(RI=0) then deliver it to 2626 2569 * guest as machine check causing guest to crash. 2627 2570 */ 2628 2571 ld r11, VCPU_MSR(r9) 2629 2572 rldicl. 
r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ 2630 2573 bne mc_cont /* if so, exit to host */ 2574 + /* Check if guest is capable of handling NMI exit */ 2575 + ld r10, VCPU_KVM(r9) 2576 + lbz r10, KVM_FWNMI(r10) 2577 + cmpdi r10, 1 /* FWNMI capable? */ 2578 + beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ 2579 + 2580 + /* if not, fall through for backward compatibility. */ 2631 2581 andi. r10, r11, MSR_RI /* check for unrecoverable exception */ 2632 2582 beq 1f /* Deliver a machine check to guest */ 2633 2583 ld r10, VCPU_PC(r9)
+2 -2
arch/powerpc/kvm/emulate.c
··· 39 39 unsigned long dec_nsec; 40 40 unsigned long long dec_time; 41 41 42 - pr_debug("mtDEC: %x\n", vcpu->arch.dec); 42 + pr_debug("mtDEC: %lx\n", vcpu->arch.dec); 43 43 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 44 44 45 45 #ifdef CONFIG_PPC_BOOK3S ··· 109 109 case SPRN_TBWU: break; 110 110 111 111 case SPRN_DEC: 112 - vcpu->arch.dec = spr_val; 112 + vcpu->arch.dec = (u32) spr_val; 113 113 kvmppc_emulate_dec(vcpu); 114 114 break; 115 115
+39 -1
arch/powerpc/kvm/powerpc.c
··· 553 553 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 554 554 case KVM_CAP_PPC_SMT: 555 555 r = 0; 556 - if (hv_enabled) { 556 + if (kvm) { 557 + if (kvm->arch.emul_smt_mode > 1) 558 + r = kvm->arch.emul_smt_mode; 559 + else 560 + r = kvm->arch.smt_mode; 561 + } else if (hv_enabled) { 557 562 if (cpu_has_feature(CPU_FTR_ARCH_300)) 558 563 r = 1; 559 564 else 560 565 r = threads_per_subcore; 566 + } 567 + break; 568 + case KVM_CAP_PPC_SMT_POSSIBLE: 569 + r = 1; 570 + if (hv_enabled) { 571 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) 572 + r = ((threads_per_subcore << 1) - 1); 573 + else 574 + /* P9 can emulate dbells, so allow any mode */ 575 + r = 8 | 4 | 2 | 1; 561 576 } 562 577 break; 563 578 case KVM_CAP_PPC_RMA: ··· 631 616 case KVM_CAP_SPAPR_RESIZE_HPT: 632 617 /* Disable this on POWER9 until code handles new HPTE format */ 633 618 r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); 619 + break; 620 + #endif 621 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 622 + case KVM_CAP_PPC_FWNMI: 623 + r = hv_enabled; 634 624 break; 635 625 #endif 636 626 case KVM_CAP_PPC_HTM: ··· 1557 1537 break; 1558 1538 } 1559 1539 #endif /* CONFIG_KVM_XICS */ 1540 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 1541 + case KVM_CAP_PPC_FWNMI: 1542 + r = -EINVAL; 1543 + if (!is_kvmppc_hv_enabled(vcpu->kvm)) 1544 + break; 1545 + r = 0; 1546 + vcpu->kvm->arch.fwnmi_enabled = true; 1547 + break; 1548 + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 1560 1549 default: 1561 1550 r = -EINVAL; 1562 1551 break; ··· 1738 1709 else 1739 1710 clear_bit(hcall / 4, kvm->arch.enabled_hcalls); 1740 1711 r = 0; 1712 + break; 1713 + } 1714 + case KVM_CAP_PPC_SMT: { 1715 + unsigned long mode = cap->args[0]; 1716 + unsigned long flags = cap->args[1]; 1717 + 1718 + r = -EINVAL; 1719 + if (kvm->arch.kvm_ops->set_smt_mode) 1720 + r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); 1741 1721 break; 1742 1722 } 1743 1723 #endif
+2
include/uapi/linux/kvm.h
··· 925 925 #define KVM_CAP_X86_GUEST_MWAIT 143 926 926 #define KVM_CAP_ARM_USER_IRQ 144 927 927 #define KVM_CAP_S390_CMMA_MIGRATION 145 928 + #define KVM_CAP_PPC_FWNMI 146 929 + #define KVM_CAP_PPC_SMT_POSSIBLE 147 928 930 929 931 #ifdef KVM_CAP_IRQ_ROUTING 930 932