Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull second batch of KVM changes from Paolo Bonzini:
"This mostly includes the PPC changes for 4.1, which this time cover
Book3S HV only (debugging aids, minor performance improvements and
some cleanups). But there are also bug fixes and small cleanups for
ARM, x86 and s390.

The task_migration_notifier revert and real fix is still pending
review, but I'll send it as soon as possible after -rc1"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (29 commits)
KVM: arm/arm64: check IRQ number on userland injection
KVM: arm: irqfd: fix value returned by kvm_irq_map_gsi
KVM: VMX: Preserve host CR4.MCE value while in guest mode.
KVM: PPC: Book3S HV: Use msgsnd for signalling threads on POWER8
KVM: PPC: Book3S HV: Translate kvmhv_commence_exit to C
KVM: PPC: Book3S HV: Streamline guest entry and exit
KVM: PPC: Book3S HV: Use bitmap of active threads rather than count
KVM: PPC: Book3S HV: Use decrementer to wake napping threads
KVM: PPC: Book3S HV: Don't wake thread with no vcpu on guest IPI
KVM: PPC: Book3S HV: Get rid of vcore nap_count and n_woken
KVM: PPC: Book3S HV: Move vcore preemption point up into kvmppc_run_vcpu
KVM: PPC: Book3S HV: Minor cleanups
KVM: PPC: Book3S HV: Simplify handling of VCPUs that need a VPA update
KVM: PPC: Book3S HV: Accumulate timing information for real-mode code
KVM: PPC: Book3S HV: Create debugfs file for each guest's HPT
KVM: PPC: Book3S HV: Add ICP real mode counters
KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode
KVM: PPC: Book3S HV: Convert ICS mutex lock to spin lock
KVM: PPC: Book3S HV: Add guest->host real mode completion counters
KVM: PPC: Book3S HV: Add helpers for lock/unlock hpte
...

+1631 -393
+17
Documentation/virtual/kvm/api.txt
··· 3573 3573 @ar - access register number 3574 3574 3575 3575 KVM handlers should exit to userspace with rc = -EREMOTE. 3576 + 3577 + 3578 + 8. Other capabilities. 3579 + ---------------------- 3580 + 3581 + This section lists capabilities that give information about other 3582 + features of the KVM implementation. 3583 + 3584 + 8.1 KVM_CAP_PPC_HWRNG 3585 + 3586 + Architectures: ppc 3587 + 3588 + This capability, if KVM_CHECK_EXTENSION indicates that it is 3589 + available, means that the kernel has an implementation of the 3590 + H_RANDOM hypercall backed by a hardware random-number generator. 3591 + If present, the kernel H_RANDOM handler can be enabled for guest use 3592 + with the KVM_CAP_PPC_ENABLE_HCALL capability.
+7 -1
arch/arm/include/uapi/asm/kvm.h
··· 195 195 #define KVM_ARM_IRQ_CPU_IRQ 0 196 196 #define KVM_ARM_IRQ_CPU_FIQ 1 197 197 198 - /* Highest supported SPI, from VGIC_NR_IRQS */ 198 + /* 199 + * This used to hold the highest supported SPI, but it is now obsolete 200 + * and only here to provide source code level compatibility with older 201 + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. 202 + */ 203 + #ifndef __KERNEL__ 199 204 #define KVM_ARM_IRQ_GIC_MAX 127 205 + #endif 200 206 201 207 /* One single KVM irqchip, ie. the VGIC */ 202 208 #define KVM_NR_IRQCHIPS 1
+1 -2
arch/arm/kvm/arm.c
··· 671 671 if (!irqchip_in_kernel(kvm)) 672 672 return -ENXIO; 673 673 674 - if (irq_num < VGIC_NR_PRIVATE_IRQS || 675 - irq_num > KVM_ARM_IRQ_GIC_MAX) 674 + if (irq_num < VGIC_NR_PRIVATE_IRQS) 676 675 return -EINVAL; 677 676 678 677 return kvm_vgic_inject_irq(kvm, 0, irq_num, level);
+7 -1
arch/arm64/include/uapi/asm/kvm.h
··· 188 188 #define KVM_ARM_IRQ_CPU_IRQ 0 189 189 #define KVM_ARM_IRQ_CPU_FIQ 1 190 190 191 - /* Highest supported SPI, from VGIC_NR_IRQS */ 191 + /* 192 + * This used to hold the highest supported SPI, but it is now obsolete 193 + * and only here to provide source code level compatibility with older 194 + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. 195 + */ 196 + #ifndef __KERNEL__ 192 197 #define KVM_ARM_IRQ_GIC_MAX 127 198 + #endif 193 199 194 200 /* One single KVM irqchip, ie. the VGIC */ 195 201 #define KVM_NR_IRQCHIPS 1
+9 -2
arch/powerpc/include/asm/archrandom.h
··· 30 30 return !!ppc_md.get_random_long; 31 31 } 32 32 33 - int powernv_get_random_long(unsigned long *v); 34 - 35 33 static inline int arch_get_random_seed_long(unsigned long *v) 36 34 { 37 35 return 0; ··· 44 46 } 45 47 46 48 #endif /* CONFIG_ARCH_RANDOM */ 49 + 50 + #ifdef CONFIG_PPC_POWERNV 51 + int powernv_hwrng_present(void); 52 + int powernv_get_random_long(unsigned long *v); 53 + int powernv_get_random_real_mode(unsigned long *v); 54 + #else 55 + static inline int powernv_hwrng_present(void) { return 0; } 56 + static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; } 57 + #endif 47 58 48 59 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
+3
arch/powerpc/include/asm/kvm_book3s.h
··· 288 288 return !is_kvmppc_hv_enabled(vcpu->kvm); 289 289 } 290 290 291 + extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu); 292 + extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); 293 + 291 294 /* Magic register values loaded into r3 and r4 before the 'sc' assembly 292 295 * instruction for the OSI hypercalls */ 293 296 #define OSI_SC_MAGIC_R3 0x113724FA
+18
arch/powerpc/include/asm/kvm_book3s_64.h
··· 85 85 return old == 0; 86 86 } 87 87 88 + static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) 89 + { 90 + hpte_v &= ~HPTE_V_HVLOCK; 91 + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 92 + hpte[0] = cpu_to_be64(hpte_v); 93 + } 94 + 95 + /* Without barrier */ 96 + static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) 97 + { 98 + hpte_v &= ~HPTE_V_HVLOCK; 99 + hpte[0] = cpu_to_be64(hpte_v); 100 + } 101 + 88 102 static inline int __hpte_actual_psize(unsigned int lp, int psize) 89 103 { 90 104 int i, shift; ··· 437 423 { 438 424 return rcu_dereference_raw_notrace(kvm->memslots); 439 425 } 426 + 427 + extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); 428 + 429 + extern void kvmhv_rm_send_ipi(int cpu); 440 430 441 431 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 442 432
+33 -14
arch/powerpc/include/asm/kvm_host.h
··· 227 227 unsigned long host_sdr1; 228 228 int tlbie_lock; 229 229 unsigned long lpcr; 230 - unsigned long rmor; 231 - struct kvm_rma_info *rma; 232 230 unsigned long vrma_slb_v; 233 - int rma_setup_done; 231 + int hpte_setup_done; 234 232 u32 hpt_order; 235 233 atomic_t vcpus_running; 236 234 u32 online_vcores; ··· 237 239 atomic_t hpte_mod_interest; 238 240 cpumask_t need_tlb_flush; 239 241 int hpt_cma_alloc; 242 + struct dentry *debugfs_dir; 243 + struct dentry *htab_dentry; 240 244 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 241 245 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 242 246 struct mutex hpt_mutex; ··· 263 263 264 264 /* 265 265 * Struct for a virtual core. 266 - * Note: entry_exit_count combines an entry count in the bottom 8 bits 267 - * and an exit count in the next 8 bits. This is so that we can 268 - * atomically increment the entry count iff the exit count is 0 269 - * without taking the lock. 266 + * Note: entry_exit_map combines a bitmap of threads that have entered 267 + * in the bottom 8 bits and a bitmap of threads that have exited in the 268 + * next 8 bits. This is so that we can atomically set the entry bit 269 + * iff the exit map is 0 without taking a lock. 
270 270 */ 271 271 struct kvmppc_vcore { 272 272 int n_runnable; 273 - int n_busy; 274 273 int num_threads; 275 - int entry_exit_count; 276 - int n_woken; 277 - int nap_count; 274 + int entry_exit_map; 278 275 int napping_threads; 279 276 int first_vcpuid; 280 277 u16 pcpu; ··· 296 299 ulong conferring_threads; 297 300 }; 298 301 299 - #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) 300 - #define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) 302 + #define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff) 303 + #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) 304 + #define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) 301 305 302 306 /* Values for vcore_state */ 303 307 #define VCORE_INACTIVE 0 304 308 #define VCORE_SLEEPING 1 305 - #define VCORE_STARTING 2 309 + #define VCORE_PREEMPT 2 306 310 #define VCORE_RUNNING 3 307 311 #define VCORE_EXITING 4 308 312 ··· 364 366 bool tb : 1; /* 1TB segment */ 365 367 bool class : 1; 366 368 u8 base_page_size; /* MMU_PAGE_xxx */ 369 + }; 370 + 371 + /* Struct used to accumulate timing information in HV real mode code */ 372 + struct kvmhv_tb_accumulator { 373 + u64 seqcount; /* used to synchronize access, also count * 2 */ 374 + u64 tb_total; /* total time in timebase ticks */ 375 + u64 tb_min; /* min time */ 376 + u64 tb_max; /* max time */ 367 377 }; 368 378 369 379 # ifdef CONFIG_PPC_FSL_BOOK3E ··· 662 656 663 657 u32 emul_inst; 664 658 #endif 659 + 660 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 661 + struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */ 662 + u64 cur_tb_start; /* when it started */ 663 + struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ 664 + struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ 665 + struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ 666 + struct kvmhv_tb_accumulator guest_time; /* guest execution */ 667 + struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ 668 + 669 + struct dentry 
*debugfs_dir; 670 + struct dentry *debugfs_timings; 671 + #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 665 672 }; 666 673 667 674 #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
+2
arch/powerpc/include/asm/kvm_ppc.h
··· 302 302 return kvm->arch.kvm_ops == kvmppc_hv_ops; 303 303 } 304 304 305 + extern int kvmppc_hwrng_present(void); 306 + 305 307 /* 306 308 * Cuts out inst bits with ordering according to spec. 307 309 * That means the leftmost bit is zero. All given bits are included.
+3
arch/powerpc/include/asm/time.h
··· 211 211 212 212 DECLARE_PER_CPU(u64, decrementers_next_tb); 213 213 214 + /* Convert timebase ticks to nanoseconds */ 215 + unsigned long long tb_to_ns(unsigned long long tb_ticks); 216 + 214 217 #endif /* __KERNEL__ */ 215 218 #endif /* __POWERPC_TIME_H */
+17 -3
arch/powerpc/kernel/asm-offsets.c
··· 37 37 #include <asm/thread_info.h> 38 38 #include <asm/rtas.h> 39 39 #include <asm/vdso_datapage.h> 40 + #include <asm/dbell.h> 40 41 #ifdef CONFIG_PPC64 41 42 #include <asm/paca.h> 42 43 #include <asm/lppaca.h> ··· 460 459 DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); 461 460 DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); 462 461 #endif 462 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 463 + DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry)); 464 + DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr)); 465 + DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit)); 466 + DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time)); 467 + DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time)); 468 + DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity)); 469 + DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start)); 470 + DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount)); 471 + DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total)); 472 + DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min)); 473 + DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max)); 474 + #endif 463 475 DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); 464 476 DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); 465 477 DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); ··· 506 492 DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); 507 493 DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); 508 494 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 509 - DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 510 495 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); 511 496 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); 512 497 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); ··· 563 
550 DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); 564 551 DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); 565 552 DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); 566 - DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); 567 - DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); 553 + DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map)); 568 554 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); 569 555 DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); 570 556 DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); ··· 759 747 DEFINE(PACA_SUBCORE_SIBLING_MASK, 760 748 offsetof(struct paca_struct, subcore_sibling_mask)); 761 749 #endif 750 + 751 + DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); 762 752 763 753 return 0; 764 754 }
+6
arch/powerpc/kernel/time.c
··· 608 608 } 609 609 #endif 610 610 611 + unsigned long long tb_to_ns(unsigned long long ticks) 612 + { 613 + return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift; 614 + } 615 + EXPORT_SYMBOL_GPL(tb_to_ns); 616 + 611 617 /* 612 618 * Scheduler clock - returns current time in nanosec units. 613 619 *
+14
arch/powerpc/kvm/Kconfig
··· 110 110 processor, including emulating 32-bit processors on a 64-bit 111 111 host. 112 112 113 + config KVM_BOOK3S_HV_EXIT_TIMING 114 + bool "Detailed timing for hypervisor real-mode code" 115 + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS 116 + ---help--- 117 + Calculate time taken for each vcpu in the real-mode guest entry, 118 + exit, and interrupt handling code, plus time spent in the guest 119 + and in nap mode due to idle (cede) while other threads are still 120 + in the guest. The total, minimum and maximum times in nanoseconds 121 + together with the number of executions are reported in debugfs in 122 + kvm/vm#/vcpu#/timings. The overhead is of the order of 30 - 40 123 + ns per exit on POWER8. 124 + 125 + If unsure, say N. 126 + 113 127 config KVM_BOOKE_HV 114 128 bool 115 129
+76
arch/powerpc/kvm/book3s.c
··· 821 821 #endif 822 822 } 823 823 824 + int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) 825 + { 826 + unsigned long size = kvmppc_get_gpr(vcpu, 4); 827 + unsigned long addr = kvmppc_get_gpr(vcpu, 5); 828 + u64 buf; 829 + int ret; 830 + 831 + if (!is_power_of_2(size) || (size > sizeof(buf))) 832 + return H_TOO_HARD; 833 + 834 + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf); 835 + if (ret != 0) 836 + return H_TOO_HARD; 837 + 838 + switch (size) { 839 + case 1: 840 + kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf); 841 + break; 842 + 843 + case 2: 844 + kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf)); 845 + break; 846 + 847 + case 4: 848 + kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf)); 849 + break; 850 + 851 + case 8: 852 + kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf)); 853 + break; 854 + 855 + default: 856 + BUG(); 857 + } 858 + 859 + return H_SUCCESS; 860 + } 861 + EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load); 862 + 863 + int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) 864 + { 865 + unsigned long size = kvmppc_get_gpr(vcpu, 4); 866 + unsigned long addr = kvmppc_get_gpr(vcpu, 5); 867 + unsigned long val = kvmppc_get_gpr(vcpu, 6); 868 + u64 buf; 869 + int ret; 870 + 871 + switch (size) { 872 + case 1: 873 + *(u8 *)&buf = val; 874 + break; 875 + 876 + case 2: 877 + *(__be16 *)&buf = cpu_to_be16(val); 878 + break; 879 + 880 + case 4: 881 + *(__be32 *)&buf = cpu_to_be32(val); 882 + break; 883 + 884 + case 8: 885 + *(__be64 *)&buf = cpu_to_be64(val); 886 + break; 887 + 888 + default: 889 + return H_TOO_HARD; 890 + } 891 + 892 + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf); 893 + if (ret != 0) 894 + return H_TOO_HARD; 895 + 896 + return H_SUCCESS; 897 + } 898 + EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); 899 + 824 900 int kvmppc_core_check_processor_compat(void) 825 901 { 826 902 /*
+160 -29
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 27 27 #include <linux/srcu.h> 28 28 #include <linux/anon_inodes.h> 29 29 #include <linux/file.h> 30 + #include <linux/debugfs.h> 30 31 31 32 #include <asm/tlbflush.h> 32 33 #include <asm/kvm_ppc.h> ··· 117 116 long order; 118 117 119 118 mutex_lock(&kvm->lock); 120 - if (kvm->arch.rma_setup_done) { 121 - kvm->arch.rma_setup_done = 0; 122 - /* order rma_setup_done vs. vcpus_running */ 119 + if (kvm->arch.hpte_setup_done) { 120 + kvm->arch.hpte_setup_done = 0; 121 + /* order hpte_setup_done vs. vcpus_running */ 123 122 smp_mb(); 124 123 if (atomic_read(&kvm->arch.vcpus_running)) { 125 - kvm->arch.rma_setup_done = 1; 124 + kvm->arch.hpte_setup_done = 1; 126 125 goto out; 127 126 } 128 127 } ··· 339 338 v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 340 339 gr = kvm->arch.revmap[index].guest_rpte; 341 340 342 - /* Unlock the HPTE */ 343 - asm volatile("lwsync" : : : "memory"); 344 - hptep[0] = cpu_to_be64(v); 341 + unlock_hpte(hptep, v); 345 342 preempt_enable(); 346 343 347 344 gpte->eaddr = eaddr; ··· 468 469 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 469 470 hpte[1] = be64_to_cpu(hptep[1]); 470 471 hpte[2] = r = rev->guest_rpte; 471 - asm volatile("lwsync" : : : "memory"); 472 - hptep[0] = cpu_to_be64(hpte[0]); 472 + unlock_hpte(hptep, hpte[0]); 473 473 preempt_enable(); 474 474 475 475 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || ··· 619 621 620 622 hptep[1] = cpu_to_be64(r); 621 623 eieio(); 622 - hptep[0] = cpu_to_be64(hpte[0]); 624 + __unlock_hpte(hptep, hpte[0]); 623 625 asm volatile("ptesync" : : : "memory"); 624 626 preempt_enable(); 625 627 if (page && hpte_is_writable(r)) ··· 640 642 return ret; 641 643 642 644 out_unlock: 643 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 645 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 644 646 preempt_enable(); 645 647 goto out_put; 646 648 } ··· 769 771 } 770 772 } 771 773 unlock_rmap(rmapp); 772 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 774 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 773 775 } 774 776 return 
0; 775 777 } ··· 855 857 } 856 858 ret = 1; 857 859 } 858 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 860 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 859 861 } while ((i = j) != head); 860 862 861 863 unlock_rmap(rmapp); ··· 972 974 973 975 /* Now check and modify the HPTE */ 974 976 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 975 - /* unlock and continue */ 976 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 977 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 977 978 continue; 978 979 } 979 980 ··· 993 996 npages_dirty = n; 994 997 eieio(); 995 998 } 996 - v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); 999 + v &= ~HPTE_V_ABSENT; 997 1000 v |= HPTE_V_VALID; 998 - hptep[0] = cpu_to_be64(v); 1001 + __unlock_hpte(hptep, v); 999 1002 } while ((i = j) != head); 1000 1003 1001 1004 unlock_rmap(rmapp); ··· 1215 1218 r &= ~HPTE_GR_MODIFIED; 1216 1219 revp->guest_rpte = r; 1217 1220 } 1218 - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 1219 - hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 1221 + unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1220 1222 preempt_enable(); 1221 1223 if (!(valid == want_valid && (first_pass || dirty))) 1222 1224 ok = 0; ··· 1335 1339 unsigned long tmp[2]; 1336 1340 ssize_t nb; 1337 1341 long int err, ret; 1338 - int rma_setup; 1342 + int hpte_setup; 1339 1343 1340 1344 if (!access_ok(VERIFY_READ, buf, count)) 1341 1345 return -EFAULT; 1342 1346 1343 1347 /* lock out vcpus from running while we're doing this */ 1344 1348 mutex_lock(&kvm->lock); 1345 - rma_setup = kvm->arch.rma_setup_done; 1346 - if (rma_setup) { 1347 - kvm->arch.rma_setup_done = 0; /* temporarily */ 1348 - /* order rma_setup_done vs. vcpus_running */ 1349 + hpte_setup = kvm->arch.hpte_setup_done; 1350 + if (hpte_setup) { 1351 + kvm->arch.hpte_setup_done = 0; /* temporarily */ 1352 + /* order hpte_setup_done vs. 
vcpus_running */ 1349 1353 smp_mb(); 1350 1354 if (atomic_read(&kvm->arch.vcpus_running)) { 1351 - kvm->arch.rma_setup_done = 1; 1355 + kvm->arch.hpte_setup_done = 1; 1352 1356 mutex_unlock(&kvm->lock); 1353 1357 return -EBUSY; 1354 1358 } ··· 1401 1405 "r=%lx\n", ret, i, v, r); 1402 1406 goto out; 1403 1407 } 1404 - if (!rma_setup && is_vrma_hpte(v)) { 1408 + if (!hpte_setup && is_vrma_hpte(v)) { 1405 1409 unsigned long psize = hpte_base_page_size(v, r); 1406 1410 unsigned long senc = slb_pgsize_encoding(psize); 1407 1411 unsigned long lpcr; ··· 1410 1414 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1411 1415 lpcr = senc << (LPCR_VRMASD_SH - 4); 1412 1416 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 1413 - rma_setup = 1; 1417 + hpte_setup = 1; 1414 1418 } 1415 1419 ++i; 1416 1420 hptp += 2; ··· 1426 1430 } 1427 1431 1428 1432 out: 1429 - /* Order HPTE updates vs. rma_setup_done */ 1433 + /* Order HPTE updates vs. hpte_setup_done */ 1430 1434 smp_wmb(); 1431 - kvm->arch.rma_setup_done = rma_setup; 1435 + kvm->arch.hpte_setup_done = hpte_setup; 1432 1436 mutex_unlock(&kvm->lock); 1433 1437 1434 1438 if (err) ··· 1489 1493 } 1490 1494 1491 1495 return ret; 1496 + } 1497 + 1498 + struct debugfs_htab_state { 1499 + struct kvm *kvm; 1500 + struct mutex mutex; 1501 + unsigned long hpt_index; 1502 + int chars_left; 1503 + int buf_index; 1504 + char buf[64]; 1505 + }; 1506 + 1507 + static int debugfs_htab_open(struct inode *inode, struct file *file) 1508 + { 1509 + struct kvm *kvm = inode->i_private; 1510 + struct debugfs_htab_state *p; 1511 + 1512 + p = kzalloc(sizeof(*p), GFP_KERNEL); 1513 + if (!p) 1514 + return -ENOMEM; 1515 + 1516 + kvm_get_kvm(kvm); 1517 + p->kvm = kvm; 1518 + mutex_init(&p->mutex); 1519 + file->private_data = p; 1520 + 1521 + return nonseekable_open(inode, file); 1522 + } 1523 + 1524 + static int debugfs_htab_release(struct inode *inode, struct file *file) 1525 + { 1526 + struct debugfs_htab_state *p = file->private_data; 1527 + 1528 + kvm_put_kvm(p->kvm); 1529 
+ kfree(p); 1530 + return 0; 1531 + } 1532 + 1533 + static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 1534 + size_t len, loff_t *ppos) 1535 + { 1536 + struct debugfs_htab_state *p = file->private_data; 1537 + ssize_t ret, r; 1538 + unsigned long i, n; 1539 + unsigned long v, hr, gr; 1540 + struct kvm *kvm; 1541 + __be64 *hptp; 1542 + 1543 + ret = mutex_lock_interruptible(&p->mutex); 1544 + if (ret) 1545 + return ret; 1546 + 1547 + if (p->chars_left) { 1548 + n = p->chars_left; 1549 + if (n > len) 1550 + n = len; 1551 + r = copy_to_user(buf, p->buf + p->buf_index, n); 1552 + n -= r; 1553 + p->chars_left -= n; 1554 + p->buf_index += n; 1555 + buf += n; 1556 + len -= n; 1557 + ret = n; 1558 + if (r) { 1559 + if (!n) 1560 + ret = -EFAULT; 1561 + goto out; 1562 + } 1563 + } 1564 + 1565 + kvm = p->kvm; 1566 + i = p->hpt_index; 1567 + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 1568 + for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { 1569 + if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 1570 + continue; 1571 + 1572 + /* lock the HPTE so it's stable and read it */ 1573 + preempt_disable(); 1574 + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1575 + cpu_relax(); 1576 + v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 1577 + hr = be64_to_cpu(hptp[1]); 1578 + gr = kvm->arch.revmap[i].guest_rpte; 1579 + unlock_hpte(hptp, v); 1580 + preempt_enable(); 1581 + 1582 + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 1583 + continue; 1584 + 1585 + n = scnprintf(p->buf, sizeof(p->buf), 1586 + "%6lx %.16lx %.16lx %.16lx\n", 1587 + i, v, hr, gr); 1588 + p->chars_left = n; 1589 + if (n > len) 1590 + n = len; 1591 + r = copy_to_user(buf, p->buf, n); 1592 + n -= r; 1593 + p->chars_left -= n; 1594 + p->buf_index = n; 1595 + buf += n; 1596 + len -= n; 1597 + ret += n; 1598 + if (r) { 1599 + if (!ret) 1600 + ret = -EFAULT; 1601 + goto out; 1602 + } 1603 + } 1604 + p->hpt_index = i; 1605 + 1606 + out: 1607 + mutex_unlock(&p->mutex); 1608 + return 
ret; 1609 + } 1610 + 1611 + ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 1612 + size_t len, loff_t *ppos) 1613 + { 1614 + return -EACCES; 1615 + } 1616 + 1617 + static const struct file_operations debugfs_htab_fops = { 1618 + .owner = THIS_MODULE, 1619 + .open = debugfs_htab_open, 1620 + .release = debugfs_htab_release, 1621 + .read = debugfs_htab_read, 1622 + .write = debugfs_htab_write, 1623 + .llseek = generic_file_llseek, 1624 + }; 1625 + 1626 + void kvmppc_mmu_debugfs_init(struct kvm *kvm) 1627 + { 1628 + kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, 1629 + kvm->arch.debugfs_dir, kvm, 1630 + &debugfs_htab_fops); 1492 1631 } 1493 1632 1494 1633 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+325 -112
arch/powerpc/kvm/book3s_hv.c
··· 32 32 #include <linux/page-flags.h> 33 33 #include <linux/srcu.h> 34 34 #include <linux/miscdevice.h> 35 + #include <linux/debugfs.h> 35 36 36 37 #include <asm/reg.h> 37 38 #include <asm/cputable.h> ··· 51 50 #include <asm/hvcall.h> 52 51 #include <asm/switch_to.h> 53 52 #include <asm/smp.h> 53 + #include <asm/dbell.h> 54 54 #include <linux/gfp.h> 55 55 #include <linux/vmalloc.h> 56 56 #include <linux/highmem.h> ··· 85 83 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 86 84 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 87 85 86 + static bool kvmppc_ipi_thread(int cpu) 87 + { 88 + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ 89 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 90 + preempt_disable(); 91 + if (cpu_first_thread_sibling(cpu) == 92 + cpu_first_thread_sibling(smp_processor_id())) { 93 + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 94 + msg |= cpu_thread_in_core(cpu); 95 + smp_mb(); 96 + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 97 + preempt_enable(); 98 + return true; 99 + } 100 + preempt_enable(); 101 + } 102 + 103 + #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 104 + if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) { 105 + xics_wake_cpu(cpu); 106 + return true; 107 + } 108 + #endif 109 + 110 + return false; 111 + } 112 + 88 113 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) 89 114 { 90 - int me; 91 115 int cpu = vcpu->cpu; 92 116 wait_queue_head_t *wqp; 93 117 ··· 123 95 ++vcpu->stat.halt_wakeup; 124 96 } 125 97 126 - me = get_cpu(); 98 + if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid)) 99 + return; 127 100 128 101 /* CPU points to the first thread of the core */ 129 - if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { 130 - #ifdef CONFIG_PPC_ICP_NATIVE 131 - int real_cpu = cpu + vcpu->arch.ptid; 132 - if (paca[real_cpu].kvm_hstate.xics_phys) 133 - xics_wake_cpu(real_cpu); 134 - else 135 - #endif 136 - if (cpu_online(cpu)) 137 - smp_send_reschedule(cpu); 138 
- } 139 - put_cpu(); 102 + if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) 103 + smp_send_reschedule(cpu); 140 104 } 141 105 142 106 /* ··· 726 706 727 707 /* Send the error out to userspace via KVM_RUN */ 728 708 return rc; 709 + case H_LOGICAL_CI_LOAD: 710 + ret = kvmppc_h_logical_ci_load(vcpu); 711 + if (ret == H_TOO_HARD) 712 + return RESUME_HOST; 713 + break; 714 + case H_LOGICAL_CI_STORE: 715 + ret = kvmppc_h_logical_ci_store(vcpu); 716 + if (ret == H_TOO_HARD) 717 + return RESUME_HOST; 718 + break; 729 719 case H_SET_MODE: 730 720 ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), 731 721 kvmppc_get_gpr(vcpu, 5), ··· 770 740 case H_CONFER: 771 741 case H_REGISTER_VPA: 772 742 case H_SET_MODE: 743 + case H_LOGICAL_CI_LOAD: 744 + case H_LOGICAL_CI_STORE: 773 745 #ifdef CONFIG_KVM_XICS 774 746 case H_XIRR: 775 747 case H_CPPR: ··· 1442 1410 return vcore; 1443 1411 } 1444 1412 1413 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1414 + static struct debugfs_timings_element { 1415 + const char *name; 1416 + size_t offset; 1417 + } timings[] = { 1418 + {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, 1419 + {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, 1420 + {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, 1421 + {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, 1422 + {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, 1423 + }; 1424 + 1425 + #define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) 1426 + 1427 + struct debugfs_timings_state { 1428 + struct kvm_vcpu *vcpu; 1429 + unsigned int buflen; 1430 + char buf[N_TIMINGS * 100]; 1431 + }; 1432 + 1433 + static int debugfs_timings_open(struct inode *inode, struct file *file) 1434 + { 1435 + struct kvm_vcpu *vcpu = inode->i_private; 1436 + struct debugfs_timings_state *p; 1437 + 1438 + p = kzalloc(sizeof(*p), GFP_KERNEL); 1439 + if (!p) 1440 + return -ENOMEM; 1441 + 1442 + kvm_get_kvm(vcpu->kvm); 1443 + p->vcpu = vcpu; 1444 + file->private_data = p; 1445 + 1446 + return 
nonseekable_open(inode, file); 1447 + } 1448 + 1449 + static int debugfs_timings_release(struct inode *inode, struct file *file) 1450 + { 1451 + struct debugfs_timings_state *p = file->private_data; 1452 + 1453 + kvm_put_kvm(p->vcpu->kvm); 1454 + kfree(p); 1455 + return 0; 1456 + } 1457 + 1458 + static ssize_t debugfs_timings_read(struct file *file, char __user *buf, 1459 + size_t len, loff_t *ppos) 1460 + { 1461 + struct debugfs_timings_state *p = file->private_data; 1462 + struct kvm_vcpu *vcpu = p->vcpu; 1463 + char *s, *buf_end; 1464 + struct kvmhv_tb_accumulator tb; 1465 + u64 count; 1466 + loff_t pos; 1467 + ssize_t n; 1468 + int i, loops; 1469 + bool ok; 1470 + 1471 + if (!p->buflen) { 1472 + s = p->buf; 1473 + buf_end = s + sizeof(p->buf); 1474 + for (i = 0; i < N_TIMINGS; ++i) { 1475 + struct kvmhv_tb_accumulator *acc; 1476 + 1477 + acc = (struct kvmhv_tb_accumulator *) 1478 + ((unsigned long)vcpu + timings[i].offset); 1479 + ok = false; 1480 + for (loops = 0; loops < 1000; ++loops) { 1481 + count = acc->seqcount; 1482 + if (!(count & 1)) { 1483 + smp_rmb(); 1484 + tb = *acc; 1485 + smp_rmb(); 1486 + if (count == acc->seqcount) { 1487 + ok = true; 1488 + break; 1489 + } 1490 + } 1491 + udelay(1); 1492 + } 1493 + if (!ok) 1494 + snprintf(s, buf_end - s, "%s: stuck\n", 1495 + timings[i].name); 1496 + else 1497 + snprintf(s, buf_end - s, 1498 + "%s: %llu %llu %llu %llu\n", 1499 + timings[i].name, count / 2, 1500 + tb_to_ns(tb.tb_total), 1501 + tb_to_ns(tb.tb_min), 1502 + tb_to_ns(tb.tb_max)); 1503 + s += strlen(s); 1504 + } 1505 + p->buflen = s - p->buf; 1506 + } 1507 + 1508 + pos = *ppos; 1509 + if (pos >= p->buflen) 1510 + return 0; 1511 + if (len > p->buflen - pos) 1512 + len = p->buflen - pos; 1513 + n = copy_to_user(buf, p->buf + pos, len); 1514 + if (n) { 1515 + if (n == len) 1516 + return -EFAULT; 1517 + len -= n; 1518 + } 1519 + *ppos = pos + len; 1520 + return len; 1521 + } 1522 + 1523 + static ssize_t debugfs_timings_write(struct file *file, const 
char __user *buf, 1524 + size_t len, loff_t *ppos) 1525 + { 1526 + return -EACCES; 1527 + } 1528 + 1529 + static const struct file_operations debugfs_timings_ops = { 1530 + .owner = THIS_MODULE, 1531 + .open = debugfs_timings_open, 1532 + .release = debugfs_timings_release, 1533 + .read = debugfs_timings_read, 1534 + .write = debugfs_timings_write, 1535 + .llseek = generic_file_llseek, 1536 + }; 1537 + 1538 + /* Create a debugfs directory for the vcpu */ 1539 + static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1540 + { 1541 + char buf[16]; 1542 + struct kvm *kvm = vcpu->kvm; 1543 + 1544 + snprintf(buf, sizeof(buf), "vcpu%u", id); 1545 + if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 1546 + return; 1547 + vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); 1548 + if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) 1549 + return; 1550 + vcpu->arch.debugfs_timings = 1551 + debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, 1552 + vcpu, &debugfs_timings_ops); 1553 + } 1554 + 1555 + #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1556 + static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1557 + { 1558 + } 1559 + #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1560 + 1445 1561 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, 1446 1562 unsigned int id) 1447 1563 { ··· 1658 1478 1659 1479 vcpu->arch.cpu_type = KVM_CPU_3S_64; 1660 1480 kvmppc_sanity_check(vcpu); 1481 + 1482 + debugfs_vcpu_init(vcpu, id); 1661 1483 1662 1484 return vcpu; 1663 1485 ··· 1748 1566 tpaca = &paca[cpu]; 1749 1567 1750 1568 /* Ensure the thread won't go into the kernel if it wakes */ 1751 - tpaca->kvm_hstate.hwthread_req = 1; 1752 1569 tpaca->kvm_hstate.kvm_vcpu = NULL; 1570 + tpaca->kvm_hstate.napping = 0; 1571 + smp_wmb(); 1572 + tpaca->kvm_hstate.hwthread_req = 1; 1753 1573 1754 1574 /* 1755 1575 * If the thread is already executing in the kernel (e.g. 
handling ··· 1794 1610 } 1795 1611 cpu = vc->pcpu + vcpu->arch.ptid; 1796 1612 tpaca = &paca[cpu]; 1797 - tpaca->kvm_hstate.kvm_vcpu = vcpu; 1798 1613 tpaca->kvm_hstate.kvm_vcore = vc; 1799 1614 tpaca->kvm_hstate.ptid = vcpu->arch.ptid; 1800 1615 vcpu->cpu = vc->pcpu; 1616 + /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ 1801 1617 smp_wmb(); 1802 - #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 1803 - if (cpu != smp_processor_id()) { 1804 - xics_wake_cpu(cpu); 1805 - if (vcpu->arch.ptid) 1806 - ++vc->n_woken; 1807 - } 1808 - #endif 1618 + tpaca->kvm_hstate.kvm_vcpu = vcpu; 1619 + if (cpu != smp_processor_id()) 1620 + kvmppc_ipi_thread(cpu); 1809 1621 } 1810 1622 1811 - static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) 1623 + static void kvmppc_wait_for_nap(void) 1812 1624 { 1813 - int i; 1625 + int cpu = smp_processor_id(); 1626 + int i, loops; 1814 1627 1815 - HMT_low(); 1816 - i = 0; 1817 - while (vc->nap_count < vc->n_woken) { 1818 - if (++i >= 1000000) { 1819 - pr_err("kvmppc_wait_for_nap timeout %d %d\n", 1820 - vc->nap_count, vc->n_woken); 1821 - break; 1628 + for (loops = 0; loops < 1000000; ++loops) { 1629 + /* 1630 + * Check if all threads are finished. 1631 + * We set the vcpu pointer when starting a thread 1632 + * and the thread clears it when finished, so we look 1633 + * for any threads that still have a non-NULL vcpu ptr. 
1634 + */ 1635 + for (i = 1; i < threads_per_subcore; ++i) 1636 + if (paca[cpu + i].kvm_hstate.kvm_vcpu) 1637 + break; 1638 + if (i == threads_per_subcore) { 1639 + HMT_medium(); 1640 + return; 1822 1641 } 1823 - cpu_relax(); 1642 + HMT_low(); 1824 1643 } 1825 1644 HMT_medium(); 1645 + for (i = 1; i < threads_per_subcore; ++i) 1646 + if (paca[cpu + i].kvm_hstate.kvm_vcpu) 1647 + pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); 1826 1648 } 1827 1649 1828 1650 /* ··· 1890 1700 mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE); 1891 1701 } 1892 1702 1703 + static void prepare_threads(struct kvmppc_vcore *vc) 1704 + { 1705 + struct kvm_vcpu *vcpu, *vnext; 1706 + 1707 + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1708 + arch.run_list) { 1709 + if (signal_pending(vcpu->arch.run_task)) 1710 + vcpu->arch.ret = -EINTR; 1711 + else if (vcpu->arch.vpa.update_pending || 1712 + vcpu->arch.slb_shadow.update_pending || 1713 + vcpu->arch.dtl.update_pending) 1714 + vcpu->arch.ret = RESUME_GUEST; 1715 + else 1716 + continue; 1717 + kvmppc_remove_runnable(vc, vcpu); 1718 + wake_up(&vcpu->arch.cpu_run); 1719 + } 1720 + } 1721 + 1722 + static void post_guest_process(struct kvmppc_vcore *vc) 1723 + { 1724 + u64 now; 1725 + long ret; 1726 + struct kvm_vcpu *vcpu, *vnext; 1727 + 1728 + now = get_tb(); 1729 + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1730 + arch.run_list) { 1731 + /* cancel pending dec exception if dec is positive */ 1732 + if (now < vcpu->arch.dec_expires && 1733 + kvmppc_core_pending_dec(vcpu)) 1734 + kvmppc_core_dequeue_dec(vcpu); 1735 + 1736 + trace_kvm_guest_exit(vcpu); 1737 + 1738 + ret = RESUME_GUEST; 1739 + if (vcpu->arch.trap) 1740 + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, 1741 + vcpu->arch.run_task); 1742 + 1743 + vcpu->arch.ret = ret; 1744 + vcpu->arch.trap = 0; 1745 + 1746 + if (vcpu->arch.ceded) { 1747 + if (!is_kvmppc_resume_guest(ret)) 1748 + kvmppc_end_cede(vcpu); 1749 + else 1750 + 
kvmppc_set_timer(vcpu); 1751 + } 1752 + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { 1753 + kvmppc_remove_runnable(vc, vcpu); 1754 + wake_up(&vcpu->arch.cpu_run); 1755 + } 1756 + } 1757 + } 1758 + 1893 1759 /* 1894 1760 * Run a set of guest threads on a physical core. 1895 1761 * Called with vc->lock held. 1896 1762 */ 1897 - static void kvmppc_run_core(struct kvmppc_vcore *vc) 1763 + static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) 1898 1764 { 1899 - struct kvm_vcpu *vcpu, *vnext; 1900 - long ret; 1901 - u64 now; 1902 - int i, need_vpa_update; 1765 + struct kvm_vcpu *vcpu; 1766 + int i; 1903 1767 int srcu_idx; 1904 - struct kvm_vcpu *vcpus_to_update[threads_per_core]; 1905 - 1906 - /* don't start if any threads have a signal pending */ 1907 - need_vpa_update = 0; 1908 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1909 - if (signal_pending(vcpu->arch.run_task)) 1910 - return; 1911 - if (vcpu->arch.vpa.update_pending || 1912 - vcpu->arch.slb_shadow.update_pending || 1913 - vcpu->arch.dtl.update_pending) 1914 - vcpus_to_update[need_vpa_update++] = vcpu; 1915 - } 1916 1768 1917 1769 /* 1918 - * Initialize *vc, in particular vc->vcore_state, so we can 1919 - * drop the vcore lock if necessary. 1770 + * Remove from the list any threads that have a signal pending 1771 + * or need a VPA update done 1920 1772 */ 1921 - vc->n_woken = 0; 1922 - vc->nap_count = 0; 1923 - vc->entry_exit_count = 0; 1773 + prepare_threads(vc); 1774 + 1775 + /* if the runner is no longer runnable, let the caller pick a new one */ 1776 + if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) 1777 + return; 1778 + 1779 + /* 1780 + * Initialize *vc. 
1781 + */ 1782 + vc->entry_exit_map = 0; 1924 1783 vc->preempt_tb = TB_NIL; 1925 - vc->vcore_state = VCORE_STARTING; 1926 1784 vc->in_guest = 0; 1927 1785 vc->napping_threads = 0; 1928 1786 vc->conferring_threads = 0; 1929 - 1930 - /* 1931 - * Updating any of the vpas requires calling kvmppc_pin_guest_page, 1932 - * which can't be called with any spinlocks held. 1933 - */ 1934 - if (need_vpa_update) { 1935 - spin_unlock(&vc->lock); 1936 - for (i = 0; i < need_vpa_update; ++i) 1937 - kvmppc_update_vpas(vcpus_to_update[i]); 1938 - spin_lock(&vc->lock); 1939 - } 1940 1787 1941 1788 /* 1942 1789 * Make sure we are running on primary threads, and that secondary ··· 1982 1755 */ 1983 1756 if ((threads_per_core > 1) && 1984 1757 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { 1985 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1758 + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1986 1759 vcpu->arch.ret = -EBUSY; 1760 + kvmppc_remove_runnable(vc, vcpu); 1761 + wake_up(&vcpu->arch.cpu_run); 1762 + } 1987 1763 goto out; 1988 1764 } 1989 1765 ··· 2027 1797 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 2028 1798 vcpu->cpu = -1; 2029 1799 /* wait for secondary threads to finish writing their state to memory */ 2030 - if (vc->nap_count < vc->n_woken) 2031 - kvmppc_wait_for_nap(vc); 1800 + kvmppc_wait_for_nap(); 2032 1801 for (i = 0; i < threads_per_subcore; ++i) 2033 1802 kvmppc_release_hwthread(vc->pcpu + i); 2034 1803 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ ··· 2041 1812 kvm_guest_exit(); 2042 1813 2043 1814 preempt_enable(); 2044 - cond_resched(); 2045 1815 2046 1816 spin_lock(&vc->lock); 2047 - now = get_tb(); 2048 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 2049 - /* cancel pending dec exception if dec is positive */ 2050 - if (now < vcpu->arch.dec_expires && 2051 - kvmppc_core_pending_dec(vcpu)) 2052 - kvmppc_core_dequeue_dec(vcpu); 2053 - 
2054 - trace_kvm_guest_exit(vcpu); 2055 - 2056 - ret = RESUME_GUEST; 2057 - if (vcpu->arch.trap) 2058 - ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, 2059 - vcpu->arch.run_task); 2060 - 2061 - vcpu->arch.ret = ret; 2062 - vcpu->arch.trap = 0; 2063 - 2064 - if (vcpu->arch.ceded) { 2065 - if (!is_kvmppc_resume_guest(ret)) 2066 - kvmppc_end_cede(vcpu); 2067 - else 2068 - kvmppc_set_timer(vcpu); 2069 - } 2070 - } 1817 + post_guest_process(vc); 2071 1818 2072 1819 out: 2073 1820 vc->vcore_state = VCORE_INACTIVE; 2074 - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 2075 - arch.run_list) { 2076 - if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { 2077 - kvmppc_remove_runnable(vc, vcpu); 2078 - wake_up(&vcpu->arch.cpu_run); 2079 - } 2080 - } 2081 - 2082 1821 trace_kvmppc_run_core(vc, 1); 2083 1822 } 2084 1823 ··· 2136 1939 * this thread straight away and have it join in. 2137 1940 */ 2138 1941 if (!signal_pending(current)) { 2139 - if (vc->vcore_state == VCORE_RUNNING && 2140 - VCORE_EXIT_COUNT(vc) == 0) { 1942 + if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { 2141 1943 kvmppc_create_dtl_entry(vcpu, vc); 2142 1944 kvmppc_start_thread(vcpu); 2143 1945 trace_kvm_guest_enter(vcpu); ··· 2167 1971 } 2168 1972 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 2169 1973 break; 2170 - vc->runner = vcpu; 2171 1974 n_ceded = 0; 2172 1975 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { 2173 1976 if (!v->arch.pending_exceptions) ··· 2174 1979 else 2175 1980 v->arch.ceded = 0; 2176 1981 } 2177 - if (n_ceded == vc->n_runnable) 1982 + vc->runner = vcpu; 1983 + if (n_ceded == vc->n_runnable) { 2178 1984 kvmppc_vcore_blocked(vc); 2179 - else 1985 + } else if (should_resched()) { 1986 + vc->vcore_state = VCORE_PREEMPT; 1987 + /* Let something else run */ 1988 + cond_resched_lock(&vc->lock); 1989 + vc->vcore_state = VCORE_INACTIVE; 1990 + } else { 2180 1991 kvmppc_run_core(vc); 1992 + } 2181 1993 vc->runner = NULL; 2182 1994 } 
2183 1995 ··· 2234 2032 } 2235 2033 2236 2034 atomic_inc(&vcpu->kvm->arch.vcpus_running); 2237 - /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ 2035 + /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ 2238 2036 smp_mb(); 2239 2037 2240 2038 /* On the first time here, set up HTAB and VRMA */ 2241 - if (!vcpu->kvm->arch.rma_setup_done) { 2039 + if (!vcpu->kvm->arch.hpte_setup_done) { 2242 2040 r = kvmppc_hv_setup_htab_rma(vcpu); 2243 2041 if (r) 2244 2042 goto out; ··· 2440 2238 int srcu_idx; 2441 2239 2442 2240 mutex_lock(&kvm->lock); 2443 - if (kvm->arch.rma_setup_done) 2241 + if (kvm->arch.hpte_setup_done) 2444 2242 goto out; /* another vcpu beat us to it */ 2445 2243 2446 2244 /* Allocate hashed page table (if not done already) and reset it */ ··· 2491 2289 2492 2290 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 2493 2291 2494 - /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ 2292 + /* Order updates to kvm->arch.lpcr etc. vs. 
hpte_setup_done */ 2495 2293 smp_wmb(); 2496 - kvm->arch.rma_setup_done = 1; 2294 + kvm->arch.hpte_setup_done = 1; 2497 2295 err = 0; 2498 2296 out_srcu: 2499 2297 srcu_read_unlock(&kvm->srcu, srcu_idx); ··· 2509 2307 static int kvmppc_core_init_vm_hv(struct kvm *kvm) 2510 2308 { 2511 2309 unsigned long lpcr, lpid; 2310 + char buf[32]; 2512 2311 2513 2312 /* Allocate the guest's logical partition ID */ 2514 2313 ··· 2550 2347 */ 2551 2348 kvm_hv_vm_activated(); 2552 2349 2350 + /* 2351 + * Create a debugfs directory for the VM 2352 + */ 2353 + snprintf(buf, sizeof(buf), "vm%d", current->pid); 2354 + kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); 2355 + if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 2356 + kvmppc_mmu_debugfs_init(kvm); 2357 + 2553 2358 return 0; 2554 2359 } 2555 2360 ··· 2578 2367 2579 2368 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) 2580 2369 { 2370 + debugfs_remove_recursive(kvm->arch.debugfs_dir); 2371 + 2581 2372 kvm_hv_vm_deactivated(); 2582 2373 2583 2374 kvmppc_free_vcores(kvm);
+95 -5
arch/powerpc/kvm/book3s_hv_builtin.c
··· 21 21 #include <asm/cputable.h> 22 22 #include <asm/kvm_ppc.h> 23 23 #include <asm/kvm_book3s.h> 24 + #include <asm/archrandom.h> 25 + #include <asm/xics.h> 26 + #include <asm/dbell.h> 27 + #include <asm/cputhreads.h> 24 28 25 29 #define KVM_CMA_CHUNK_ORDER 18 26 30 ··· 118 114 int rv = H_SUCCESS; /* => don't yield */ 119 115 120 116 set_bit(vcpu->arch.ptid, &vc->conferring_threads); 121 - while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) { 122 - threads_running = VCORE_ENTRY_COUNT(vc); 123 - threads_ceded = hweight32(vc->napping_threads); 124 - threads_conferring = hweight32(vc->conferring_threads); 125 - if (threads_ceded + threads_conferring >= threads_running) { 117 + while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) { 118 + threads_running = VCORE_ENTRY_MAP(vc); 119 + threads_ceded = vc->napping_threads; 120 + threads_conferring = vc->conferring_threads; 121 + if ((threads_ceded | threads_conferring) == threads_running) { 126 122 rv = H_TOO_HARD; /* => do yield */ 127 123 break; 128 124 } ··· 173 169 return 0; 174 170 } 175 171 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); 172 + 173 + int kvmppc_hwrng_present(void) 174 + { 175 + return powernv_hwrng_present(); 176 + } 177 + EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); 178 + 179 + long kvmppc_h_random(struct kvm_vcpu *vcpu) 180 + { 181 + if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) 182 + return H_SUCCESS; 183 + 184 + return H_HARDWARE; 185 + } 186 + 187 + static inline void rm_writeb(unsigned long paddr, u8 val) 188 + { 189 + __asm__ __volatile__("stbcix %0,0,%1" 190 + : : "r" (val), "r" (paddr) : "memory"); 191 + } 192 + 193 + /* 194 + * Send an interrupt or message to another CPU. 195 + * This can only be called in real mode. 196 + * The caller needs to include any barrier needed to order writes 197 + * to memory vs. the IPI/message. 
198 + */ 199 + void kvmhv_rm_send_ipi(int cpu) 200 + { 201 + unsigned long xics_phys; 202 + 203 + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ 204 + if (cpu_has_feature(CPU_FTR_ARCH_207S) && 205 + cpu_first_thread_sibling(cpu) == 206 + cpu_first_thread_sibling(raw_smp_processor_id())) { 207 + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 208 + msg |= cpu_thread_in_core(cpu); 209 + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 210 + return; 211 + } 212 + 213 + /* Else poke the target with an IPI */ 214 + xics_phys = paca[cpu].kvm_hstate.xics_phys; 215 + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 216 + } 217 + 218 + /* 219 + * The following functions are called from the assembly code 220 + * in book3s_hv_rmhandlers.S. 221 + */ 222 + static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active) 223 + { 224 + int cpu = vc->pcpu; 225 + 226 + /* Order setting of exit map vs. msgsnd/IPI */ 227 + smp_mb(); 228 + for (; active; active >>= 1, ++cpu) 229 + if (active & 1) 230 + kvmhv_rm_send_ipi(cpu); 231 + } 232 + 233 + void kvmhv_commence_exit(int trap) 234 + { 235 + struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; 236 + int ptid = local_paca->kvm_hstate.ptid; 237 + int me, ee; 238 + 239 + /* Set our bit in the threads-exiting-guest map in the 0xff00 240 + bits of vcore->entry_exit_map */ 241 + me = 0x100 << ptid; 242 + do { 243 + ee = vc->entry_exit_map; 244 + } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee); 245 + 246 + /* Are we the first here? */ 247 + if ((ee >> 8) != 0) 248 + return; 249 + 250 + /* 251 + * Trigger the other threads in this vcore to exit the guest. 252 + * If this is a hypervisor decrementer interrupt then they 253 + * will be already on their way out of the guest. 254 + */ 255 + if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER) 256 + kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid)); 257 + }
+9 -16
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 150 150 return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); 151 151 } 152 152 153 - static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) 154 - { 155 - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 156 - hpte[0] = cpu_to_be64(hpte_v); 157 - } 158 - 159 153 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, 160 154 long pte_index, unsigned long pteh, unsigned long ptel, 161 155 pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) ··· 265 271 u64 pte; 266 272 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 267 273 cpu_relax(); 268 - pte = be64_to_cpu(*hpte); 274 + pte = be64_to_cpu(hpte[0]); 269 275 if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT))) 270 276 break; 271 - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); 277 + __unlock_hpte(hpte, pte); 272 278 hpte += 2; 273 279 } 274 280 if (i == 8) ··· 284 290 285 291 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 286 292 cpu_relax(); 287 - pte = be64_to_cpu(*hpte); 293 + pte = be64_to_cpu(hpte[0]); 288 294 if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 289 - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); 295 + __unlock_hpte(hpte, pte); 290 296 return H_PTEG_FULL; 291 297 } 292 298 } ··· 325 331 326 332 /* Write the first HPTE dword, unlocking the HPTE and making it valid */ 327 333 eieio(); 328 - hpte[0] = cpu_to_be64(pteh); 334 + __unlock_hpte(hpte, pteh); 329 335 asm volatile("ptesync" : : : "memory"); 330 336 331 337 *pte_idx_ret = pte_index; ··· 406 412 if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || 407 413 ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || 408 414 ((flags & H_ANDCOND) && (pte & avpn) != 0)) { 409 - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 415 + __unlock_hpte(hpte, pte); 410 416 return H_NOT_FOUND; 411 417 } 412 418 ··· 542 548 be64_to_cpu(hp[0]), be64_to_cpu(hp[1])); 543 549 rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); 544 550 args[j] |= rcbits << (56 - 5); 545 - hp[0] = 0; 551 + __unlock_hpte(hp, 0); 546 552 } 547 553 } 548 554 ··· 568 574 pte = be64_to_cpu(hpte[0]); 569 
575 if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || 570 576 ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { 571 - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 577 + __unlock_hpte(hpte, pte); 572 578 return H_NOT_FOUND; 573 579 } 574 580 ··· 749 755 /* Return with the HPTE still locked */ 750 756 return (hash << 3) + (i >> 1); 751 757 752 - /* Unlock and move on */ 753 - hpte[i] = cpu_to_be64(v); 758 + __unlock_hpte(&hpte[i], v); 754 759 } 755 760 756 761 if (val & HPTE_V_SECONDARY)
+217 -21
arch/powerpc/kvm/book3s_hv_rm_xics.c
··· 23 23 24 24 #define DEBUG_PASSUP 25 25 26 - static inline void rm_writeb(unsigned long paddr, u8 val) 26 + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 27 + u32 new_irq); 28 + 29 + /* -- ICS routines -- */ 30 + static void ics_rm_check_resend(struct kvmppc_xics *xics, 31 + struct kvmppc_ics *ics, struct kvmppc_icp *icp) 27 32 { 28 - __asm__ __volatile__("sync; stbcix %0,0,%1" 29 - : : "r" (val), "r" (paddr) : "memory"); 33 + int i; 34 + 35 + arch_spin_lock(&ics->lock); 36 + 37 + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 38 + struct ics_irq_state *state = &ics->irq_state[i]; 39 + 40 + if (!state->resend) 41 + continue; 42 + 43 + arch_spin_unlock(&ics->lock); 44 + icp_rm_deliver_irq(xics, icp, state->number); 45 + arch_spin_lock(&ics->lock); 46 + } 47 + 48 + arch_spin_unlock(&ics->lock); 30 49 } 50 + 51 + /* -- ICP routines -- */ 31 52 32 53 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, 33 54 struct kvm_vcpu *this_vcpu) 34 55 { 35 56 struct kvmppc_icp *this_icp = this_vcpu->arch.icp; 36 - unsigned long xics_phys; 37 57 int cpu; 38 58 39 59 /* Mark the target VCPU as having an interrupt pending */ ··· 76 56 /* In SMT cpu will always point to thread 0, we adjust it */ 77 57 cpu += vcpu->arch.ptid; 78 58 79 - /* Not too hard, then poke the target */ 80 - xics_phys = paca[cpu].kvm_hstate.xics_phys; 81 - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 59 + smp_mb(); 60 + kvmhv_rm_send_ipi(cpu); 82 61 } 83 62 84 63 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) ··· 133 114 struct kvmppc_icp *icp) 134 115 { 135 116 return (xics->real_mode_dbg || icp->rm_action) ? 
H_TOO_HARD : H_SUCCESS; 117 + } 118 + 119 + static void icp_rm_check_resend(struct kvmppc_xics *xics, 120 + struct kvmppc_icp *icp) 121 + { 122 + u32 icsid; 123 + 124 + /* Order this load with the test for need_resend in the caller */ 125 + smp_rmb(); 126 + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { 127 + struct kvmppc_ics *ics = xics->ics[icsid]; 128 + 129 + if (!test_and_clear_bit(icsid, icp->resend_map)) 130 + continue; 131 + if (!ics) 132 + continue; 133 + ics_rm_check_resend(xics, ics, icp); 134 + } 135 + } 136 + 137 + static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, 138 + u32 *reject) 139 + { 140 + union kvmppc_icp_state old_state, new_state; 141 + bool success; 142 + 143 + do { 144 + old_state = new_state = READ_ONCE(icp->state); 145 + 146 + *reject = 0; 147 + 148 + /* See if we can deliver */ 149 + success = new_state.cppr > priority && 150 + new_state.mfrr > priority && 151 + new_state.pending_pri > priority; 152 + 153 + /* 154 + * If we can, check for a rejection and perform the 155 + * delivery 156 + */ 157 + if (success) { 158 + *reject = new_state.xisr; 159 + new_state.xisr = irq; 160 + new_state.pending_pri = priority; 161 + } else { 162 + /* 163 + * If we failed to deliver we set need_resend 164 + * so a subsequent CPPR state change causes us 165 + * to try a new delivery. 166 + */ 167 + new_state.need_resend = true; 168 + } 169 + 170 + } while (!icp_rm_try_update(icp, old_state, new_state)); 171 + 172 + return success; 173 + } 174 + 175 + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 176 + u32 new_irq) 177 + { 178 + struct ics_irq_state *state; 179 + struct kvmppc_ics *ics; 180 + u32 reject; 181 + u16 src; 182 + 183 + /* 184 + * This is used both for initial delivery of an interrupt and 185 + * for subsequent rejection. 186 + * 187 + * Rejection can be racy vs. resends. 
We have evaluated the 188 + * rejection in an atomic ICP transaction which is now complete, 189 + * so potentially the ICP can already accept the interrupt again. 190 + * 191 + * So we need to retry the delivery. Essentially the reject path 192 + * boils down to a failed delivery. Always. 193 + * 194 + * Now the interrupt could also have moved to a different target, 195 + * thus we may need to re-do the ICP lookup as well 196 + */ 197 + 198 + again: 199 + /* Get the ICS state and lock it */ 200 + ics = kvmppc_xics_find_ics(xics, new_irq, &src); 201 + if (!ics) { 202 + /* Unsafe increment, but this does not need to be accurate */ 203 + xics->err_noics++; 204 + return; 205 + } 206 + state = &ics->irq_state[src]; 207 + 208 + /* Get a lock on the ICS */ 209 + arch_spin_lock(&ics->lock); 210 + 211 + /* Get our server */ 212 + if (!icp || state->server != icp->server_num) { 213 + icp = kvmppc_xics_find_server(xics->kvm, state->server); 214 + if (!icp) { 215 + /* Unsafe increment again*/ 216 + xics->err_noicp++; 217 + goto out; 218 + } 219 + } 220 + 221 + /* Clear the resend bit of that interrupt */ 222 + state->resend = 0; 223 + 224 + /* 225 + * If masked, bail out 226 + * 227 + * Note: PAPR doesn't mention anything about masked pending 228 + * when doing a resend, only when doing a delivery. 229 + * 230 + * However that would have the effect of losing a masked 231 + * interrupt that was rejected and isn't consistent with 232 + * the whole masked_pending business which is about not 233 + * losing interrupts that occur while masked. 234 + * 235 + * I don't differentiate normal deliveries and resends, this 236 + * implementation will differ from PAPR and not lose such 237 + * interrupts. 238 + */ 239 + if (state->priority == MASKED) { 240 + state->masked_pending = 1; 241 + goto out; 242 + } 243 + 244 + /* 245 + * Try the delivery, this will set the need_resend flag 246 + * in the ICP as part of the atomic transaction if the 247 + * delivery is not possible. 
248 + * 249 + * Note that if successful, the new delivery might have itself 250 + * rejected an interrupt that was "delivered" before we took the 251 + * ics spin lock. 252 + * 253 + * In this case we do the whole sequence all over again for the 254 + * new guy. We cannot assume that the rejected interrupt is less 255 + * favored than the new one, and thus doesn't need to be delivered, 256 + * because by the time we exit icp_rm_try_to_deliver() the target 257 + * processor may well have already consumed & completed it, and thus 258 + * the rejected interrupt might actually be already acceptable. 259 + */ 260 + if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) { 261 + /* 262 + * Delivery was successful, did we reject somebody else ? 263 + */ 264 + if (reject && reject != XICS_IPI) { 265 + arch_spin_unlock(&ics->lock); 266 + new_irq = reject; 267 + goto again; 268 + } 269 + } else { 270 + /* 271 + * We failed to deliver the interrupt we need to set the 272 + * resend map bit and mark the ICS state as needing a resend 273 + */ 274 + set_bit(ics->icsid, icp->resend_map); 275 + state->resend = 1; 276 + 277 + /* 278 + * If the need_resend flag got cleared in the ICP some time 279 + * between icp_rm_try_to_deliver() atomic update and now, then 280 + * we know it might have missed the resend_map bit. So we 281 + * retry 282 + */ 283 + smp_mb(); 284 + if (!icp->state.need_resend) { 285 + arch_spin_unlock(&ics->lock); 286 + goto again; 287 + } 288 + } 289 + out: 290 + arch_spin_unlock(&ics->lock); 136 291 } 137 292 138 293 static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, ··· 377 184 * separately here as well. 
378 185 */ 379 186 if (resend) { 380 - icp->rm_action |= XICS_RM_CHECK_RESEND; 381 - icp->rm_resend_icp = icp; 187 + icp->n_check_resend++; 188 + icp_rm_check_resend(xics, icp); 382 189 } 383 190 } 384 191 ··· 493 300 } 494 301 } while (!icp_rm_try_update(icp, old_state, new_state)); 495 302 496 - /* Pass rejects to virtual mode */ 303 + /* Handle reject in real mode */ 497 304 if (reject && reject != XICS_IPI) { 498 - this_icp->rm_action |= XICS_RM_REJECT; 499 - this_icp->rm_reject = reject; 305 + this_icp->n_reject++; 306 + icp_rm_deliver_irq(xics, icp, reject); 500 307 } 501 308 502 - /* Pass resends to virtual mode */ 309 + /* Handle resends in real mode */ 503 310 if (resend) { 504 - this_icp->rm_action |= XICS_RM_CHECK_RESEND; 505 - this_icp->rm_resend_icp = icp; 311 + this_icp->n_check_resend++; 312 + icp_rm_check_resend(xics, icp); 506 313 } 507 314 508 315 return check_too_hard(xics, this_icp); ··· 558 365 559 366 } while (!icp_rm_try_update(icp, old_state, new_state)); 560 367 561 - /* Pass rejects to virtual mode */ 368 + /* 369 + * Check for rejects. They are handled by doing a new delivery 370 + * attempt (see comments in icp_rm_deliver_irq). 371 + */ 562 372 if (reject && reject != XICS_IPI) { 563 - icp->rm_action |= XICS_RM_REJECT; 564 - icp->rm_reject = reject; 373 + icp->n_reject++; 374 + icp_rm_deliver_irq(xics, icp, reject); 565 375 } 566 376 bail: 567 377 return check_too_hard(xics, icp); ··· 612 416 goto bail; 613 417 state = &ics->irq_state[src]; 614 418 615 - /* Still asserted, resend it, we make it look like a reject */ 419 + /* Still asserted, resend it */ 616 420 if (state->asserted) { 617 - icp->rm_action |= XICS_RM_REJECT; 618 - icp->rm_reject = irq; 421 + icp->n_reject++; 422 + icp_rm_deliver_irq(xics, icp, irq); 619 423 } 620 424 621 425 if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
+422 -137
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 172 172 173 173 kvmppc_primary_no_guest: 174 174 /* We handle this much like a ceded vcpu */ 175 + /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 176 + mfspr r3, SPRN_HDEC 177 + mtspr SPRN_DEC, r3 178 + /* 179 + * Make sure the primary has finished the MMU switch. 180 + * We should never get here on a secondary thread, but 181 + * check it for robustness' sake. 182 + */ 183 + ld r5, HSTATE_KVM_VCORE(r13) 184 + 65: lbz r0, VCORE_IN_GUEST(r5) 185 + cmpwi r0, 0 186 + beq 65b 187 + /* Set LPCR. */ 188 + ld r8,VCORE_LPCR(r5) 189 + mtspr SPRN_LPCR,r8 190 + isync 175 191 /* set our bit in napping_threads */ 176 192 ld r5, HSTATE_KVM_VCORE(r13) 177 193 lbz r7, HSTATE_PTID(r13) ··· 198 182 or r3, r3, r0 199 183 stwcx. r3, 0, r6 200 184 bne 1b 201 - /* order napping_threads update vs testing entry_exit_count */ 185 + /* order napping_threads update vs testing entry_exit_map */ 202 186 isync 203 187 li r12, 0 204 188 lwz r7, VCORE_ENTRY_EXIT(r5) ··· 207 191 li r3, NAPPING_NOVCPU 208 192 stb r3, HSTATE_NAPPING(r13) 209 193 194 + li r3, 0 /* Don't wake on privileged (OS) doorbell */ 210 195 b kvm_do_nap 211 196 212 197 kvm_novcpu_wakeup: ··· 219 202 220 203 /* check the wake reason */ 221 204 bl kvmppc_check_wake_reason 222 - 205 + 223 206 /* see if any other thread is already exiting */ 224 207 lwz r0, VCORE_ENTRY_EXIT(r5) 225 208 cmpwi r0, 0x100 ··· 239 222 cmpdi r3, 0 240 223 bge kvm_novcpu_exit 241 224 225 + /* See if our timeslice has expired (HDEC is negative) */ 226 + mfspr r0, SPRN_HDEC 227 + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER 228 + cmpwi r0, 0 229 + blt kvm_novcpu_exit 230 + 242 231 /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ 243 232 ld r4, HSTATE_KVM_VCPU(r13) 244 233 cmpdi r4, 0 245 - bne kvmppc_got_guest 234 + beq kvmppc_primary_no_guest 235 + 236 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 237 + addi r3, r4, VCPU_TB_RMENTRY 238 + bl kvmhv_start_timing 239 + #endif 240 + b kvmppc_got_guest 246 241 247 242 
kvm_novcpu_exit: 248 - b hdec_soon 243 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 244 + ld r4, HSTATE_KVM_VCPU(r13) 245 + cmpdi r4, 0 246 + beq 13f 247 + addi r3, r4, VCPU_TB_RMEXIT 248 + bl kvmhv_accumulate_time 249 + #endif 250 + 13: mr r3, r12 251 + stw r12, 112-4(r1) 252 + bl kvmhv_commence_exit 253 + nop 254 + lwz r12, 112-4(r1) 255 + b kvmhv_switch_to_host 249 256 250 257 /* 251 258 * We come in here when wakened from nap mode. ··· 280 239 kvm_start_guest: 281 240 282 241 /* Set runlatch bit the minute you wake up from nap */ 283 - mfspr r1, SPRN_CTRLF 284 - ori r1, r1, 1 285 - mtspr SPRN_CTRLT, r1 242 + mfspr r0, SPRN_CTRLF 243 + ori r0, r0, 1 244 + mtspr SPRN_CTRLT, r0 286 245 287 246 ld r2,PACATOC(r13) 288 247 ··· 327 286 ld r6, PACA_DSCR(r13) 328 287 std r6, HSTATE_DSCR(r13) 329 288 289 + /* Order load of vcore, ptid etc. after load of vcpu */ 290 + lwsync 330 291 bl kvmppc_hv_entry 331 292 332 293 /* Back from the guest, go back to nap */ 333 294 /* Clear our vcpu pointer so we don't come back in early */ 334 295 li r0, 0 335 - std r0, HSTATE_KVM_VCPU(r13) 336 296 /* 337 - * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing 338 - * the nap_count, because once the increment to nap_count is 339 - * visible we could be given another vcpu. 297 + * Once we clear HSTATE_KVM_VCPU(r13), the code in 298 + * kvmppc_run_core() is going to assume that all our vcpu 299 + * state is visible in memory. This lwsync makes sure 300 + * that that is true. 340 301 */ 341 302 lwsync 342 - 343 - /* increment the nap count and then go to nap mode */ 344 - ld r4, HSTATE_KVM_VCORE(r13) 345 - addi r4, r4, VCORE_NAP_COUNT 346 - 51: lwarx r3, 0, r4 347 - addi r3, r3, 1 348 - stwcx. r3, 0, r4 349 - bne 51b 303 + std r0, HSTATE_KVM_VCPU(r13) 350 304 351 305 /* 352 306 * At this point we have finished executing in the guest. 
··· 412 376 li r6, KVM_GUEST_MODE_HOST_HV 413 377 stb r6, HSTATE_IN_GUEST(r13) 414 378 379 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 380 + /* Store initial timestamp */ 381 + cmpdi r4, 0 382 + beq 1f 383 + addi r3, r4, VCPU_TB_RMENTRY 384 + bl kvmhv_start_timing 385 + 1: 386 + #endif 415 387 /* Clear out SLB */ 416 388 li r6,0 417 389 slbmte r6,r6 ··· 431 387 * We don't have to lock against concurrent tlbies, 432 388 * but we do have to coordinate across hardware threads. 433 389 */ 434 - /* Increment entry count iff exit count is zero. */ 435 - ld r5,HSTATE_KVM_VCORE(r13) 436 - addi r9,r5,VCORE_ENTRY_EXIT 437 - 21: lwarx r3,0,r9 438 - cmpwi r3,0x100 /* any threads starting to exit? */ 390 + /* Set bit in entry map iff exit map is zero. */ 391 + ld r5, HSTATE_KVM_VCORE(r13) 392 + li r7, 1 393 + lbz r6, HSTATE_PTID(r13) 394 + sld r7, r7, r6 395 + addi r9, r5, VCORE_ENTRY_EXIT 396 + 21: lwarx r3, 0, r9 397 + cmpwi r3, 0x100 /* any threads starting to exit? */ 439 398 bge secondary_too_late /* if so we're too late to the party */ 440 - addi r3,r3,1 441 - stwcx. r3,0,r9 399 + or r3, r3, r7 400 + stwcx. r3, 0, r9 442 401 bne 21b 443 402 444 403 /* Primary thread switches to guest partition. */ 445 404 ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ 446 - lbz r6,HSTATE_PTID(r13) 447 405 cmpwi r6,0 448 - bne 20f 406 + bne 10f 449 407 ld r6,KVM_SDR1(r9) 450 408 lwz r7,KVM_LPID(r9) 451 409 li r0,LPID_RSVD /* switch to reserved LPID */ ··· 518 472 519 473 li r0,1 520 474 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 521 - b 10f 522 - 523 - /* Secondary threads wait for primary to have done partition switch */ 524 - 20: lbz r0,VCORE_IN_GUEST(r5) 525 - cmpwi r0,0 526 - beq 20b 527 - 528 - /* Set LPCR and RMOR. 
*/ 529 - 10: ld r8,VCORE_LPCR(r5) 530 - mtspr SPRN_LPCR,r8 531 - ld r8,KVM_RMOR(r9) 532 - mtspr SPRN_RMOR,r8 533 - isync 534 - 535 - /* Check if HDEC expires soon */ 536 - mfspr r3,SPRN_HDEC 537 - cmpwi r3,512 /* 1 microsecond */ 538 - li r12,BOOK3S_INTERRUPT_HV_DECREMENTER 539 - blt hdec_soon 540 475 541 476 /* Do we have a guest vcpu to run? */ 542 - cmpdi r4, 0 477 + 10: cmpdi r4, 0 543 478 beq kvmppc_primary_no_guest 544 479 kvmppc_got_guest: 545 480 ··· 845 818 clrrdi r6,r6,1 846 819 mtspr SPRN_CTRLT,r6 847 820 4: 821 + /* Secondary threads wait for primary to have done partition switch */ 822 + ld r5, HSTATE_KVM_VCORE(r13) 823 + lbz r6, HSTATE_PTID(r13) 824 + cmpwi r6, 0 825 + beq 21f 826 + lbz r0, VCORE_IN_GUEST(r5) 827 + cmpwi r0, 0 828 + bne 21f 829 + HMT_LOW 830 + 20: lbz r0, VCORE_IN_GUEST(r5) 831 + cmpwi r0, 0 832 + beq 20b 833 + HMT_MEDIUM 834 + 21: 835 + /* Set LPCR. */ 836 + ld r8,VCORE_LPCR(r5) 837 + mtspr SPRN_LPCR,r8 838 + isync 839 + 840 + /* Check if HDEC expires soon */ 841 + mfspr r3, SPRN_HDEC 842 + cmpwi r3, 512 /* 1 microsecond */ 843 + blt hdec_soon 844 + 848 845 ld r6, VCPU_CTR(r4) 849 846 lwz r7, VCPU_XER(r4) 850 847 ··· 931 880 li r9, KVM_GUEST_MODE_GUEST_HV 932 881 stb r9, HSTATE_IN_GUEST(r13) 933 882 883 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 884 + /* Accumulate timing */ 885 + addi r3, r4, VCPU_TB_GUEST 886 + bl kvmhv_accumulate_time 887 + #endif 888 + 934 889 /* Enter guest */ 935 890 936 891 BEGIN_FTR_SECTION ··· 973 916 974 917 hrfid 975 918 b . 
919 + 920 + secondary_too_late: 921 + li r12, 0 922 + cmpdi r4, 0 923 + beq 11f 924 + stw r12, VCPU_TRAP(r4) 925 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 926 + addi r3, r4, VCPU_TB_RMEXIT 927 + bl kvmhv_accumulate_time 928 + #endif 929 + 11: b kvmhv_switch_to_host 930 + 931 + hdec_soon: 932 + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER 933 + stw r12, VCPU_TRAP(r4) 934 + mr r9, r4 935 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 936 + addi r3, r4, VCPU_TB_RMEXIT 937 + bl kvmhv_accumulate_time 938 + #endif 939 + b guest_exit_cont 976 940 977 941 /****************************************************************************** 978 942 * * ··· 1080 1002 1081 1003 stw r12,VCPU_TRAP(r9) 1082 1004 1005 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1006 + addi r3, r9, VCPU_TB_RMINTR 1007 + mr r4, r9 1008 + bl kvmhv_accumulate_time 1009 + ld r5, VCPU_GPR(R5)(r9) 1010 + ld r6, VCPU_GPR(R6)(r9) 1011 + ld r7, VCPU_GPR(R7)(r9) 1012 + ld r8, VCPU_GPR(R8)(r9) 1013 + #endif 1014 + 1083 1015 /* Save HEIR (HV emulation assist reg) in emul_inst 1084 1016 if this is an HEI (HV emulation interrupt, e40) */ 1085 1017 li r3,KVM_INST_FETCH_FAILED ··· 1116 1028 bne 2f 1117 1029 mfspr r3,SPRN_HDEC 1118 1030 cmpwi r3,0 1119 - bge ignore_hdec 1031 + mr r4,r9 1032 + bge fast_guest_return 1120 1033 2: 1121 1034 /* See if this is an hcall we can handle in real mode */ 1122 1035 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 1123 1036 beq hcall_try_real_mode 1124 1037 1038 + /* Hypervisor doorbell - exit only if host IPI flag set */ 1039 + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL 1040 + bne 3f 1041 + lbz r0, HSTATE_HOST_IPI(r13) 1042 + beq 4f 1043 + b guest_exit_cont 1044 + 3: 1125 1045 /* External interrupt ? */ 1126 1046 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1127 - bne+ ext_interrupt_to_host 1047 + bne+ guest_exit_cont 1128 1048 1129 1049 /* External interrupt, first check for host_ipi. 
If this is 1130 1050 * set, we know the host wants us out so let's do it now 1131 1051 */ 1132 1052 bl kvmppc_read_intr 1133 1053 cmpdi r3, 0 1134 - bgt ext_interrupt_to_host 1054 + bgt guest_exit_cont 1135 1055 1136 1056 /* Check if any CPU is heading out to the host, if so head out too */ 1137 - ld r5, HSTATE_KVM_VCORE(r13) 1057 + 4: ld r5, HSTATE_KVM_VCORE(r13) 1138 1058 lwz r0, VCORE_ENTRY_EXIT(r5) 1139 1059 cmpwi r0, 0x100 1140 - bge ext_interrupt_to_host 1141 - 1142 - /* Return to guest after delivering any pending interrupt */ 1143 1060 mr r4, r9 1144 - b deliver_guest_interrupt 1145 - 1146 - ext_interrupt_to_host: 1061 + blt deliver_guest_interrupt 1147 1062 1148 1063 guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 1149 1064 /* Save more register state */ ··· 1156 1065 stw r7, VCPU_DSISR(r9) 1157 1066 /* don't overwrite fault_dar/fault_dsisr if HDSI */ 1158 1067 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE 1159 - beq 6f 1068 + beq mc_cont 1160 1069 std r6, VCPU_FAULT_DAR(r9) 1161 1070 stw r7, VCPU_FAULT_DSISR(r9) 1162 1071 ··· 1164 1073 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1165 1074 beq machine_check_realmode 1166 1075 mc_cont: 1076 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1077 + addi r3, r9, VCPU_TB_RMEXIT 1078 + mr r4, r9 1079 + bl kvmhv_accumulate_time 1080 + #endif 1081 + 1082 + /* Increment exit count, poke other threads to exit */ 1083 + bl kvmhv_commence_exit 1084 + nop 1085 + ld r9, HSTATE_KVM_VCPU(r13) 1086 + lwz r12, VCPU_TRAP(r9) 1167 1087 1168 1088 /* Save guest CTRL register, set runlatch to 1 */ 1169 - 6: mfspr r6,SPRN_CTRLF 1089 + mfspr r6,SPRN_CTRLF 1170 1090 stw r6,VCPU_CTRL(r9) 1171 1091 andi. r0,r6,1 1172 1092 bne 4f ··· 1519 1417 slbia 1520 1418 ptesync 1521 1419 1522 - hdec_soon: /* r12 = trap, r13 = paca */ 1523 1420 /* 1524 1421 * POWER7/POWER8 guest -> host partition switch code. 1525 1422 * We don't have to lock against tlbies but we do 1526 1423 * have to coordinate the hardware threads. 
1527 1424 */ 1528 - /* Increment the threads-exiting-guest count in the 0xff00 1529 - bits of vcore->entry_exit_count */ 1530 - ld r5,HSTATE_KVM_VCORE(r13) 1531 - addi r6,r5,VCORE_ENTRY_EXIT 1532 - 41: lwarx r3,0,r6 1533 - addi r0,r3,0x100 1534 - stwcx. r0,0,r6 1535 - bne 41b 1536 - isync /* order stwcx. vs. reading napping_threads */ 1537 - 1538 - /* 1539 - * At this point we have an interrupt that we have to pass 1540 - * up to the kernel or qemu; we can't handle it in real mode. 1541 - * Thus we have to do a partition switch, so we have to 1542 - * collect the other threads, if we are the first thread 1543 - * to take an interrupt. To do this, we set the HDEC to 0, 1544 - * which causes an HDEC interrupt in all threads within 2ns 1545 - * because the HDEC register is shared between all 4 threads. 1546 - * However, we don't need to bother if this is an HDEC 1547 - * interrupt, since the other threads will already be on their 1548 - * way here in that case. 1549 - */ 1550 - cmpwi r3,0x100 /* Are we the first here? */ 1551 - bge 43f 1552 - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 1553 - beq 40f 1554 - li r0,0 1555 - mtspr SPRN_HDEC,r0 1556 - 40: 1557 - /* 1558 - * Send an IPI to any napping threads, since an HDEC interrupt 1559 - * doesn't wake CPUs up from nap. 1560 - */ 1561 - lwz r3,VCORE_NAPPING_THREADS(r5) 1562 - lbz r4,HSTATE_PTID(r13) 1563 - li r0,1 1564 - sld r0,r0,r4 1565 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ 1566 - beq 43f 1567 - /* Order entry/exit update vs. IPIs */ 1568 - sync 1569 - mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ 1570 - subf r6,r4,r13 1571 - 42: andi. r0,r3,1 1572 - beq 44f 1573 - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ 1574 - li r0,IPI_PRIORITY 1575 - li r7,XICS_MFRR 1576 - stbcix r0,r7,r8 /* trigger the IPI */ 1577 - 44: srdi. 
r3,r3,1 1578 - addi r6,r6,PACA_SIZE 1579 - bne 42b 1580 - 1581 - secondary_too_late: 1425 + kvmhv_switch_to_host: 1582 1426 /* Secondary threads wait for primary to do partition switch */ 1583 - 43: ld r5,HSTATE_KVM_VCORE(r13) 1427 + ld r5,HSTATE_KVM_VCORE(r13) 1584 1428 ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ 1585 1429 lbz r3,HSTATE_PTID(r13) 1586 1430 cmpwi r3,0 ··· 1610 1562 1: addi r8,r8,16 1611 1563 .endr 1612 1564 1565 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1566 + /* Finish timing, if we have a vcpu */ 1567 + ld r4, HSTATE_KVM_VCPU(r13) 1568 + cmpdi r4, 0 1569 + li r3, 0 1570 + beq 2f 1571 + bl kvmhv_accumulate_time 1572 + 2: 1573 + #endif 1613 1574 /* Unset guest mode */ 1614 1575 li r0, KVM_GUEST_MODE_NONE 1615 1576 stb r0, HSTATE_IN_GUEST(r13) ··· 1753 1696 * Returns to the guest if we handle it, or continues on up to 1754 1697 * the kernel if we can't (i.e. if we don't have a handler for 1755 1698 * it, or if the handler returns H_TOO_HARD). 1699 + * 1700 + * r5 - r8 contain hcall args, 1701 + * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca 1756 1702 */ 1757 - .globl hcall_try_real_mode 1758 1703 hcall_try_real_mode: 1759 1704 ld r3,VCPU_GPR(R3)(r9) 1760 1705 andi. 
r0,r11,MSR_PR ··· 1898 1839 .long 0 /* 0x12c */ 1899 1840 .long 0 /* 0x130 */ 1900 1841 .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table 1842 + .long 0 /* 0x138 */ 1843 + .long 0 /* 0x13c */ 1844 + .long 0 /* 0x140 */ 1845 + .long 0 /* 0x144 */ 1846 + .long 0 /* 0x148 */ 1847 + .long 0 /* 0x14c */ 1848 + .long 0 /* 0x150 */ 1849 + .long 0 /* 0x154 */ 1850 + .long 0 /* 0x158 */ 1851 + .long 0 /* 0x15c */ 1852 + .long 0 /* 0x160 */ 1853 + .long 0 /* 0x164 */ 1854 + .long 0 /* 0x168 */ 1855 + .long 0 /* 0x16c */ 1856 + .long 0 /* 0x170 */ 1857 + .long 0 /* 0x174 */ 1858 + .long 0 /* 0x178 */ 1859 + .long 0 /* 0x17c */ 1860 + .long 0 /* 0x180 */ 1861 + .long 0 /* 0x184 */ 1862 + .long 0 /* 0x188 */ 1863 + .long 0 /* 0x18c */ 1864 + .long 0 /* 0x190 */ 1865 + .long 0 /* 0x194 */ 1866 + .long 0 /* 0x198 */ 1867 + .long 0 /* 0x19c */ 1868 + .long 0 /* 0x1a0 */ 1869 + .long 0 /* 0x1a4 */ 1870 + .long 0 /* 0x1a8 */ 1871 + .long 0 /* 0x1ac */ 1872 + .long 0 /* 0x1b0 */ 1873 + .long 0 /* 0x1b4 */ 1874 + .long 0 /* 0x1b8 */ 1875 + .long 0 /* 0x1bc */ 1876 + .long 0 /* 0x1c0 */ 1877 + .long 0 /* 0x1c4 */ 1878 + .long 0 /* 0x1c8 */ 1879 + .long 0 /* 0x1cc */ 1880 + .long 0 /* 0x1d0 */ 1881 + .long 0 /* 0x1d4 */ 1882 + .long 0 /* 0x1d8 */ 1883 + .long 0 /* 0x1dc */ 1884 + .long 0 /* 0x1e0 */ 1885 + .long 0 /* 0x1e4 */ 1886 + .long 0 /* 0x1e8 */ 1887 + .long 0 /* 0x1ec */ 1888 + .long 0 /* 0x1f0 */ 1889 + .long 0 /* 0x1f4 */ 1890 + .long 0 /* 0x1f8 */ 1891 + .long 0 /* 0x1fc */ 1892 + .long 0 /* 0x200 */ 1893 + .long 0 /* 0x204 */ 1894 + .long 0 /* 0x208 */ 1895 + .long 0 /* 0x20c */ 1896 + .long 0 /* 0x210 */ 1897 + .long 0 /* 0x214 */ 1898 + .long 0 /* 0x218 */ 1899 + .long 0 /* 0x21c */ 1900 + .long 0 /* 0x220 */ 1901 + .long 0 /* 0x224 */ 1902 + .long 0 /* 0x228 */ 1903 + .long 0 /* 0x22c */ 1904 + .long 0 /* 0x230 */ 1905 + .long 0 /* 0x234 */ 1906 + .long 0 /* 0x238 */ 1907 + .long 0 /* 0x23c */ 1908 + .long 0 /* 0x240 */ 1909 + .long 0 /* 0x244 */ 1910 + .long 0 /* 0x248 
*/ 1911 + .long 0 /* 0x24c */ 1912 + .long 0 /* 0x250 */ 1913 + .long 0 /* 0x254 */ 1914 + .long 0 /* 0x258 */ 1915 + .long 0 /* 0x25c */ 1916 + .long 0 /* 0x260 */ 1917 + .long 0 /* 0x264 */ 1918 + .long 0 /* 0x268 */ 1919 + .long 0 /* 0x26c */ 1920 + .long 0 /* 0x270 */ 1921 + .long 0 /* 0x274 */ 1922 + .long 0 /* 0x278 */ 1923 + .long 0 /* 0x27c */ 1924 + .long 0 /* 0x280 */ 1925 + .long 0 /* 0x284 */ 1926 + .long 0 /* 0x288 */ 1927 + .long 0 /* 0x28c */ 1928 + .long 0 /* 0x290 */ 1929 + .long 0 /* 0x294 */ 1930 + .long 0 /* 0x298 */ 1931 + .long 0 /* 0x29c */ 1932 + .long 0 /* 0x2a0 */ 1933 + .long 0 /* 0x2a4 */ 1934 + .long 0 /* 0x2a8 */ 1935 + .long 0 /* 0x2ac */ 1936 + .long 0 /* 0x2b0 */ 1937 + .long 0 /* 0x2b4 */ 1938 + .long 0 /* 0x2b8 */ 1939 + .long 0 /* 0x2bc */ 1940 + .long 0 /* 0x2c0 */ 1941 + .long 0 /* 0x2c4 */ 1942 + .long 0 /* 0x2c8 */ 1943 + .long 0 /* 0x2cc */ 1944 + .long 0 /* 0x2d0 */ 1945 + .long 0 /* 0x2d4 */ 1946 + .long 0 /* 0x2d8 */ 1947 + .long 0 /* 0x2dc */ 1948 + .long 0 /* 0x2e0 */ 1949 + .long 0 /* 0x2e4 */ 1950 + .long 0 /* 0x2e8 */ 1951 + .long 0 /* 0x2ec */ 1952 + .long 0 /* 0x2f0 */ 1953 + .long 0 /* 0x2f4 */ 1954 + .long 0 /* 0x2f8 */ 1955 + .long 0 /* 0x2fc */ 1956 + .long DOTSYM(kvmppc_h_random) - hcall_real_table 1901 1957 .globl hcall_real_table_end 1902 1958 hcall_real_table_end: 1903 - 1904 - ignore_hdec: 1905 - mr r4,r9 1906 - b fast_guest_return 1907 1959 1908 1960 _GLOBAL(kvmppc_h_set_xdabr) 1909 1961 andi. 
r0, r5, DABRX_USER | DABRX_KERNEL ··· 2054 1884 li r3, 0 2055 1885 blr 2056 1886 2057 - _GLOBAL(kvmppc_h_cede) 1887 + _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ 2058 1888 ori r11,r11,MSR_EE 2059 1889 std r11,VCPU_MSR(r3) 2060 1890 li r0,1 ··· 2063 1893 lbz r5,VCPU_PRODDED(r3) 2064 1894 cmpwi r5,0 2065 1895 bne kvm_cede_prodded 2066 - li r0,0 /* set trap to 0 to say hcall is handled */ 2067 - stw r0,VCPU_TRAP(r3) 1896 + li r12,0 /* set trap to 0 to say hcall is handled */ 1897 + stw r12,VCPU_TRAP(r3) 2068 1898 li r0,H_SUCCESS 2069 1899 std r0,VCPU_GPR(R3)(r3) 2070 1900 ··· 2082 1912 addi r6,r5,VCORE_NAPPING_THREADS 2083 1913 31: lwarx r4,0,r6 2084 1914 or r4,r4,r0 2085 - PPC_POPCNTW(R7,R4) 2086 - cmpw r7,r8 2087 - bge kvm_cede_exit 1915 + cmpw r4,r8 1916 + beq kvm_cede_exit 2088 1917 stwcx. r4,0,r6 2089 1918 bne 31b 2090 - /* order napping_threads update vs testing entry_exit_count */ 1919 + /* order napping_threads update vs testing entry_exit_map */ 2091 1920 isync 2092 1921 li r0,NAPPING_CEDE 2093 1922 stb r0,HSTATE_NAPPING(r13) ··· 2124 1955 bl kvmppc_save_fp 2125 1956 2126 1957 /* 1958 + * Set DEC to the smaller of DEC and HDEC, so that we wake 1959 + * no later than the end of our timeslice (HDEC interrupts 1960 + * don't wake us from nap). 
1961 + */ 1962 + mfspr r3, SPRN_DEC 1963 + mfspr r4, SPRN_HDEC 1964 + mftb r5 1965 + cmpw r3, r4 1966 + ble 67f 1967 + mtspr SPRN_DEC, r4 1968 + 67: 1969 + /* save expiry time of guest decrementer */ 1970 + extsw r3, r3 1971 + add r3, r3, r5 1972 + ld r4, HSTATE_KVM_VCPU(r13) 1973 + ld r5, HSTATE_KVM_VCORE(r13) 1974 + ld r6, VCORE_TB_OFFSET(r5) 1975 + subf r3, r6, r3 /* convert to host TB value */ 1976 + std r3, VCPU_DEC_EXPIRES(r4) 1977 + 1978 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1979 + ld r4, HSTATE_KVM_VCPU(r13) 1980 + addi r3, r4, VCPU_TB_CEDE 1981 + bl kvmhv_accumulate_time 1982 + #endif 1983 + 1984 + lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ 1985 + 1986 + /* 2127 1987 * Take a nap until a decrementer or external or doobell interrupt 2128 - * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the 2129 - * runlatch bit before napping. 1988 + * occurs, with PECE1 and PECE0 set in LPCR. 1989 + * On POWER8, set PECEDH, and if we are ceding, also set PECEDP. 1990 + * Also clear the runlatch bit before napping. 
2130 1991 */ 2131 1992 kvm_do_nap: 2132 - mfspr r2, SPRN_CTRLF 2133 - clrrdi r2, r2, 1 2134 - mtspr SPRN_CTRLT, r2 1993 + mfspr r0, SPRN_CTRLF 1994 + clrrdi r0, r0, 1 1995 + mtspr SPRN_CTRLT, r0 2135 1996 2136 1997 li r0,1 2137 1998 stb r0,HSTATE_HWTHREAD_REQ(r13) 2138 1999 mfspr r5,SPRN_LPCR 2139 2000 ori r5,r5,LPCR_PECE0 | LPCR_PECE1 2140 2001 BEGIN_FTR_SECTION 2141 - oris r5,r5,LPCR_PECEDP@h 2002 + ori r5, r5, LPCR_PECEDH 2003 + rlwimi r5, r3, 0, LPCR_PECEDP 2142 2004 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 2143 2005 mtspr SPRN_LPCR,r5 2144 2006 isync ··· 2194 1994 /* Woken by external or decrementer interrupt */ 2195 1995 ld r1, HSTATE_HOST_R1(r13) 2196 1996 1997 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1998 + addi r3, r4, VCPU_TB_RMINTR 1999 + bl kvmhv_accumulate_time 2000 + #endif 2001 + 2197 2002 /* load up FP state */ 2198 2003 bl kvmppc_load_fp 2004 + 2005 + /* Restore guest decrementer */ 2006 + ld r3, VCPU_DEC_EXPIRES(r4) 2007 + ld r5, HSTATE_KVM_VCORE(r13) 2008 + ld r6, VCORE_TB_OFFSET(r5) 2009 + add r3, r3, r6 /* convert host TB to guest TB value */ 2010 + mftb r7 2011 + subf r3, r7, r3 2012 + mtspr SPRN_DEC, r3 2199 2013 2200 2014 /* Load NV GPRS */ 2201 2015 ld r14, VCPU_GPR(R14)(r4) ··· 2271 2057 2272 2058 /* we've ceded but we want to give control to the host */ 2273 2059 kvm_cede_exit: 2274 - b hcall_real_fallback 2060 + ld r9, HSTATE_KVM_VCPU(r13) 2061 + b guest_exit_cont 2275 2062 2276 2063 /* Try to handle a machine check in real mode */ 2277 2064 machine_check_realmode: ··· 2304 2089 2305 2090 /* 2306 2091 * Check the reason we woke from nap, and take appropriate action. 
2307 - * Returns: 2092 + * Returns (in r3): 2308 2093 * 0 if nothing needs to be done 2309 2094 * 1 if something happened that needs to be handled by the host 2310 - * -1 if there was a guest wakeup (IPI) 2095 + * -1 if there was a guest wakeup (IPI or msgsnd) 2311 2096 * 2312 2097 * Also sets r12 to the interrupt vector for any interrupt that needs 2313 2098 * to be handled now by the host (0x500 for external interrupt), or zero. 2099 + * Modifies r0, r6, r7, r8. 2314 2100 */ 2315 2101 kvmppc_check_wake_reason: 2316 2102 mfspr r6, SPRN_SRR1 ··· 2338 2122 2339 2123 /* hypervisor doorbell */ 2340 2124 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL 2125 + /* see if it's a host IPI */ 2341 2126 li r3, 1 2127 + lbz r0, HSTATE_HOST_IPI(r13) 2128 + cmpwi r0, 0 2129 + bnelr 2130 + /* if not, clear it and return -1 */ 2131 + lis r6, (PPC_DBELL_SERVER << (63-36))@h 2132 + PPC_MSGCLR(6) 2133 + li r3, -1 2342 2134 blr 2343 2135 2344 2136 /* ··· 2355 2131 * 0 if no interrupt is pending 2356 2132 * 1 if an interrupt is pending that needs to be handled by the host 2357 2133 * -1 if there was a guest wakeup IPI (which has now been cleared) 2134 + * Modifies r0, r6, r7, r8, returns value in r3. 
2358 2135 */ 2359 2136 kvmppc_read_intr: 2360 2137 /* see if a host IPI is pending */ ··· 2410 2185 bne- 43f 2411 2186 2412 2187 /* OK, it's an IPI for us */ 2188 + li r12, 0 2413 2189 li r3, -1 2414 2190 1: blr 2415 2191 ··· 2540 2314 mtspr SPRN_PMC6, r3 2541 2315 isync 2542 2316 blr 2317 + 2318 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 2319 + /* 2320 + * Start timing an activity 2321 + * r3 = pointer to time accumulation struct, r4 = vcpu 2322 + */ 2323 + kvmhv_start_timing: 2324 + ld r5, HSTATE_KVM_VCORE(r13) 2325 + lbz r6, VCORE_IN_GUEST(r5) 2326 + cmpwi r6, 0 2327 + beq 5f /* if in guest, need to */ 2328 + ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ 2329 + 5: mftb r5 2330 + subf r5, r6, r5 2331 + std r3, VCPU_CUR_ACTIVITY(r4) 2332 + std r5, VCPU_ACTIVITY_START(r4) 2333 + blr 2334 + 2335 + /* 2336 + * Accumulate time to one activity and start another. 2337 + * r3 = pointer to new time accumulation struct, r4 = vcpu 2338 + */ 2339 + kvmhv_accumulate_time: 2340 + ld r5, HSTATE_KVM_VCORE(r13) 2341 + lbz r8, VCORE_IN_GUEST(r5) 2342 + cmpwi r8, 0 2343 + beq 4f /* if in guest, need to */ 2344 + ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ 2345 + 4: ld r5, VCPU_CUR_ACTIVITY(r4) 2346 + ld r6, VCPU_ACTIVITY_START(r4) 2347 + std r3, VCPU_CUR_ACTIVITY(r4) 2348 + mftb r7 2349 + subf r7, r8, r7 2350 + std r7, VCPU_ACTIVITY_START(r4) 2351 + cmpdi r5, 0 2352 + beqlr 2353 + subf r3, r6, r7 2354 + ld r8, TAS_SEQCOUNT(r5) 2355 + cmpdi r8, 0 2356 + addi r8, r8, 1 2357 + std r8, TAS_SEQCOUNT(r5) 2358 + lwsync 2359 + ld r7, TAS_TOTAL(r5) 2360 + add r7, r7, r3 2361 + std r7, TAS_TOTAL(r5) 2362 + ld r6, TAS_MIN(r5) 2363 + ld r7, TAS_MAX(r5) 2364 + beq 3f 2365 + cmpd r3, r6 2366 + bge 1f 2367 + 3: std r3, TAS_MIN(r5) 2368 + 1: cmpd r3, r7 2369 + ble 2f 2370 + std r3, TAS_MAX(r5) 2371 + 2: lwsync 2372 + addi r8, r8, 1 2373 + std r8, TAS_SEQCOUNT(r5) 2374 + blr 2375 + #endif
+28
arch/powerpc/kvm/book3s_pr_papr.c
··· 258 258 return EMULATE_DONE; 259 259 } 260 260 261 + static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu) 262 + { 263 + long rc; 264 + 265 + rc = kvmppc_h_logical_ci_load(vcpu); 266 + if (rc == H_TOO_HARD) 267 + return EMULATE_FAIL; 268 + kvmppc_set_gpr(vcpu, 3, rc); 269 + return EMULATE_DONE; 270 + } 271 + 272 + static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) 273 + { 274 + long rc; 275 + 276 + rc = kvmppc_h_logical_ci_store(vcpu); 277 + if (rc == H_TOO_HARD) 278 + return EMULATE_FAIL; 279 + kvmppc_set_gpr(vcpu, 3, rc); 280 + return EMULATE_DONE; 281 + } 282 + 261 283 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) 262 284 { 263 285 long rc = kvmppc_xics_hcall(vcpu, cmd); ··· 312 290 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 313 291 vcpu->stat.halt_wakeup++; 314 292 return EMULATE_DONE; 293 + case H_LOGICAL_CI_LOAD: 294 + return kvmppc_h_pr_logical_ci_load(vcpu); 295 + case H_LOGICAL_CI_STORE: 296 + return kvmppc_h_pr_logical_ci_store(vcpu); 315 297 case H_XIRR: 316 298 case H_CPPR: 317 299 case H_EOI: ··· 349 323 case H_BULK_REMOVE: 350 324 case H_PUT_TCE: 351 325 case H_CEDE: 326 + case H_LOGICAL_CI_LOAD: 327 + case H_LOGICAL_CI_STORE: 352 328 #ifdef CONFIG_KVM_XICS 353 329 case H_XIRR: 354 330 case H_CPPR:
+80 -25
arch/powerpc/kvm/book3s_xics.c
··· 20 20 #include <asm/xics.h> 21 21 #include <asm/debug.h> 22 22 #include <asm/time.h> 23 + #include <asm/spinlock.h> 23 24 24 25 #include <linux/debugfs.h> 25 26 #include <linux/seq_file.h> ··· 40 39 * LOCKING 41 40 * ======= 42 41 * 43 - * Each ICS has a mutex protecting the information about the IRQ 42 + * Each ICS has a spin lock protecting the information about the IRQ 44 43 * sources and avoiding simultaneous deliveries if the same interrupt. 45 44 * 46 45 * ICP operations are done via a single compare & swap transaction ··· 110 109 { 111 110 int i; 112 111 113 - mutex_lock(&ics->lock); 112 + unsigned long flags; 113 + 114 + local_irq_save(flags); 115 + arch_spin_lock(&ics->lock); 114 116 115 117 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 116 118 struct ics_irq_state *state = &ics->irq_state[i]; ··· 124 120 XICS_DBG("resend %#x prio %#x\n", state->number, 125 121 state->priority); 126 122 127 - mutex_unlock(&ics->lock); 123 + arch_spin_unlock(&ics->lock); 124 + local_irq_restore(flags); 128 125 icp_deliver_irq(xics, icp, state->number); 129 - mutex_lock(&ics->lock); 126 + local_irq_save(flags); 127 + arch_spin_lock(&ics->lock); 130 128 } 131 129 132 - mutex_unlock(&ics->lock); 130 + arch_spin_unlock(&ics->lock); 131 + local_irq_restore(flags); 133 132 } 134 133 135 134 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, ··· 140 133 u32 server, u32 priority, u32 saved_priority) 141 134 { 142 135 bool deliver; 136 + unsigned long flags; 143 137 144 - mutex_lock(&ics->lock); 138 + local_irq_save(flags); 139 + arch_spin_lock(&ics->lock); 145 140 146 141 state->server = server; 147 142 state->priority = priority; ··· 154 145 deliver = true; 155 146 } 156 147 157 - mutex_unlock(&ics->lock); 148 + arch_spin_unlock(&ics->lock); 149 + local_irq_restore(flags); 158 150 159 151 return deliver; 160 152 } ··· 196 186 struct kvmppc_ics *ics; 197 187 struct ics_irq_state *state; 198 188 u16 src; 189 + unsigned long flags; 199 190 200 191 if 
(!xics) 201 192 return -ENODEV; ··· 206 195 return -EINVAL; 207 196 state = &ics->irq_state[src]; 208 197 209 - mutex_lock(&ics->lock); 198 + local_irq_save(flags); 199 + arch_spin_lock(&ics->lock); 210 200 *server = state->server; 211 201 *priority = state->priority; 212 - mutex_unlock(&ics->lock); 202 + arch_spin_unlock(&ics->lock); 203 + local_irq_restore(flags); 213 204 214 205 return 0; 215 206 } ··· 378 365 struct kvmppc_ics *ics; 379 366 u32 reject; 380 367 u16 src; 368 + unsigned long flags; 381 369 382 370 /* 383 371 * This is used both for initial delivery of an interrupt and ··· 405 391 state = &ics->irq_state[src]; 406 392 407 393 /* Get a lock on the ICS */ 408 - mutex_lock(&ics->lock); 394 + local_irq_save(flags); 395 + arch_spin_lock(&ics->lock); 409 396 410 397 /* Get our server */ 411 398 if (!icp || state->server != icp->server_num) { ··· 449 434 * 450 435 * Note that if successful, the new delivery might have itself 451 436 * rejected an interrupt that was "delivered" before we took the 452 - * icp mutex. 437 + * ics spin lock. 453 438 * 454 439 * In this case we do the whole sequence all over again for the 455 440 * new guy. We cannot assume that the rejected interrupt is less ··· 463 448 * Delivery was successful, did we reject somebody else ? 
464 449 */ 465 450 if (reject && reject != XICS_IPI) { 466 - mutex_unlock(&ics->lock); 451 + arch_spin_unlock(&ics->lock); 452 + local_irq_restore(flags); 467 453 new_irq = reject; 468 454 goto again; 469 455 } ··· 484 468 */ 485 469 smp_mb(); 486 470 if (!icp->state.need_resend) { 487 - mutex_unlock(&ics->lock); 471 + arch_spin_unlock(&ics->lock); 472 + local_irq_restore(flags); 488 473 goto again; 489 474 } 490 475 } 491 476 out: 492 - mutex_unlock(&ics->lock); 477 + arch_spin_unlock(&ics->lock); 478 + local_irq_restore(flags); 493 479 } 494 480 495 481 static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, ··· 820 802 XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", 821 803 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); 822 804 823 - if (icp->rm_action & XICS_RM_KICK_VCPU) 805 + if (icp->rm_action & XICS_RM_KICK_VCPU) { 806 + icp->n_rm_kick_vcpu++; 824 807 kvmppc_fast_vcpu_kick(icp->rm_kick_target); 825 - if (icp->rm_action & XICS_RM_CHECK_RESEND) 808 + } 809 + if (icp->rm_action & XICS_RM_CHECK_RESEND) { 810 + icp->n_rm_check_resend++; 826 811 icp_check_resend(xics, icp->rm_resend_icp); 827 - if (icp->rm_action & XICS_RM_REJECT) 812 + } 813 + if (icp->rm_action & XICS_RM_REJECT) { 814 + icp->n_rm_reject++; 828 815 icp_deliver_irq(xics, icp, icp->rm_reject); 829 - if (icp->rm_action & XICS_RM_NOTIFY_EOI) 816 + } 817 + if (icp->rm_action & XICS_RM_NOTIFY_EOI) { 818 + icp->n_rm_notify_eoi++; 830 819 kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); 820 + } 831 821 832 822 icp->rm_action = 0; 833 823 ··· 898 872 struct kvm *kvm = xics->kvm; 899 873 struct kvm_vcpu *vcpu; 900 874 int icsid, i; 875 + unsigned long flags; 876 + unsigned long t_rm_kick_vcpu, t_rm_check_resend; 877 + unsigned long t_rm_reject, t_rm_notify_eoi; 878 + unsigned long t_reject, t_check_resend; 901 879 902 880 if (!kvm) 903 881 return 0; 882 + 883 + t_rm_kick_vcpu = 0; 884 + t_rm_notify_eoi = 0; 885 + t_rm_check_resend = 0; 886 + 
t_rm_reject = 0; 887 + t_check_resend = 0; 888 + t_reject = 0; 904 889 905 890 seq_printf(m, "=========\nICP state\n=========\n"); 906 891 ··· 927 890 icp->server_num, state.xisr, 928 891 state.pending_pri, state.cppr, state.mfrr, 929 892 state.out_ee, state.need_resend); 893 + t_rm_kick_vcpu += icp->n_rm_kick_vcpu; 894 + t_rm_notify_eoi += icp->n_rm_notify_eoi; 895 + t_rm_check_resend += icp->n_rm_check_resend; 896 + t_rm_reject += icp->n_rm_reject; 897 + t_check_resend += icp->n_check_resend; 898 + t_reject += icp->n_reject; 930 899 } 931 900 901 + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", 902 + t_rm_kick_vcpu, t_rm_check_resend, 903 + t_rm_reject, t_rm_notify_eoi); 904 + seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", 905 + t_check_resend, t_reject); 932 906 for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { 933 907 struct kvmppc_ics *ics = xics->ics[icsid]; 934 908 ··· 949 901 seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", 950 902 icsid); 951 903 952 - mutex_lock(&ics->lock); 904 + local_irq_save(flags); 905 + arch_spin_lock(&ics->lock); 953 906 954 907 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 955 908 struct ics_irq_state *irq = &ics->irq_state[i]; ··· 961 912 irq->resend, irq->masked_pending); 962 913 963 914 } 964 - mutex_unlock(&ics->lock); 915 + arch_spin_unlock(&ics->lock); 916 + local_irq_restore(flags); 965 917 } 966 918 return 0; 967 919 } ··· 1015 965 if (!ics) 1016 966 goto out; 1017 967 1018 - mutex_init(&ics->lock); 1019 968 ics->icsid = icsid; 1020 969 1021 970 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { ··· 1156 1107 u64 __user *ubufp = (u64 __user *) addr; 1157 1108 u16 idx; 1158 1109 u64 val, prio; 1110 + unsigned long flags; 1159 1111 1160 1112 ics = kvmppc_xics_find_ics(xics, irq, &idx); 1161 1113 if (!ics) 1162 1114 return -ENOENT; 1163 1115 1164 1116 irqp = &ics->irq_state[idx]; 1165 - mutex_lock(&ics->lock); 1117 + 
local_irq_save(flags); 1118 + arch_spin_lock(&ics->lock); 1166 1119 ret = -ENOENT; 1167 1120 if (irqp->exists) { 1168 1121 val = irqp->server; ··· 1180 1129 val |= KVM_XICS_PENDING; 1181 1130 ret = 0; 1182 1131 } 1183 - mutex_unlock(&ics->lock); 1132 + arch_spin_unlock(&ics->lock); 1133 + local_irq_restore(flags); 1184 1134 1185 1135 if (!ret && put_user(val, ubufp)) 1186 1136 ret = -EFAULT; ··· 1198 1146 u64 val; 1199 1147 u8 prio; 1200 1148 u32 server; 1149 + unsigned long flags; 1201 1150 1202 1151 if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) 1203 1152 return -ENOENT; ··· 1219 1166 kvmppc_xics_find_server(xics->kvm, server) == NULL) 1220 1167 return -EINVAL; 1221 1168 1222 - mutex_lock(&ics->lock); 1169 + local_irq_save(flags); 1170 + arch_spin_lock(&ics->lock); 1223 1171 irqp->server = server; 1224 1172 irqp->saved_priority = prio; 1225 1173 if (val & KVM_XICS_MASKED) ··· 1232 1178 if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) 1233 1179 irqp->asserted = 1; 1234 1180 irqp->exists = 1; 1235 - mutex_unlock(&ics->lock); 1181 + arch_spin_unlock(&ics->lock); 1182 + local_irq_restore(flags); 1236 1183 1237 1184 if (val & KVM_XICS_PENDING) 1238 1185 icp_deliver_irq(xics, NULL, irqp->number);
+12 -1
arch/powerpc/kvm/book3s_xics.h
··· 78 78 u32 rm_reject; 79 79 u32 rm_eoied_irq; 80 80 81 + /* Counters for each reason we exited real mode */ 82 + unsigned long n_rm_kick_vcpu; 83 + unsigned long n_rm_check_resend; 84 + unsigned long n_rm_reject; 85 + unsigned long n_rm_notify_eoi; 86 + /* Counters for handling ICP processing in real mode */ 87 + unsigned long n_check_resend; 88 + unsigned long n_reject; 89 + 81 90 /* Debug stuff for real mode */ 82 91 union kvmppc_icp_state rm_dbgstate; 83 92 struct kvm_vcpu *rm_dbgtgt; 84 93 }; 85 94 86 95 struct kvmppc_ics { 87 - struct mutex lock; 96 + arch_spinlock_t lock; 88 97 u16 icsid; 89 98 struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; 90 99 }; ··· 105 96 u32 max_icsid; 106 97 bool real_mode; 107 98 bool real_mode_dbg; 99 + u32 err_noics; 100 + u32 err_noicp; 108 101 struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; 109 102 }; 110 103
+3
arch/powerpc/kvm/powerpc.c
··· 529 529 case KVM_CAP_PPC_RMA: 530 530 r = 0; 531 531 break; 532 + case KVM_CAP_PPC_HWRNG: 533 + r = kvmppc_hwrng_present(); 534 + break; 532 535 #endif 533 536 case KVM_CAP_SYNC_MMU: 534 537 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+29
arch/powerpc/platforms/powernv/rng.c
··· 24 24 25 25 struct powernv_rng { 26 26 void __iomem *regs; 27 + void __iomem *regs_real; 27 28 unsigned long mask; 28 29 }; 29 30 30 31 static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); 31 32 33 + 34 + int powernv_hwrng_present(void) 35 + { 36 + struct powernv_rng *rng; 37 + 38 + rng = get_cpu_var(powernv_rng); 39 + put_cpu_var(rng); 40 + return rng != NULL; 41 + } 32 42 33 43 static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) 34 44 { ··· 54 44 rng->mask = (rng->mask << 1) | (parity & 1); 55 45 56 46 return val; 47 + } 48 + 49 + int powernv_get_random_real_mode(unsigned long *v) 50 + { 51 + struct powernv_rng *rng; 52 + 53 + rng = raw_cpu_read(powernv_rng); 54 + 55 + *v = rng_whiten(rng, in_rm64(rng->regs_real)); 56 + 57 + return 1; 57 58 } 58 59 59 60 int powernv_get_random_long(unsigned long *v) ··· 101 80 static __init int rng_create(struct device_node *dn) 102 81 { 103 82 struct powernv_rng *rng; 83 + struct resource res; 104 84 unsigned long val; 105 85 106 86 rng = kzalloc(sizeof(*rng), GFP_KERNEL); 107 87 if (!rng) 108 88 return -ENOMEM; 89 + 90 + if (of_address_to_resource(dn, 0, &res)) { 91 + kfree(rng); 92 + return -ENXIO; 93 + } 94 + 95 + rng->regs_real = (void __iomem *)res.start; 109 96 110 97 rng->regs = of_iomap(dn, 0); 111 98 if (!rng->regs) {
+1 -1
arch/s390/kvm/kvm-s390.c
··· 110 110 /* upper facilities limit for kvm */ 111 111 unsigned long kvm_s390_fac_list_mask[] = { 112 112 0xffe6fffbfcfdfc40UL, 113 - 0x205c800000000000UL, 113 + 0x005c800000000000UL, 114 114 }; 115 115 116 116 unsigned long kvm_s390_fac_list_mask_size(void)
+6 -5
arch/x86/kvm/lapic.c
··· 683 683 unsigned long bitmap = 1; 684 684 struct kvm_lapic **dst; 685 685 int i; 686 - bool ret = false; 687 - bool x2apic_ipi = src && apic_x2apic_mode(src); 686 + bool ret, x2apic_ipi; 688 687 689 688 *r = -1; 690 689 ··· 695 696 if (irq->shorthand) 696 697 return false; 697 698 699 + x2apic_ipi = src && apic_x2apic_mode(src); 698 700 if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) 699 701 return false; 700 702 703 + ret = true; 701 704 rcu_read_lock(); 702 705 map = rcu_dereference(kvm->arch.apic_map); 703 706 704 - if (!map) 707 + if (!map) { 708 + ret = false; 705 709 goto out; 706 - 707 - ret = true; 710 + } 708 711 709 712 if (irq->dest_mode == APIC_DEST_PHYSICAL) { 710 713 if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+7 -13
arch/x86/kvm/mmu.c
··· 4481 4481 pfn = spte_to_pfn(*sptep); 4482 4482 4483 4483 /* 4484 - * Only EPT supported for now; otherwise, one would need to 4485 - * find out efficiently whether the guest page tables are 4486 - * also using huge pages. 4484 + * We cannot do huge page mapping for indirect shadow pages, 4485 + * which are found on the last rmap (level = 1) when not using 4486 + * tdp; such shadow pages are synced with the page table in 4487 + * the guest, and the guest page table is using 4K page size 4488 + * mapping if the indirect sp has level = 1. 4487 4489 */ 4488 4490 if (sp->role.direct && 4489 4491 !kvm_is_reserved_pfn(pfn) && ··· 4506 4504 bool flush = false; 4507 4505 unsigned long *rmapp; 4508 4506 unsigned long last_index, index; 4509 - gfn_t gfn_start, gfn_end; 4510 4507 4511 4508 spin_lock(&kvm->mmu_lock); 4512 4509 4513 - gfn_start = memslot->base_gfn; 4514 - gfn_end = memslot->base_gfn + memslot->npages - 1; 4515 - 4516 - if (gfn_start >= gfn_end) 4517 - goto out; 4518 - 4519 4510 rmapp = memslot->arch.rmap[0]; 4520 - last_index = gfn_to_index(gfn_end, memslot->base_gfn, 4521 - PT_PAGE_TABLE_LEVEL); 4511 + last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, 4512 + memslot->base_gfn, PT_PAGE_TABLE_LEVEL); 4522 4513 4523 4514 for (index = 0; index <= last_index; ++index, ++rmapp) { 4524 4515 if (*rmapp) ··· 4529 4534 if (flush) 4530 4535 kvm_flush_remote_tlbs(kvm); 4531 4536 4532 - out: 4533 4537 spin_unlock(&kvm->mmu_lock); 4534 4538 } 4535 4539
+10 -2
arch/x86/kvm/vmx.c
··· 3622 3622 3623 3623 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3624 3624 { 3625 - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 3626 - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3625 + /* 3626 + * Pass through host's Machine Check Enable value to hw_cr4, which 3627 + * is in force while we are in guest mode. Do not let guests control 3628 + * this bit, even if host CR4.MCE == 0. 3629 + */ 3630 + unsigned long hw_cr4 = 3631 + (cr4_read_shadow() & X86_CR4_MCE) | 3632 + (cr4 & ~X86_CR4_MCE) | 3633 + (to_vmx(vcpu)->rmode.vm86_active ? 3634 + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3627 3635 3628 3636 if (cr4 & X86_CR4_VMXE) { 3629 3637 /*
+8 -2
arch/x86/kvm/x86.c
··· 5799 5799 kvm_set_mmio_spte_mask(); 5800 5800 5801 5801 kvm_x86_ops = ops; 5802 - kvm_init_msr_list(); 5803 5802 5804 5803 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5805 5804 PT_DIRTY_MASK, PT64_NX_MASK, 0); ··· 7252 7253 7253 7254 int kvm_arch_hardware_setup(void) 7254 7255 { 7255 - return kvm_x86_ops->hardware_setup(); 7256 + int r; 7257 + 7258 + r = kvm_x86_ops->hardware_setup(); 7259 + if (r != 0) 7260 + return r; 7261 + 7262 + kvm_init_msr_list(); 7263 + return 0; 7256 7264 } 7257 7265 7258 7266 void kvm_arch_hardware_unsetup(void)
+1
include/uapi/linux/kvm.h
··· 813 813 #define KVM_CAP_MIPS_MSA 112 814 814 #define KVM_CAP_S390_INJECT_IRQ 113 815 815 #define KVM_CAP_S390_IRQ_STATE 114 816 + #define KVM_CAP_PPC_HWRNG 115 816 817 817 818 #ifdef KVM_CAP_IRQ_ROUTING 818 819
+4 -1
virt/kvm/arm/vgic.c
··· 1561 1561 goto out; 1562 1562 } 1563 1563 1564 + if (irq_num >= kvm->arch.vgic.nr_irqs) 1565 + return -EINVAL; 1566 + 1564 1567 vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level); 1565 1568 if (vcpu_id >= 0) { 1566 1569 /* kick the specified vcpu */ ··· 2144 2141 struct kvm_kernel_irq_routing_entry *entries, 2145 2142 int gsi) 2146 2143 { 2147 - return gsi; 2144 + return 0; 2148 2145 } 2149 2146 2150 2147 int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
+1
virt/kvm/kvm_main.c
··· 89 89 static __read_mostly struct preempt_ops kvm_preempt_ops; 90 90 91 91 struct dentry *kvm_debugfs_dir; 92 + EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 92 93 93 94 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 94 95 unsigned long arg);