Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull second batch of KVM changes from Paolo Bonzini:
"This mostly includes the PPC changes for 4.1, which this time cover
Book3S HV only (debugging aids, minor performance improvements and
some cleanups). But there are also bug fixes and small cleanups for
ARM, x86 and s390.

The task_migration_notifier revert and real fix is still pending
review, but I'll send it as soon as possible after -rc1"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (29 commits)
KVM: arm/arm64: check IRQ number on userland injection
KVM: arm: irqfd: fix value returned by kvm_irq_map_gsi
KVM: VMX: Preserve host CR4.MCE value while in guest mode.
KVM: PPC: Book3S HV: Use msgsnd for signalling threads on POWER8
KVM: PPC: Book3S HV: Translate kvmhv_commence_exit to C
KVM: PPC: Book3S HV: Streamline guest entry and exit
KVM: PPC: Book3S HV: Use bitmap of active threads rather than count
KVM: PPC: Book3S HV: Use decrementer to wake napping threads
KVM: PPC: Book3S HV: Don't wake thread with no vcpu on guest IPI
KVM: PPC: Book3S HV: Get rid of vcore nap_count and n_woken
KVM: PPC: Book3S HV: Move vcore preemption point up into kvmppc_run_vcpu
KVM: PPC: Book3S HV: Minor cleanups
KVM: PPC: Book3S HV: Simplify handling of VCPUs that need a VPA update
KVM: PPC: Book3S HV: Accumulate timing information for real-mode code
KVM: PPC: Book3S HV: Create debugfs file for each guest's HPT
KVM: PPC: Book3S HV: Add ICP real mode counters
KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode
KVM: PPC: Book3S HV: Convert ICS mutex lock to spin lock
KVM: PPC: Book3S HV: Add guest->host real mode completion counters
KVM: PPC: Book3S HV: Add helpers for lock/unlock hpte
...

+1631 -393
+17
Documentation/virtual/kvm/api.txt
··· 3573 3573 @ar - access register number 3574 3574 3575 3575 KVM handlers should exit to userspace with rc = -EREMOTE. 3576 + 3577 + 3578 + 8. Other capabilities. 3579 + ---------------------- 3580 + 3581 + This section lists capabilities that give information about other 3582 + features of the KVM implementation. 3583 + 3584 + 8.1 KVM_CAP_PPC_HWRNG 3585 + 3586 + Architectures: ppc 3587 + 3588 + This capability, if KVM_CHECK_EXTENSION indicates that it is 3589 + available, means that the kernel has an implementation of the 3590 + H_RANDOM hypercall backed by a hardware random-number generator. 3591 + If present, the kernel H_RANDOM handler can be enabled for guest use 3592 + with the KVM_CAP_PPC_ENABLE_HCALL capability.
+7 -1
arch/arm/include/uapi/asm/kvm.h
··· 195 195 #define KVM_ARM_IRQ_CPU_IRQ 0 196 196 #define KVM_ARM_IRQ_CPU_FIQ 1 197 197 198 - /* Highest supported SPI, from VGIC_NR_IRQS */ 198 + /* 199 + * This used to hold the highest supported SPI, but it is now obsolete 200 + * and only here to provide source code level compatibility with older 201 + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. 202 + */ 203 + #ifndef __KERNEL__ 199 204 #define KVM_ARM_IRQ_GIC_MAX 127 205 + #endif 200 206 201 207 /* One single KVM irqchip, ie. the VGIC */ 202 208 #define KVM_NR_IRQCHIPS 1
+1 -2
arch/arm/kvm/arm.c
··· 671 671 if (!irqchip_in_kernel(kvm)) 672 672 return -ENXIO; 673 673 674 - if (irq_num < VGIC_NR_PRIVATE_IRQS || 675 - irq_num > KVM_ARM_IRQ_GIC_MAX) 674 + if (irq_num < VGIC_NR_PRIVATE_IRQS) 676 675 return -EINVAL; 677 676 678 677 return kvm_vgic_inject_irq(kvm, 0, irq_num, level);
+7 -1
arch/arm64/include/uapi/asm/kvm.h
··· 188 188 #define KVM_ARM_IRQ_CPU_IRQ 0 189 189 #define KVM_ARM_IRQ_CPU_FIQ 1 190 190 191 - /* Highest supported SPI, from VGIC_NR_IRQS */ 191 + /* 192 + * This used to hold the highest supported SPI, but it is now obsolete 193 + * and only here to provide source code level compatibility with older 194 + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. 195 + */ 196 + #ifndef __KERNEL__ 192 197 #define KVM_ARM_IRQ_GIC_MAX 127 198 + #endif 193 199 194 200 /* One single KVM irqchip, ie. the VGIC */ 195 201 #define KVM_NR_IRQCHIPS 1
+9 -2
arch/powerpc/include/asm/archrandom.h
··· 30 30 return !!ppc_md.get_random_long; 31 31 } 32 32 33 - int powernv_get_random_long(unsigned long *v); 34 - 35 33 static inline int arch_get_random_seed_long(unsigned long *v) 36 34 { 37 35 return 0; ··· 44 46 } 45 47 46 48 #endif /* CONFIG_ARCH_RANDOM */ 49 + 50 + #ifdef CONFIG_PPC_POWERNV 51 + int powernv_hwrng_present(void); 52 + int powernv_get_random_long(unsigned long *v); 53 + int powernv_get_random_real_mode(unsigned long *v); 54 + #else 55 + static inline int powernv_hwrng_present(void) { return 0; } 56 + static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; } 57 + #endif 47 58 48 59 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
+3
arch/powerpc/include/asm/kvm_book3s.h
··· 288 288 return !is_kvmppc_hv_enabled(vcpu->kvm); 289 289 } 290 290 291 + extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu); 292 + extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); 293 + 291 294 /* Magic register values loaded into r3 and r4 before the 'sc' assembly 292 295 * instruction for the OSI hypercalls */ 293 296 #define OSI_SC_MAGIC_R3 0x113724FA
+18
arch/powerpc/include/asm/kvm_book3s_64.h
··· 85 85 return old == 0; 86 86 } 87 87 88 + static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) 89 + { 90 + hpte_v &= ~HPTE_V_HVLOCK; 91 + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 92 + hpte[0] = cpu_to_be64(hpte_v); 93 + } 94 + 95 + /* Without barrier */ 96 + static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) 97 + { 98 + hpte_v &= ~HPTE_V_HVLOCK; 99 + hpte[0] = cpu_to_be64(hpte_v); 100 + } 101 + 88 102 static inline int __hpte_actual_psize(unsigned int lp, int psize) 89 103 { 90 104 int i, shift; ··· 437 423 { 438 424 return rcu_dereference_raw_notrace(kvm->memslots); 439 425 } 426 + 427 + extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); 428 + 429 + extern void kvmhv_rm_send_ipi(int cpu); 440 430 441 431 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 442 432
+33 -14
arch/powerpc/include/asm/kvm_host.h
··· 227 227 unsigned long host_sdr1; 228 228 int tlbie_lock; 229 229 unsigned long lpcr; 230 - unsigned long rmor; 231 - struct kvm_rma_info *rma; 232 230 unsigned long vrma_slb_v; 233 - int rma_setup_done; 231 + int hpte_setup_done; 234 232 u32 hpt_order; 235 233 atomic_t vcpus_running; 236 234 u32 online_vcores; ··· 237 239 atomic_t hpte_mod_interest; 238 240 cpumask_t need_tlb_flush; 239 241 int hpt_cma_alloc; 242 + struct dentry *debugfs_dir; 243 + struct dentry *htab_dentry; 240 244 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 241 245 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 242 246 struct mutex hpt_mutex; ··· 263 263 264 264 /* 265 265 * Struct for a virtual core. 266 - * Note: entry_exit_count combines an entry count in the bottom 8 bits 267 - * and an exit count in the next 8 bits. This is so that we can 268 - * atomically increment the entry count iff the exit count is 0 269 - * without taking the lock. 266 + * Note: entry_exit_map combines a bitmap of threads that have entered 267 + * in the bottom 8 bits and a bitmap of threads that have exited in the 268 + * next 8 bits. This is so that we can atomically set the entry bit 269 + * iff the exit map is 0 without taking a lock. 
270 270 */ 271 271 struct kvmppc_vcore { 272 272 int n_runnable; 273 - int n_busy; 274 273 int num_threads; 275 - int entry_exit_count; 276 - int n_woken; 277 - int nap_count; 274 + int entry_exit_map; 278 275 int napping_threads; 279 276 int first_vcpuid; 280 277 u16 pcpu; ··· 296 299 ulong conferring_threads; 297 300 }; 298 301 299 - #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) 300 - #define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) 302 + #define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff) 303 + #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) 304 + #define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) 301 305 302 306 /* Values for vcore_state */ 303 307 #define VCORE_INACTIVE 0 304 308 #define VCORE_SLEEPING 1 305 - #define VCORE_STARTING 2 309 + #define VCORE_PREEMPT 2 306 310 #define VCORE_RUNNING 3 307 311 #define VCORE_EXITING 4 308 312 ··· 364 366 bool tb : 1; /* 1TB segment */ 365 367 bool class : 1; 366 368 u8 base_page_size; /* MMU_PAGE_xxx */ 369 + }; 370 + 371 + /* Struct used to accumulate timing information in HV real mode code */ 372 + struct kvmhv_tb_accumulator { 373 + u64 seqcount; /* used to synchronize access, also count * 2 */ 374 + u64 tb_total; /* total time in timebase ticks */ 375 + u64 tb_min; /* min time */ 376 + u64 tb_max; /* max time */ 367 377 }; 368 378 369 379 # ifdef CONFIG_PPC_FSL_BOOK3E ··· 662 656 663 657 u32 emul_inst; 664 658 #endif 659 + 660 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 661 + struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */ 662 + u64 cur_tb_start; /* when it started */ 663 + struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ 664 + struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ 665 + struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ 666 + struct kvmhv_tb_accumulator guest_time; /* guest execution */ 667 + struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ 668 + 669 + struct dentry 
*debugfs_dir; 670 + struct dentry *debugfs_timings; 671 + #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 665 672 }; 666 673 667 674 #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
+2
arch/powerpc/include/asm/kvm_ppc.h
··· 302 302 return kvm->arch.kvm_ops == kvmppc_hv_ops; 303 303 } 304 304 305 + extern int kvmppc_hwrng_present(void); 306 + 305 307 /* 306 308 * Cuts out inst bits with ordering according to spec. 307 309 * That means the leftmost bit is zero. All given bits are included.
+3
arch/powerpc/include/asm/time.h
··· 211 211 212 212 DECLARE_PER_CPU(u64, decrementers_next_tb); 213 213 214 + /* Convert timebase ticks to nanoseconds */ 215 + unsigned long long tb_to_ns(unsigned long long tb_ticks); 216 + 214 217 #endif /* __KERNEL__ */ 215 218 #endif /* __POWERPC_TIME_H */
+17 -3
arch/powerpc/kernel/asm-offsets.c
··· 37 37 #include <asm/thread_info.h> 38 38 #include <asm/rtas.h> 39 39 #include <asm/vdso_datapage.h> 40 + #include <asm/dbell.h> 40 41 #ifdef CONFIG_PPC64 41 42 #include <asm/paca.h> 42 43 #include <asm/lppaca.h> ··· 460 459 DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); 461 460 DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); 462 461 #endif 462 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 463 + DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry)); 464 + DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr)); 465 + DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit)); 466 + DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time)); 467 + DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time)); 468 + DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity)); 469 + DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start)); 470 + DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount)); 471 + DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total)); 472 + DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min)); 473 + DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max)); 474 + #endif 463 475 DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); 464 476 DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); 465 477 DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); ··· 506 492 DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); 507 493 DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); 508 494 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 509 - DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 510 495 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); 511 496 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); 512 497 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); ··· 563 
550 DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); 564 551 DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); 565 552 DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); 566 - DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); 567 - DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); 553 + DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map)); 568 554 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); 569 555 DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); 570 556 DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); ··· 759 747 DEFINE(PACA_SUBCORE_SIBLING_MASK, 760 748 offsetof(struct paca_struct, subcore_sibling_mask)); 761 749 #endif 750 + 751 + DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); 762 752 763 753 return 0; 764 754 }
+6
arch/powerpc/kernel/time.c
··· 608 608 } 609 609 #endif 610 610 611 + unsigned long long tb_to_ns(unsigned long long ticks) 612 + { 613 + return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift; 614 + } 615 + EXPORT_SYMBOL_GPL(tb_to_ns); 616 + 611 617 /* 612 618 * Scheduler clock - returns current time in nanosec units. 613 619 *
+14
arch/powerpc/kvm/Kconfig
··· 110 110 processor, including emulating 32-bit processors on a 64-bit 111 111 host. 112 112 113 + config KVM_BOOK3S_HV_EXIT_TIMING 114 + bool "Detailed timing for hypervisor real-mode code" 115 + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS 116 + ---help--- 117 + Calculate time taken for each vcpu in the real-mode guest entry, 118 + exit, and interrupt handling code, plus time spent in the guest 119 + and in nap mode due to idle (cede) while other threads are still 120 + in the guest. The total, minimum and maximum times in nanoseconds 121 + together with the number of executions are reported in debugfs in 122 + kvm/vm#/vcpu#/timings. The overhead is of the order of 30 - 40 123 + ns per exit on POWER8. 124 + 125 + If unsure, say N. 126 + 113 127 config KVM_BOOKE_HV 114 128 bool 115 129
+76
arch/powerpc/kvm/book3s.c
··· 821 821 #endif 822 822 } 823 823 824 + int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) 825 + { 826 + unsigned long size = kvmppc_get_gpr(vcpu, 4); 827 + unsigned long addr = kvmppc_get_gpr(vcpu, 5); 828 + u64 buf; 829 + int ret; 830 + 831 + if (!is_power_of_2(size) || (size > sizeof(buf))) 832 + return H_TOO_HARD; 833 + 834 + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf); 835 + if (ret != 0) 836 + return H_TOO_HARD; 837 + 838 + switch (size) { 839 + case 1: 840 + kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf); 841 + break; 842 + 843 + case 2: 844 + kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf)); 845 + break; 846 + 847 + case 4: 848 + kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf)); 849 + break; 850 + 851 + case 8: 852 + kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf)); 853 + break; 854 + 855 + default: 856 + BUG(); 857 + } 858 + 859 + return H_SUCCESS; 860 + } 861 + EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load); 862 + 863 + int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) 864 + { 865 + unsigned long size = kvmppc_get_gpr(vcpu, 4); 866 + unsigned long addr = kvmppc_get_gpr(vcpu, 5); 867 + unsigned long val = kvmppc_get_gpr(vcpu, 6); 868 + u64 buf; 869 + int ret; 870 + 871 + switch (size) { 872 + case 1: 873 + *(u8 *)&buf = val; 874 + break; 875 + 876 + case 2: 877 + *(__be16 *)&buf = cpu_to_be16(val); 878 + break; 879 + 880 + case 4: 881 + *(__be32 *)&buf = cpu_to_be32(val); 882 + break; 883 + 884 + case 8: 885 + *(__be64 *)&buf = cpu_to_be64(val); 886 + break; 887 + 888 + default: 889 + return H_TOO_HARD; 890 + } 891 + 892 + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf); 893 + if (ret != 0) 894 + return H_TOO_HARD; 895 + 896 + return H_SUCCESS; 897 + } 898 + EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); 899 + 824 900 int kvmppc_core_check_processor_compat(void) 825 901 { 826 902 /*
+160 -29
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 27 27 #include <linux/srcu.h> 28 28 #include <linux/anon_inodes.h> 29 29 #include <linux/file.h> 30 + #include <linux/debugfs.h> 30 31 31 32 #include <asm/tlbflush.h> 32 33 #include <asm/kvm_ppc.h> ··· 117 116 long order; 118 117 119 118 mutex_lock(&kvm->lock); 120 - if (kvm->arch.rma_setup_done) { 121 - kvm->arch.rma_setup_done = 0; 122 - /* order rma_setup_done vs. vcpus_running */ 119 + if (kvm->arch.hpte_setup_done) { 120 + kvm->arch.hpte_setup_done = 0; 121 + /* order hpte_setup_done vs. vcpus_running */ 123 122 smp_mb(); 124 123 if (atomic_read(&kvm->arch.vcpus_running)) { 125 - kvm->arch.rma_setup_done = 1; 124 + kvm->arch.hpte_setup_done = 1; 126 125 goto out; 127 126 } 128 127 } ··· 339 338 v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 340 339 gr = kvm->arch.revmap[index].guest_rpte; 341 340 342 - /* Unlock the HPTE */ 343 - asm volatile("lwsync" : : : "memory"); 344 - hptep[0] = cpu_to_be64(v); 341 + unlock_hpte(hptep, v); 345 342 preempt_enable(); 346 343 347 344 gpte->eaddr = eaddr; ··· 468 469 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 469 470 hpte[1] = be64_to_cpu(hptep[1]); 470 471 hpte[2] = r = rev->guest_rpte; 471 - asm volatile("lwsync" : : : "memory"); 472 - hptep[0] = cpu_to_be64(hpte[0]); 472 + unlock_hpte(hptep, hpte[0]); 473 473 preempt_enable(); 474 474 475 475 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || ··· 619 621 620 622 hptep[1] = cpu_to_be64(r); 621 623 eieio(); 622 - hptep[0] = cpu_to_be64(hpte[0]); 624 + __unlock_hpte(hptep, hpte[0]); 623 625 asm volatile("ptesync" : : : "memory"); 624 626 preempt_enable(); 625 627 if (page && hpte_is_writable(r)) ··· 640 642 return ret; 641 643 642 644 out_unlock: 643 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 645 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 644 646 preempt_enable(); 645 647 goto out_put; 646 648 } ··· 769 771 } 770 772 } 771 773 unlock_rmap(rmapp); 772 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 774 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 773 775 } 774 776 return 
0; 775 777 } ··· 855 857 } 856 858 ret = 1; 857 859 } 858 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 860 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 859 861 } while ((i = j) != head); 860 862 861 863 unlock_rmap(rmapp); ··· 972 974 973 975 /* Now check and modify the HPTE */ 974 976 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 975 - /* unlock and continue */ 976 - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 977 + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 977 978 continue; 978 979 } 979 980 ··· 993 996 npages_dirty = n; 994 997 eieio(); 995 998 } 996 - v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); 999 + v &= ~HPTE_V_ABSENT; 997 1000 v |= HPTE_V_VALID; 998 - hptep[0] = cpu_to_be64(v); 1001 + __unlock_hpte(hptep, v); 999 1002 } while ((i = j) != head); 1000 1003 1001 1004 unlock_rmap(rmapp); ··· 1215 1218 r &= ~HPTE_GR_MODIFIED; 1216 1219 revp->guest_rpte = r; 1217 1220 } 1218 - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 1219 - hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 1221 + unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1220 1222 preempt_enable(); 1221 1223 if (!(valid == want_valid && (first_pass || dirty))) 1222 1224 ok = 0; ··· 1335 1339 unsigned long tmp[2]; 1336 1340 ssize_t nb; 1337 1341 long int err, ret; 1338 - int rma_setup; 1342 + int hpte_setup; 1339 1343 1340 1344 if (!access_ok(VERIFY_READ, buf, count)) 1341 1345 return -EFAULT; 1342 1346 1343 1347 /* lock out vcpus from running while we're doing this */ 1344 1348 mutex_lock(&kvm->lock); 1345 - rma_setup = kvm->arch.rma_setup_done; 1346 - if (rma_setup) { 1347 - kvm->arch.rma_setup_done = 0; /* temporarily */ 1348 - /* order rma_setup_done vs. vcpus_running */ 1349 + hpte_setup = kvm->arch.hpte_setup_done; 1350 + if (hpte_setup) { 1351 + kvm->arch.hpte_setup_done = 0; /* temporarily */ 1352 + /* order hpte_setup_done vs. 
vcpus_running */ 1349 1353 smp_mb(); 1350 1354 if (atomic_read(&kvm->arch.vcpus_running)) { 1351 - kvm->arch.rma_setup_done = 1; 1355 + kvm->arch.hpte_setup_done = 1; 1352 1356 mutex_unlock(&kvm->lock); 1353 1357 return -EBUSY; 1354 1358 } ··· 1401 1405 "r=%lx\n", ret, i, v, r); 1402 1406 goto out; 1403 1407 } 1404 - if (!rma_setup && is_vrma_hpte(v)) { 1408 + if (!hpte_setup && is_vrma_hpte(v)) { 1405 1409 unsigned long psize = hpte_base_page_size(v, r); 1406 1410 unsigned long senc = slb_pgsize_encoding(psize); 1407 1411 unsigned long lpcr; ··· 1410 1414 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1411 1415 lpcr = senc << (LPCR_VRMASD_SH - 4); 1412 1416 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 1413 - rma_setup = 1; 1417 + hpte_setup = 1; 1414 1418 } 1415 1419 ++i; 1416 1420 hptp += 2; ··· 1426 1430 } 1427 1431 1428 1432 out: 1429 - /* Order HPTE updates vs. rma_setup_done */ 1433 + /* Order HPTE updates vs. hpte_setup_done */ 1430 1434 smp_wmb(); 1431 - kvm->arch.rma_setup_done = rma_setup; 1435 + kvm->arch.hpte_setup_done = hpte_setup; 1432 1436 mutex_unlock(&kvm->lock); 1433 1437 1434 1438 if (err) ··· 1489 1493 } 1490 1494 1491 1495 return ret; 1496 + } 1497 + 1498 + struct debugfs_htab_state { 1499 + struct kvm *kvm; 1500 + struct mutex mutex; 1501 + unsigned long hpt_index; 1502 + int chars_left; 1503 + int buf_index; 1504 + char buf[64]; 1505 + }; 1506 + 1507 + static int debugfs_htab_open(struct inode *inode, struct file *file) 1508 + { 1509 + struct kvm *kvm = inode->i_private; 1510 + struct debugfs_htab_state *p; 1511 + 1512 + p = kzalloc(sizeof(*p), GFP_KERNEL); 1513 + if (!p) 1514 + return -ENOMEM; 1515 + 1516 + kvm_get_kvm(kvm); 1517 + p->kvm = kvm; 1518 + mutex_init(&p->mutex); 1519 + file->private_data = p; 1520 + 1521 + return nonseekable_open(inode, file); 1522 + } 1523 + 1524 + static int debugfs_htab_release(struct inode *inode, struct file *file) 1525 + { 1526 + struct debugfs_htab_state *p = file->private_data; 1527 + 1528 + kvm_put_kvm(p->kvm); 1529 
+ kfree(p); 1530 + return 0; 1531 + } 1532 + 1533 + static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 1534 + size_t len, loff_t *ppos) 1535 + { 1536 + struct debugfs_htab_state *p = file->private_data; 1537 + ssize_t ret, r; 1538 + unsigned long i, n; 1539 + unsigned long v, hr, gr; 1540 + struct kvm *kvm; 1541 + __be64 *hptp; 1542 + 1543 + ret = mutex_lock_interruptible(&p->mutex); 1544 + if (ret) 1545 + return ret; 1546 + 1547 + if (p->chars_left) { 1548 + n = p->chars_left; 1549 + if (n > len) 1550 + n = len; 1551 + r = copy_to_user(buf, p->buf + p->buf_index, n); 1552 + n -= r; 1553 + p->chars_left -= n; 1554 + p->buf_index += n; 1555 + buf += n; 1556 + len -= n; 1557 + ret = n; 1558 + if (r) { 1559 + if (!n) 1560 + ret = -EFAULT; 1561 + goto out; 1562 + } 1563 + } 1564 + 1565 + kvm = p->kvm; 1566 + i = p->hpt_index; 1567 + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 1568 + for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { 1569 + if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 1570 + continue; 1571 + 1572 + /* lock the HPTE so it's stable and read it */ 1573 + preempt_disable(); 1574 + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1575 + cpu_relax(); 1576 + v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 1577 + hr = be64_to_cpu(hptp[1]); 1578 + gr = kvm->arch.revmap[i].guest_rpte; 1579 + unlock_hpte(hptp, v); 1580 + preempt_enable(); 1581 + 1582 + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 1583 + continue; 1584 + 1585 + n = scnprintf(p->buf, sizeof(p->buf), 1586 + "%6lx %.16lx %.16lx %.16lx\n", 1587 + i, v, hr, gr); 1588 + p->chars_left = n; 1589 + if (n > len) 1590 + n = len; 1591 + r = copy_to_user(buf, p->buf, n); 1592 + n -= r; 1593 + p->chars_left -= n; 1594 + p->buf_index = n; 1595 + buf += n; 1596 + len -= n; 1597 + ret += n; 1598 + if (r) { 1599 + if (!ret) 1600 + ret = -EFAULT; 1601 + goto out; 1602 + } 1603 + } 1604 + p->hpt_index = i; 1605 + 1606 + out: 1607 + mutex_unlock(&p->mutex); 1608 + return 
ret; 1609 + } 1610 + 1611 + ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 1612 + size_t len, loff_t *ppos) 1613 + { 1614 + return -EACCES; 1615 + } 1616 + 1617 + static const struct file_operations debugfs_htab_fops = { 1618 + .owner = THIS_MODULE, 1619 + .open = debugfs_htab_open, 1620 + .release = debugfs_htab_release, 1621 + .read = debugfs_htab_read, 1622 + .write = debugfs_htab_write, 1623 + .llseek = generic_file_llseek, 1624 + }; 1625 + 1626 + void kvmppc_mmu_debugfs_init(struct kvm *kvm) 1627 + { 1628 + kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, 1629 + kvm->arch.debugfs_dir, kvm, 1630 + &debugfs_htab_fops); 1492 1631 } 1493 1632 1494 1633 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+325 -112
arch/powerpc/kvm/book3s_hv.c
··· 32 32 #include <linux/page-flags.h> 33 33 #include <linux/srcu.h> 34 34 #include <linux/miscdevice.h> 35 + #include <linux/debugfs.h> 35 36 36 37 #include <asm/reg.h> 37 38 #include <asm/cputable.h> ··· 51 50 #include <asm/hvcall.h> 52 51 #include <asm/switch_to.h> 53 52 #include <asm/smp.h> 53 + #include <asm/dbell.h> 54 54 #include <linux/gfp.h> 55 55 #include <linux/vmalloc.h> 56 56 #include <linux/highmem.h> ··· 85 83 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 86 84 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 87 85 86 + static bool kvmppc_ipi_thread(int cpu) 87 + { 88 + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ 89 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { 90 + preempt_disable(); 91 + if (cpu_first_thread_sibling(cpu) == 92 + cpu_first_thread_sibling(smp_processor_id())) { 93 + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 94 + msg |= cpu_thread_in_core(cpu); 95 + smp_mb(); 96 + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 97 + preempt_enable(); 98 + return true; 99 + } 100 + preempt_enable(); 101 + } 102 + 103 + #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 104 + if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) { 105 + xics_wake_cpu(cpu); 106 + return true; 107 + } 108 + #endif 109 + 110 + return false; 111 + } 112 + 88 113 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) 89 114 { 90 - int me; 91 115 int cpu = vcpu->cpu; 92 116 wait_queue_head_t *wqp; 93 117 ··· 123 95 ++vcpu->stat.halt_wakeup; 124 96 } 125 97 126 - me = get_cpu(); 98 + if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid)) 99 + return; 127 100 128 101 /* CPU points to the first thread of the core */ 129 - if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { 130 - #ifdef CONFIG_PPC_ICP_NATIVE 131 - int real_cpu = cpu + vcpu->arch.ptid; 132 - if (paca[real_cpu].kvm_hstate.xics_phys) 133 - xics_wake_cpu(real_cpu); 134 - else 135 - #endif 136 - if (cpu_online(cpu)) 137 - smp_send_reschedule(cpu); 138 
- } 139 - put_cpu(); 102 + if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) 103 + smp_send_reschedule(cpu); 140 104 } 141 105 142 106 /* ··· 726 706 727 707 /* Send the error out to userspace via KVM_RUN */ 728 708 return rc; 709 + case H_LOGICAL_CI_LOAD: 710 + ret = kvmppc_h_logical_ci_load(vcpu); 711 + if (ret == H_TOO_HARD) 712 + return RESUME_HOST; 713 + break; 714 + case H_LOGICAL_CI_STORE: 715 + ret = kvmppc_h_logical_ci_store(vcpu); 716 + if (ret == H_TOO_HARD) 717 + return RESUME_HOST; 718 + break; 729 719 case H_SET_MODE: 730 720 ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), 731 721 kvmppc_get_gpr(vcpu, 5), ··· 770 740 case H_CONFER: 771 741 case H_REGISTER_VPA: 772 742 case H_SET_MODE: 743 + case H_LOGICAL_CI_LOAD: 744 + case H_LOGICAL_CI_STORE: 773 745 #ifdef CONFIG_KVM_XICS 774 746 case H_XIRR: 775 747 case H_CPPR: ··· 1442 1410 return vcore; 1443 1411 } 1444 1412 1413 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1414 + static struct debugfs_timings_element { 1415 + const char *name; 1416 + size_t offset; 1417 + } timings[] = { 1418 + {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, 1419 + {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, 1420 + {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, 1421 + {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, 1422 + {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, 1423 + }; 1424 + 1425 + #define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) 1426 + 1427 + struct debugfs_timings_state { 1428 + struct kvm_vcpu *vcpu; 1429 + unsigned int buflen; 1430 + char buf[N_TIMINGS * 100]; 1431 + }; 1432 + 1433 + static int debugfs_timings_open(struct inode *inode, struct file *file) 1434 + { 1435 + struct kvm_vcpu *vcpu = inode->i_private; 1436 + struct debugfs_timings_state *p; 1437 + 1438 + p = kzalloc(sizeof(*p), GFP_KERNEL); 1439 + if (!p) 1440 + return -ENOMEM; 1441 + 1442 + kvm_get_kvm(vcpu->kvm); 1443 + p->vcpu = vcpu; 1444 + file->private_data = p; 1445 + 1446 + return 
nonseekable_open(inode, file); 1447 + } 1448 + 1449 + static int debugfs_timings_release(struct inode *inode, struct file *file) 1450 + { 1451 + struct debugfs_timings_state *p = file->private_data; 1452 + 1453 + kvm_put_kvm(p->vcpu->kvm); 1454 + kfree(p); 1455 + return 0; 1456 + } 1457 + 1458 + static ssize_t debugfs_timings_read(struct file *file, char __user *buf, 1459 + size_t len, loff_t *ppos) 1460 + { 1461 + struct debugfs_timings_state *p = file->private_data; 1462 + struct kvm_vcpu *vcpu = p->vcpu; 1463 + char *s, *buf_end; 1464 + struct kvmhv_tb_accumulator tb; 1465 + u64 count; 1466 + loff_t pos; 1467 + ssize_t n; 1468 + int i, loops; 1469 + bool ok; 1470 + 1471 + if (!p->buflen) { 1472 + s = p->buf; 1473 + buf_end = s + sizeof(p->buf); 1474 + for (i = 0; i < N_TIMINGS; ++i) { 1475 + struct kvmhv_tb_accumulator *acc; 1476 + 1477 + acc = (struct kvmhv_tb_accumulator *) 1478 + ((unsigned long)vcpu + timings[i].offset); 1479 + ok = false; 1480 + for (loops = 0; loops < 1000; ++loops) { 1481 + count = acc->seqcount; 1482 + if (!(count & 1)) { 1483 + smp_rmb(); 1484 + tb = *acc; 1485 + smp_rmb(); 1486 + if (count == acc->seqcount) { 1487 + ok = true; 1488 + break; 1489 + } 1490 + } 1491 + udelay(1); 1492 + } 1493 + if (!ok) 1494 + snprintf(s, buf_end - s, "%s: stuck\n", 1495 + timings[i].name); 1496 + else 1497 + snprintf(s, buf_end - s, 1498 + "%s: %llu %llu %llu %llu\n", 1499 + timings[i].name, count / 2, 1500 + tb_to_ns(tb.tb_total), 1501 + tb_to_ns(tb.tb_min), 1502 + tb_to_ns(tb.tb_max)); 1503 + s += strlen(s); 1504 + } 1505 + p->buflen = s - p->buf; 1506 + } 1507 + 1508 + pos = *ppos; 1509 + if (pos >= p->buflen) 1510 + return 0; 1511 + if (len > p->buflen - pos) 1512 + len = p->buflen - pos; 1513 + n = copy_to_user(buf, p->buf + pos, len); 1514 + if (n) { 1515 + if (n == len) 1516 + return -EFAULT; 1517 + len -= n; 1518 + } 1519 + *ppos = pos + len; 1520 + return len; 1521 + } 1522 + 1523 + static ssize_t debugfs_timings_write(struct file *file, const 
char __user *buf, 1524 + size_t len, loff_t *ppos) 1525 + { 1526 + return -EACCES; 1527 + } 1528 + 1529 + static const struct file_operations debugfs_timings_ops = { 1530 + .owner = THIS_MODULE, 1531 + .open = debugfs_timings_open, 1532 + .release = debugfs_timings_release, 1533 + .read = debugfs_timings_read, 1534 + .write = debugfs_timings_write, 1535 + .llseek = generic_file_llseek, 1536 + }; 1537 + 1538 + /* Create a debugfs directory for the vcpu */ 1539 + static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1540 + { 1541 + char buf[16]; 1542 + struct kvm *kvm = vcpu->kvm; 1543 + 1544 + snprintf(buf, sizeof(buf), "vcpu%u", id); 1545 + if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 1546 + return; 1547 + vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); 1548 + if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) 1549 + return; 1550 + vcpu->arch.debugfs_timings = 1551 + debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, 1552 + vcpu, &debugfs_timings_ops); 1553 + } 1554 + 1555 + #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1556 + static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) 1557 + { 1558 + } 1559 + #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ 1560 + 1445 1561 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, 1446 1562 unsigned int id) 1447 1563 { ··· 1658 1478 1659 1479 vcpu->arch.cpu_type = KVM_CPU_3S_64; 1660 1480 kvmppc_sanity_check(vcpu); 1481 + 1482 + debugfs_vcpu_init(vcpu, id); 1661 1483 1662 1484 return vcpu; 1663 1485 ··· 1748 1566 tpaca = &paca[cpu]; 1749 1567 1750 1568 /* Ensure the thread won't go into the kernel if it wakes */ 1751 - tpaca->kvm_hstate.hwthread_req = 1; 1752 1569 tpaca->kvm_hstate.kvm_vcpu = NULL; 1570 + tpaca->kvm_hstate.napping = 0; 1571 + smp_wmb(); 1572 + tpaca->kvm_hstate.hwthread_req = 1; 1753 1573 1754 1574 /* 1755 1575 * If the thread is already executing in the kernel (e.g. 
handling ··· 1794 1610 } 1795 1611 cpu = vc->pcpu + vcpu->arch.ptid; 1796 1612 tpaca = &paca[cpu]; 1797 - tpaca->kvm_hstate.kvm_vcpu = vcpu; 1798 1613 tpaca->kvm_hstate.kvm_vcore = vc; 1799 1614 tpaca->kvm_hstate.ptid = vcpu->arch.ptid; 1800 1615 vcpu->cpu = vc->pcpu; 1616 + /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ 1801 1617 smp_wmb(); 1802 - #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 1803 - if (cpu != smp_processor_id()) { 1804 - xics_wake_cpu(cpu); 1805 - if (vcpu->arch.ptid) 1806 - ++vc->n_woken; 1807 - } 1808 - #endif 1618 + tpaca->kvm_hstate.kvm_vcpu = vcpu; 1619 + if (cpu != smp_processor_id()) 1620 + kvmppc_ipi_thread(cpu); 1809 1621 } 1810 1622 1811 - static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) 1623 + static void kvmppc_wait_for_nap(void) 1812 1624 { 1813 - int i; 1625 + int cpu = smp_processor_id(); 1626 + int i, loops; 1814 1627 1815 - HMT_low(); 1816 - i = 0; 1817 - while (vc->nap_count < vc->n_woken) { 1818 - if (++i >= 1000000) { 1819 - pr_err("kvmppc_wait_for_nap timeout %d %d\n", 1820 - vc->nap_count, vc->n_woken); 1821 - break; 1628 + for (loops = 0; loops < 1000000; ++loops) { 1629 + /* 1630 + * Check if all threads are finished. 1631 + * We set the vcpu pointer when starting a thread 1632 + * and the thread clears it when finished, so we look 1633 + * for any threads that still have a non-NULL vcpu ptr. 
1634 + */ 1635 + for (i = 1; i < threads_per_subcore; ++i) 1636 + if (paca[cpu + i].kvm_hstate.kvm_vcpu) 1637 + break; 1638 + if (i == threads_per_subcore) { 1639 + HMT_medium(); 1640 + return; 1822 1641 } 1823 - cpu_relax(); 1642 + HMT_low(); 1824 1643 } 1825 1644 HMT_medium(); 1645 + for (i = 1; i < threads_per_subcore; ++i) 1646 + if (paca[cpu + i].kvm_hstate.kvm_vcpu) 1647 + pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); 1826 1648 } 1827 1649 1828 1650 /* ··· 1890 1700 mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE); 1891 1701 } 1892 1702 1703 + static void prepare_threads(struct kvmppc_vcore *vc) 1704 + { 1705 + struct kvm_vcpu *vcpu, *vnext; 1706 + 1707 + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1708 + arch.run_list) { 1709 + if (signal_pending(vcpu->arch.run_task)) 1710 + vcpu->arch.ret = -EINTR; 1711 + else if (vcpu->arch.vpa.update_pending || 1712 + vcpu->arch.slb_shadow.update_pending || 1713 + vcpu->arch.dtl.update_pending) 1714 + vcpu->arch.ret = RESUME_GUEST; 1715 + else 1716 + continue; 1717 + kvmppc_remove_runnable(vc, vcpu); 1718 + wake_up(&vcpu->arch.cpu_run); 1719 + } 1720 + } 1721 + 1722 + static void post_guest_process(struct kvmppc_vcore *vc) 1723 + { 1724 + u64 now; 1725 + long ret; 1726 + struct kvm_vcpu *vcpu, *vnext; 1727 + 1728 + now = get_tb(); 1729 + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1730 + arch.run_list) { 1731 + /* cancel pending dec exception if dec is positive */ 1732 + if (now < vcpu->arch.dec_expires && 1733 + kvmppc_core_pending_dec(vcpu)) 1734 + kvmppc_core_dequeue_dec(vcpu); 1735 + 1736 + trace_kvm_guest_exit(vcpu); 1737 + 1738 + ret = RESUME_GUEST; 1739 + if (vcpu->arch.trap) 1740 + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, 1741 + vcpu->arch.run_task); 1742 + 1743 + vcpu->arch.ret = ret; 1744 + vcpu->arch.trap = 0; 1745 + 1746 + if (vcpu->arch.ceded) { 1747 + if (!is_kvmppc_resume_guest(ret)) 1748 + kvmppc_end_cede(vcpu); 1749 + else 1750 + 
kvmppc_set_timer(vcpu); 1751 + } 1752 + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { 1753 + kvmppc_remove_runnable(vc, vcpu); 1754 + wake_up(&vcpu->arch.cpu_run); 1755 + } 1756 + } 1757 + } 1758 + 1893 1759 /* 1894 1760 * Run a set of guest threads on a physical core. 1895 1761 * Called with vc->lock held. 1896 1762 */ 1897 - static void kvmppc_run_core(struct kvmppc_vcore *vc) 1763 + static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) 1898 1764 { 1899 - struct kvm_vcpu *vcpu, *vnext; 1900 - long ret; 1901 - u64 now; 1902 - int i, need_vpa_update; 1765 + struct kvm_vcpu *vcpu; 1766 + int i; 1903 1767 int srcu_idx; 1904 - struct kvm_vcpu *vcpus_to_update[threads_per_core]; 1905 - 1906 - /* don't start if any threads have a signal pending */ 1907 - need_vpa_update = 0; 1908 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1909 - if (signal_pending(vcpu->arch.run_task)) 1910 - return; 1911 - if (vcpu->arch.vpa.update_pending || 1912 - vcpu->arch.slb_shadow.update_pending || 1913 - vcpu->arch.dtl.update_pending) 1914 - vcpus_to_update[need_vpa_update++] = vcpu; 1915 - } 1916 1768 1917 1769 /* 1918 - * Initialize *vc, in particular vc->vcore_state, so we can 1919 - * drop the vcore lock if necessary. 1770 + * Remove from the list any threads that have a signal pending 1771 + * or need a VPA update done 1920 1772 */ 1921 - vc->n_woken = 0; 1922 - vc->nap_count = 0; 1923 - vc->entry_exit_count = 0; 1773 + prepare_threads(vc); 1774 + 1775 + /* if the runner is no longer runnable, let the caller pick a new one */ 1776 + if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) 1777 + return; 1778 + 1779 + /* 1780 + * Initialize *vc. 
1781 + */ 1782 + vc->entry_exit_map = 0; 1924 1783 vc->preempt_tb = TB_NIL; 1925 - vc->vcore_state = VCORE_STARTING; 1926 1784 vc->in_guest = 0; 1927 1785 vc->napping_threads = 0; 1928 1786 vc->conferring_threads = 0; 1929 - 1930 - /* 1931 - * Updating any of the vpas requires calling kvmppc_pin_guest_page, 1932 - * which can't be called with any spinlocks held. 1933 - */ 1934 - if (need_vpa_update) { 1935 - spin_unlock(&vc->lock); 1936 - for (i = 0; i < need_vpa_update; ++i) 1937 - kvmppc_update_vpas(vcpus_to_update[i]); 1938 - spin_lock(&vc->lock); 1939 - } 1940 1787 1941 1788 /* 1942 1789 * Make sure we are running on primary threads, and that secondary ··· 1982 1755 */ 1983 1756 if ((threads_per_core > 1) && 1984 1757 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { 1985 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1758 + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1986 1759 vcpu->arch.ret = -EBUSY; 1760 + kvmppc_remove_runnable(vc, vcpu); 1761 + wake_up(&vcpu->arch.cpu_run); 1762 + } 1987 1763 goto out; 1988 1764 } 1989 1765 ··· 2027 1797 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 2028 1798 vcpu->cpu = -1; 2029 1799 /* wait for secondary threads to finish writing their state to memory */ 2030 - if (vc->nap_count < vc->n_woken) 2031 - kvmppc_wait_for_nap(vc); 1800 + kvmppc_wait_for_nap(); 2032 1801 for (i = 0; i < threads_per_subcore; ++i) 2033 1802 kvmppc_release_hwthread(vc->pcpu + i); 2034 1803 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ ··· 2041 1812 kvm_guest_exit(); 2042 1813 2043 1814 preempt_enable(); 2044 - cond_resched(); 2045 1815 2046 1816 spin_lock(&vc->lock); 2047 - now = get_tb(); 2048 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 2049 - /* cancel pending dec exception if dec is positive */ 2050 - if (now < vcpu->arch.dec_expires && 2051 - kvmppc_core_pending_dec(vcpu)) 2052 - kvmppc_core_dequeue_dec(vcpu); 2053 - 
2054 - trace_kvm_guest_exit(vcpu); 2055 - 2056 - ret = RESUME_GUEST; 2057 - if (vcpu->arch.trap) 2058 - ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, 2059 - vcpu->arch.run_task); 2060 - 2061 - vcpu->arch.ret = ret; 2062 - vcpu->arch.trap = 0; 2063 - 2064 - if (vcpu->arch.ceded) { 2065 - if (!is_kvmppc_resume_guest(ret)) 2066 - kvmppc_end_cede(vcpu); 2067 - else 2068 - kvmppc_set_timer(vcpu); 2069 - } 2070 - } 1817 + post_guest_process(vc); 2071 1818 2072 1819 out: 2073 1820 vc->vcore_state = VCORE_INACTIVE; 2074 - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 2075 - arch.run_list) { 2076 - if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { 2077 - kvmppc_remove_runnable(vc, vcpu); 2078 - wake_up(&vcpu->arch.cpu_run); 2079 - } 2080 - } 2081 - 2082 1821 trace_kvmppc_run_core(vc, 1); 2083 1822 } 2084 1823 ··· 2136 1939 * this thread straight away and have it join in. 2137 1940 */ 2138 1941 if (!signal_pending(current)) { 2139 - if (vc->vcore_state == VCORE_RUNNING && 2140 - VCORE_EXIT_COUNT(vc) == 0) { 1942 + if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { 2141 1943 kvmppc_create_dtl_entry(vcpu, vc); 2142 1944 kvmppc_start_thread(vcpu); 2143 1945 trace_kvm_guest_enter(vcpu); ··· 2167 1971 } 2168 1972 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 2169 1973 break; 2170 - vc->runner = vcpu; 2171 1974 n_ceded = 0; 2172 1975 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { 2173 1976 if (!v->arch.pending_exceptions) ··· 2174 1979 else 2175 1980 v->arch.ceded = 0; 2176 1981 } 2177 - if (n_ceded == vc->n_runnable) 1982 + vc->runner = vcpu; 1983 + if (n_ceded == vc->n_runnable) { 2178 1984 kvmppc_vcore_blocked(vc); 2179 - else 1985 + } else if (should_resched()) { 1986 + vc->vcore_state = VCORE_PREEMPT; 1987 + /* Let something else run */ 1988 + cond_resched_lock(&vc->lock); 1989 + vc->vcore_state = VCORE_INACTIVE; 1990 + } else { 2180 1991 kvmppc_run_core(vc); 1992 + } 2181 1993 vc->runner = NULL; 2182 1994 } 
2183 1995 ··· 2234 2032 } 2235 2033 2236 2034 atomic_inc(&vcpu->kvm->arch.vcpus_running); 2237 - /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ 2035 + /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ 2238 2036 smp_mb(); 2239 2037 2240 2038 /* On the first time here, set up HTAB and VRMA */ 2241 - if (!vcpu->kvm->arch.rma_setup_done) { 2039 + if (!vcpu->kvm->arch.hpte_setup_done) { 2242 2040 r = kvmppc_hv_setup_htab_rma(vcpu); 2243 2041 if (r) 2244 2042 goto out; ··· 2440 2238 int srcu_idx; 2441 2239 2442 2240 mutex_lock(&kvm->lock); 2443 - if (kvm->arch.rma_setup_done) 2241 + if (kvm->arch.hpte_setup_done) 2444 2242 goto out; /* another vcpu beat us to it */ 2445 2243 2446 2244 /* Allocate hashed page table (if not done already) and reset it */ ··· 2491 2289 2492 2290 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 2493 2291 2494 - /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ 2292 + /* Order updates to kvm->arch.lpcr etc. vs. 
hpte_setup_done */ 2495 2293 smp_wmb(); 2496 - kvm->arch.rma_setup_done = 1; 2294 + kvm->arch.hpte_setup_done = 1; 2497 2295 err = 0; 2498 2296 out_srcu: 2499 2297 srcu_read_unlock(&kvm->srcu, srcu_idx); ··· 2509 2307 static int kvmppc_core_init_vm_hv(struct kvm *kvm) 2510 2308 { 2511 2309 unsigned long lpcr, lpid; 2310 + char buf[32]; 2512 2311 2513 2312 /* Allocate the guest's logical partition ID */ 2514 2313 ··· 2550 2347 */ 2551 2348 kvm_hv_vm_activated(); 2552 2349 2350 + /* 2351 + * Create a debugfs directory for the VM 2352 + */ 2353 + snprintf(buf, sizeof(buf), "vm%d", current->pid); 2354 + kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); 2355 + if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) 2356 + kvmppc_mmu_debugfs_init(kvm); 2357 + 2553 2358 return 0; 2554 2359 } 2555 2360 ··· 2578 2367 2579 2368 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) 2580 2369 { 2370 + debugfs_remove_recursive(kvm->arch.debugfs_dir); 2371 + 2581 2372 kvm_hv_vm_deactivated(); 2582 2373 2583 2374 kvmppc_free_vcores(kvm);
+95 -5
arch/powerpc/kvm/book3s_hv_builtin.c
··· 21 21 #include <asm/cputable.h> 22 22 #include <asm/kvm_ppc.h> 23 23 #include <asm/kvm_book3s.h> 24 + #include <asm/archrandom.h> 25 + #include <asm/xics.h> 26 + #include <asm/dbell.h> 27 + #include <asm/cputhreads.h> 24 28 25 29 #define KVM_CMA_CHUNK_ORDER 18 26 30 ··· 118 114 int rv = H_SUCCESS; /* => don't yield */ 119 115 120 116 set_bit(vcpu->arch.ptid, &vc->conferring_threads); 121 - while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) { 122 - threads_running = VCORE_ENTRY_COUNT(vc); 123 - threads_ceded = hweight32(vc->napping_threads); 124 - threads_conferring = hweight32(vc->conferring_threads); 125 - if (threads_ceded + threads_conferring >= threads_running) { 117 + while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) { 118 + threads_running = VCORE_ENTRY_MAP(vc); 119 + threads_ceded = vc->napping_threads; 120 + threads_conferring = vc->conferring_threads; 121 + if ((threads_ceded | threads_conferring) == threads_running) { 126 122 rv = H_TOO_HARD; /* => do yield */ 127 123 break; 128 124 } ··· 173 169 return 0; 174 170 } 175 171 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); 172 + 173 + int kvmppc_hwrng_present(void) 174 + { 175 + return powernv_hwrng_present(); 176 + } 177 + EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); 178 + 179 + long kvmppc_h_random(struct kvm_vcpu *vcpu) 180 + { 181 + if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) 182 + return H_SUCCESS; 183 + 184 + return H_HARDWARE; 185 + } 186 + 187 + static inline void rm_writeb(unsigned long paddr, u8 val) 188 + { 189 + __asm__ __volatile__("stbcix %0,0,%1" 190 + : : "r" (val), "r" (paddr) : "memory"); 191 + } 192 + 193 + /* 194 + * Send an interrupt or message to another CPU. 195 + * This can only be called in real mode. 196 + * The caller needs to include any barrier needed to order writes 197 + * to memory vs. the IPI/message. 
198 + */ 199 + void kvmhv_rm_send_ipi(int cpu) 200 + { 201 + unsigned long xics_phys; 202 + 203 + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ 204 + if (cpu_has_feature(CPU_FTR_ARCH_207S) && 205 + cpu_first_thread_sibling(cpu) == 206 + cpu_first_thread_sibling(raw_smp_processor_id())) { 207 + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 208 + msg |= cpu_thread_in_core(cpu); 209 + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 210 + return; 211 + } 212 + 213 + /* Else poke the target with an IPI */ 214 + xics_phys = paca[cpu].kvm_hstate.xics_phys; 215 + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 216 + } 217 + 218 + /* 219 + * The following functions are called from the assembly code 220 + * in book3s_hv_rmhandlers.S. 221 + */ 222 + static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active) 223 + { 224 + int cpu = vc->pcpu; 225 + 226 + /* Order setting of exit map vs. msgsnd/IPI */ 227 + smp_mb(); 228 + for (; active; active >>= 1, ++cpu) 229 + if (active & 1) 230 + kvmhv_rm_send_ipi(cpu); 231 + } 232 + 233 + void kvmhv_commence_exit(int trap) 234 + { 235 + struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; 236 + int ptid = local_paca->kvm_hstate.ptid; 237 + int me, ee; 238 + 239 + /* Set our bit in the threads-exiting-guest map in the 0xff00 240 + bits of vcore->entry_exit_map */ 241 + me = 0x100 << ptid; 242 + do { 243 + ee = vc->entry_exit_map; 244 + } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee); 245 + 246 + /* Are we the first here? */ 247 + if ((ee >> 8) != 0) 248 + return; 249 + 250 + /* 251 + * Trigger the other threads in this vcore to exit the guest. 252 + * If this is a hypervisor decrementer interrupt then they 253 + * will be already on their way out of the guest. 254 + */ 255 + if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER) 256 + kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid)); 257 + }
+9 -16
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 150 150 return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); 151 151 } 152 152 153 - static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) 154 - { 155 - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 156 - hpte[0] = cpu_to_be64(hpte_v); 157 - } 158 - 159 153 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, 160 154 long pte_index, unsigned long pteh, unsigned long ptel, 161 155 pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) ··· 265 271 u64 pte; 266 272 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 267 273 cpu_relax(); 268 - pte = be64_to_cpu(*hpte); 274 + pte = be64_to_cpu(hpte[0]); 269 275 if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT))) 270 276 break; 271 - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); 277 + __unlock_hpte(hpte, pte); 272 278 hpte += 2; 273 279 } 274 280 if (i == 8) ··· 284 290 285 291 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 286 292 cpu_relax(); 287 - pte = be64_to_cpu(*hpte); 293 + pte = be64_to_cpu(hpte[0]); 288 294 if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 289 - *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK); 295 + __unlock_hpte(hpte, pte); 290 296 return H_PTEG_FULL; 291 297 } 292 298 } ··· 325 331 326 332 /* Write the first HPTE dword, unlocking the HPTE and making it valid */ 327 333 eieio(); 328 - hpte[0] = cpu_to_be64(pteh); 334 + __unlock_hpte(hpte, pteh); 329 335 asm volatile("ptesync" : : : "memory"); 330 336 331 337 *pte_idx_ret = pte_index; ··· 406 412 if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || 407 413 ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || 408 414 ((flags & H_ANDCOND) && (pte & avpn) != 0)) { 409 - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 415 + __unlock_hpte(hpte, pte); 410 416 return H_NOT_FOUND; 411 417 } 412 418 ··· 542 548 be64_to_cpu(hp[0]), be64_to_cpu(hp[1])); 543 549 rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); 544 550 args[j] |= rcbits << (56 - 5); 545 - hp[0] = 0; 551 + __unlock_hpte(hp, 0); 546 552 } 547 553 } 548 554 ··· 568 574 pte = be64_to_cpu(hpte[0]); 569 
575 if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || 570 576 ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { 571 - hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); 577 + __unlock_hpte(hpte, pte); 572 578 return H_NOT_FOUND; 573 579 } 574 580 ··· 749 755 /* Return with the HPTE still locked */ 750 756 return (hash << 3) + (i >> 1); 751 757 752 - /* Unlock and move on */ 753 - hpte[i] = cpu_to_be64(v); 758 + __unlock_hpte(&hpte[i], v); 754 759 } 755 760 756 761 if (val & HPTE_V_SECONDARY)
+217 -21
arch/powerpc/kvm/book3s_hv_rm_xics.c
··· 23 23 24 24 #define DEBUG_PASSUP 25 25 26 - static inline void rm_writeb(unsigned long paddr, u8 val) 26 + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 27 + u32 new_irq); 28 + 29 + /* -- ICS routines -- */ 30 + static void ics_rm_check_resend(struct kvmppc_xics *xics, 31 + struct kvmppc_ics *ics, struct kvmppc_icp *icp) 27 32 { 28 - __asm__ __volatile__("sync; stbcix %0,0,%1" 29 - : : "r" (val), "r" (paddr) : "memory"); 33 + int i; 34 + 35 + arch_spin_lock(&ics->lock); 36 + 37 + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 38 + struct ics_irq_state *state = &ics->irq_state[i]; 39 + 40 + if (!state->resend) 41 + continue; 42 + 43 + arch_spin_unlock(&ics->lock); 44 + icp_rm_deliver_irq(xics, icp, state->number); 45 + arch_spin_lock(&ics->lock); 46 + } 47 + 48 + arch_spin_unlock(&ics->lock); 30 49 } 50 + 51 + /* -- ICP routines -- */ 31 52 32 53 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, 33 54 struct kvm_vcpu *this_vcpu) 34 55 { 35 56 struct kvmppc_icp *this_icp = this_vcpu->arch.icp; 36 - unsigned long xics_phys; 37 57 int cpu; 38 58 39 59 /* Mark the target VCPU as having an interrupt pending */ ··· 76 56 /* In SMT cpu will always point to thread 0, we adjust it */ 77 57 cpu += vcpu->arch.ptid; 78 58 79 - /* Not too hard, then poke the target */ 80 - xics_phys = paca[cpu].kvm_hstate.xics_phys; 81 - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 59 + smp_mb(); 60 + kvmhv_rm_send_ipi(cpu); 82 61 } 83 62 84 63 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) ··· 133 114 struct kvmppc_icp *icp) 134 115 { 135 116 return (xics->real_mode_dbg || icp->rm_action) ? 
H_TOO_HARD : H_SUCCESS; 117 + } 118 + 119 + static void icp_rm_check_resend(struct kvmppc_xics *xics, 120 + struct kvmppc_icp *icp) 121 + { 122 + u32 icsid; 123 + 124 + /* Order this load with the test for need_resend in the caller */ 125 + smp_rmb(); 126 + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { 127 + struct kvmppc_ics *ics = xics->ics[icsid]; 128 + 129 + if (!test_and_clear_bit(icsid, icp->resend_map)) 130 + continue; 131 + if (!ics) 132 + continue; 133 + ics_rm_check_resend(xics, ics, icp); 134 + } 135 + } 136 + 137 + static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, 138 + u32 *reject) 139 + { 140 + union kvmppc_icp_state old_state, new_state; 141 + bool success; 142 + 143 + do { 144 + old_state = new_state = READ_ONCE(icp->state); 145 + 146 + *reject = 0; 147 + 148 + /* See if we can deliver */ 149 + success = new_state.cppr > priority && 150 + new_state.mfrr > priority && 151 + new_state.pending_pri > priority; 152 + 153 + /* 154 + * If we can, check for a rejection and perform the 155 + * delivery 156 + */ 157 + if (success) { 158 + *reject = new_state.xisr; 159 + new_state.xisr = irq; 160 + new_state.pending_pri = priority; 161 + } else { 162 + /* 163 + * If we failed to deliver we set need_resend 164 + * so a subsequent CPPR state change causes us 165 + * to try a new delivery. 166 + */ 167 + new_state.need_resend = true; 168 + } 169 + 170 + } while (!icp_rm_try_update(icp, old_state, new_state)); 171 + 172 + return success; 173 + } 174 + 175 + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 176 + u32 new_irq) 177 + { 178 + struct ics_irq_state *state; 179 + struct kvmppc_ics *ics; 180 + u32 reject; 181 + u16 src; 182 + 183 + /* 184 + * This is used both for initial delivery of an interrupt and 185 + * for subsequent rejection. 186 + * 187 + * Rejection can be racy vs. resends. 
We have evaluated the 188 + * rejection in an atomic ICP transaction which is now complete, 189 + * so potentially the ICP can already accept the interrupt again. 190 + * 191 + * So we need to retry the delivery. Essentially the reject path 192 + * boils down to a failed delivery. Always. 193 + * 194 + * Now the interrupt could also have moved to a different target, 195 + * thus we may need to re-do the ICP lookup as well 196 + */ 197 + 198 + again: 199 + /* Get the ICS state and lock it */ 200 + ics = kvmppc_xics_find_ics(xics, new_irq, &src); 201 + if (!ics) { 202 + /* Unsafe increment, but this does not need to be accurate */ 203 + xics->err_noics++; 204 + return; 205 + } 206 + state = &ics->irq_state[src]; 207 + 208 + /* Get a lock on the ICS */ 209 + arch_spin_lock(&ics->lock); 210 + 211 + /* Get our server */ 212 + if (!icp || state->server != icp->server_num) { 213 + icp = kvmppc_xics_find_server(xics->kvm, state->server); 214 + if (!icp) { 215 + /* Unsafe increment again*/ 216 + xics->err_noicp++; 217 + goto out; 218 + } 219 + } 220 + 221 + /* Clear the resend bit of that interrupt */ 222 + state->resend = 0; 223 + 224 + /* 225 + * If masked, bail out 226 + * 227 + * Note: PAPR doesn't mention anything about masked pending 228 + * when doing a resend, only when doing a delivery. 229 + * 230 + * However that would have the effect of losing a masked 231 + * interrupt that was rejected and isn't consistent with 232 + * the whole masked_pending business which is about not 233 + * losing interrupts that occur while masked. 234 + * 235 + * I don't differentiate normal deliveries and resends, this 236 + * implementation will differ from PAPR and not lose such 237 + * interrupts. 238 + */ 239 + if (state->priority == MASKED) { 240 + state->masked_pending = 1; 241 + goto out; 242 + } 243 + 244 + /* 245 + * Try the delivery, this will set the need_resend flag 246 + * in the ICP as part of the atomic transaction if the 247 + * delivery is not possible. 
248 + * 249 + * Note that if successful, the new delivery might have itself 250 + * rejected an interrupt that was "delivered" before we took the 251 + * ics spin lock. 252 + * 253 + * In this case we do the whole sequence all over again for the 254 + * new guy. We cannot assume that the rejected interrupt is less 255 + * favored than the new one, and thus doesn't need to be delivered, 256 + * because by the time we exit icp_rm_try_to_deliver() the target 257 + * processor may well have already consumed & completed it, and thus 258 + * the rejected interrupt might actually be already acceptable. 259 + */ 260 + if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) { 261 + /* 262 + * Delivery was successful, did we reject somebody else ? 263 + */ 264 + if (reject && reject != XICS_IPI) { 265 + arch_spin_unlock(&ics->lock); 266 + new_irq = reject; 267 + goto again; 268 + } 269 + } else { 270 + /* 271 + * We failed to deliver the interrupt we need to set the 272 + * resend map bit and mark the ICS state as needing a resend 273 + */ 274 + set_bit(ics->icsid, icp->resend_map); 275 + state->resend = 1; 276 + 277 + /* 278 + * If the need_resend flag got cleared in the ICP some time 279 + * between icp_rm_try_to_deliver() atomic update and now, then 280 + * we know it might have missed the resend_map bit. So we 281 + * retry 282 + */ 283 + smp_mb(); 284 + if (!icp->state.need_resend) { 285 + arch_spin_unlock(&ics->lock); 286 + goto again; 287 + } 288 + } 289 + out: 290 + arch_spin_unlock(&ics->lock); 136 291 } 137 292 138 293 static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, ··· 377 184 * separately here as well. 
378 185 */ 379 186 if (resend) { 380 - icp->rm_action |= XICS_RM_CHECK_RESEND; 381 - icp->rm_resend_icp = icp; 187 + icp->n_check_resend++; 188 + icp_rm_check_resend(xics, icp); 382 189 } 383 190 } 384 191 ··· 493 300 } 494 301 } while (!icp_rm_try_update(icp, old_state, new_state)); 495 302 496 - /* Pass rejects to virtual mode */ 303 + /* Handle reject in real mode */ 497 304 if (reject && reject != XICS_IPI) { 498 - this_icp->rm_action |= XICS_RM_REJECT; 499 - this_icp->rm_reject = reject; 305 + this_icp->n_reject++; 306 + icp_rm_deliver_irq(xics, icp, reject); 500 307 } 501 308 502 - /* Pass resends to virtual mode */ 309 + /* Handle resends in real mode */ 503 310 if (resend) { 504 - this_icp->rm_action |= XICS_RM_CHECK_RESEND; 505 - this_icp->rm_resend_icp = icp; 311 + this_icp->n_check_resend++; 312 + icp_rm_check_resend(xics, icp); 506 313 } 507 314 508 315 return check_too_hard(xics, this_icp); ··· 558 365 559 366 } while (!icp_rm_try_update(icp, old_state, new_state)); 560 367 561 - /* Pass rejects to virtual mode */ 368 + /* 369 + * Check for rejects. They are handled by doing a new delivery 370 + * attempt (see comments in icp_rm_deliver_irq). 371 + */ 562 372 if (reject && reject != XICS_IPI) { 563 - icp->rm_action |= XICS_RM_REJECT; 564 - icp->rm_reject = reject; 373 + icp->n_reject++; 374 + icp_rm_deliver_irq(xics, icp, reject); 565 375 } 566 376 bail: 567 377 return check_too_hard(xics, icp); ··· 612 416 goto bail; 613 417 state = &ics->irq_state[src]; 614 418 615 - /* Still asserted, resend it, we make it look like a reject */ 419 + /* Still asserted, resend it */ 616 420 if (state->asserted) { 617 - icp->rm_action |= XICS_RM_REJECT; 618 - icp->rm_reject = irq; 421 + icp->n_reject++; 422 + icp_rm_deliver_irq(xics, icp, irq); 619 423 } 620 424 621 425 if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
+422 -137
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 172 172 173 173 kvmppc_primary_no_guest: 174 174 /* We handle this much like a ceded vcpu */ 175 + /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 176 + mfspr r3, SPRN_HDEC 177 + mtspr SPRN_DEC, r3 178 + /* 179 + * Make sure the primary has finished the MMU switch. 180 + * We should never get here on a secondary thread, but 181 + * check it for robustness' sake. 182 + */ 183 + ld r5, HSTATE_KVM_VCORE(r13) 184 + 65: lbz r0, VCORE_IN_GUEST(r5) 185 + cmpwi r0, 0 186 + beq 65b 187 + /* Set LPCR. */ 188 + ld r8,VCORE_LPCR(r5) 189 + mtspr SPRN_LPCR,r8 190 + isync 175 191 /* set our bit in napping_threads */ 176 192 ld r5, HSTATE_KVM_VCORE(r13) 177 193 lbz r7, HSTATE_PTID(r13) ··· 198 182 or r3, r3, r0 199 183 stwcx. r3, 0, r6 200 184 bne 1b 201 - /* order napping_threads update vs testing entry_exit_count */ 185 + /* order napping_threads update vs testing entry_exit_map */ 202 186 isync 203 187 li r12, 0 204 188 lwz r7, VCORE_ENTRY_EXIT(r5) ··· 207 191 li r3, NAPPING_NOVCPU 208 192 stb r3, HSTATE_NAPPING(r13) 209 193 194 + li r3, 0 /* Don't wake on privileged (OS) doorbell */ 210 195 b kvm_do_nap 211 196 212 197 kvm_novcpu_wakeup: ··· 219 202 220 203 /* check the wake reason */ 221 204 bl kvmppc_check_wake_reason 222 - 205 + 223 206 /* see if any other thread is already exiting */ 224 207 lwz r0, VCORE_ENTRY_EXIT(r5) 225 208 cmpwi r0, 0x100 ··· 239 222 cmpdi r3, 0 240 223 bge kvm_novcpu_exit 241 224 225 + /* See if our timeslice has expired (HDEC is negative) */ 226 + mfspr r0, SPRN_HDEC 227 + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER 228 + cmpwi r0, 0 229 + blt kvm_novcpu_exit 230 + 242 231 /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ 243 232 ld r4, HSTATE_KVM_VCPU(r13) 244 233 cmpdi r4, 0 245 - bne kvmppc_got_guest 234 + beq kvmppc_primary_no_guest 235 + 236 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 237 + addi r3, r4, VCPU_TB_RMENTRY 238 + bl kvmhv_start_timing 239 + #endif 240 + b kvmppc_got_guest 246 241 247 242 
kvm_novcpu_exit: 248 - b hdec_soon 243 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 244 + ld r4, HSTATE_KVM_VCPU(r13) 245 + cmpdi r4, 0 246 + beq 13f 247 + addi r3, r4, VCPU_TB_RMEXIT 248 + bl kvmhv_accumulate_time 249 + #endif 250 + 13: mr r3, r12 251 + stw r12, 112-4(r1) 252 + bl kvmhv_commence_exit 253 + nop 254 + lwz r12, 112-4(r1) 255 + b kvmhv_switch_to_host 249 256 250 257 /* 251 258 * We come in here when wakened from nap mode. ··· 280 239 kvm_start_guest: 281 240 282 241 /* Set runlatch bit the minute you wake up from nap */ 283 - mfspr r1, SPRN_CTRLF 284 - ori r1, r1, 1 285 - mtspr SPRN_CTRLT, r1 242 + mfspr r0, SPRN_CTRLF 243 + ori r0, r0, 1 244 + mtspr SPRN_CTRLT, r0 286 245 287 246 ld r2,PACATOC(r13) 288 247 ··· 327 286 ld r6, PACA_DSCR(r13) 328 287 std r6, HSTATE_DSCR(r13) 329 288 289 + /* Order load of vcore, ptid etc. after load of vcpu */ 290 + lwsync 330 291 bl kvmppc_hv_entry 331 292 332 293 /* Back from the guest, go back to nap */ 333 294 /* Clear our vcpu pointer so we don't come back in early */ 334 295 li r0, 0 335 - std r0, HSTATE_KVM_VCPU(r13) 336 296 /* 337 - * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing 338 - * the nap_count, because once the increment to nap_count is 339 - * visible we could be given another vcpu. 297 + * Once we clear HSTATE_KVM_VCPU(r13), the code in 298 + * kvmppc_run_core() is going to assume that all our vcpu 299 + * state is visible in memory. This lwsync makes sure 300 + * that that is true. 340 301 */ 341 302 lwsync 342 - 343 - /* increment the nap count and then go to nap mode */ 344 - ld r4, HSTATE_KVM_VCORE(r13) 345 - addi r4, r4, VCORE_NAP_COUNT 346 - 51: lwarx r3, 0, r4 347 - addi r3, r3, 1 348 - stwcx. r3, 0, r4 349 - bne 51b 303 + std r0, HSTATE_KVM_VCPU(r13) 350 304 351 305 /* 352 306 * At this point we have finished executing in the guest. 
··· 412 376 li r6, KVM_GUEST_MODE_HOST_HV 413 377 stb r6, HSTATE_IN_GUEST(r13) 414 378 379 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 380 + /* Store initial timestamp */ 381 + cmpdi r4, 0 382 + beq 1f 383 + addi r3, r4, VCPU_TB_RMENTRY 384 + bl kvmhv_start_timing 385 + 1: 386 + #endif 415 387 /* Clear out SLB */ 416 388 li r6,0 417 389 slbmte r6,r6 ··· 431 387 * We don't have to lock against concurrent tlbies, 432 388 * but we do have to coordinate across hardware threads. 433 389 */ 434 - /* Increment entry count iff exit count is zero. */ 435 - ld r5,HSTATE_KVM_VCORE(r13) 436 - addi r9,r5,VCORE_ENTRY_EXIT 437 - 21: lwarx r3,0,r9 438 - cmpwi r3,0x100 /* any threads starting to exit? */ 390 + /* Set bit in entry map iff exit map is zero. */ 391 + ld r5, HSTATE_KVM_VCORE(r13) 392 + li r7, 1 393 + lbz r6, HSTATE_PTID(r13) 394 + sld r7, r7, r6 395 + addi r9, r5, VCORE_ENTRY_EXIT 396 + 21: lwarx r3, 0, r9 397 + cmpwi r3, 0x100 /* any threads starting to exit? */ 439 398 bge secondary_too_late /* if so we're too late to the party */ 440 - addi r3,r3,1 441 - stwcx. r3,0,r9 399 + or r3, r3, r7 400 + stwcx. r3, 0, r9 442 401 bne 21b 443 402 444 403 /* Primary thread switches to guest partition. */ 445 404 ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ 446 - lbz r6,HSTATE_PTID(r13) 447 405 cmpwi r6,0 448 - bne 20f 406 + bne 10f 449 407 ld r6,KVM_SDR1(r9) 450 408 lwz r7,KVM_LPID(r9) 451 409 li r0,LPID_RSVD /* switch to reserved LPID */ ··· 518 472 519 473 li r0,1 520 474 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 521 - b 10f 522 - 523 - /* Secondary threads wait for primary to have done partition switch */ 524 - 20: lbz r0,VCORE_IN_GUEST(r5) 525 - cmpwi r0,0 526 - beq 20b 527 - 528 - /* Set LPCR and RMOR. 
*/ 529 - 10: ld r8,VCORE_LPCR(r5) 530 - mtspr SPRN_LPCR,r8 531 - ld r8,KVM_RMOR(r9) 532 - mtspr SPRN_RMOR,r8 533 - isync 534 - 535 - /* Check if HDEC expires soon */ 536 - mfspr r3,SPRN_HDEC 537 - cmpwi r3,512 /* 1 microsecond */ 538 - li r12,BOOK3S_INTERRUPT_HV_DECREMENTER 539 - blt hdec_soon 540 475 541 476 /* Do we have a guest vcpu to run? */ 542 - cmpdi r4, 0 477 + 10: cmpdi r4, 0 543 478 beq kvmppc_primary_no_guest 544 479 kvmppc_got_guest: 545 480 ··· 845 818 clrrdi r6,r6,1 846 819 mtspr SPRN_CTRLT,r6 847 820 4: 821 + /* Secondary threads wait for primary to have done partition switch */ 822 + ld r5, HSTATE_KVM_VCORE(r13) 823 + lbz r6, HSTATE_PTID(r13) 824 + cmpwi r6, 0 825 + beq 21f 826 + lbz r0, VCORE_IN_GUEST(r5) 827 + cmpwi r0, 0 828 + bne 21f 829 + HMT_LOW 830 + 20: lbz r0, VCORE_IN_GUEST(r5) 831 + cmpwi r0, 0 832 + beq 20b 833 + HMT_MEDIUM 834 + 21: 835 + /* Set LPCR. */ 836 + ld r8,VCORE_LPCR(r5) 837 + mtspr SPRN_LPCR,r8 838 + isync 839 + 840 + /* Check if HDEC expires soon */ 841 + mfspr r3, SPRN_HDEC 842 + cmpwi r3, 512 /* 1 microsecond */ 843 + blt hdec_soon 844 + 848 845 ld r6, VCPU_CTR(r4) 849 846 lwz r7, VCPU_XER(r4) 850 847 ··· 931 880 li r9, KVM_GUEST_MODE_GUEST_HV 932 881 stb r9, HSTATE_IN_GUEST(r13) 933 882 883 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 884 + /* Accumulate timing */ 885 + addi r3, r4, VCPU_TB_GUEST 886 + bl kvmhv_accumulate_time 887 + #endif 888 + 934 889 /* Enter guest */ 935 890 936 891 BEGIN_FTR_SECTION ··· 973 916 974 917 hrfid 975 918 b . 
919 + 920 + secondary_too_late: 921 + li r12, 0 922 + cmpdi r4, 0 923 + beq 11f 924 + stw r12, VCPU_TRAP(r4) 925 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 926 + addi r3, r4, VCPU_TB_RMEXIT 927 + bl kvmhv_accumulate_time 928 + #endif 929 + 11: b kvmhv_switch_to_host 930 + 931 + hdec_soon: 932 + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER 933 + stw r12, VCPU_TRAP(r4) 934 + mr r9, r4 935 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 936 + addi r3, r4, VCPU_TB_RMEXIT 937 + bl kvmhv_accumulate_time 938 + #endif 939 + b guest_exit_cont 976 940 977 941 /****************************************************************************** 978 942 * * ··· 1080 1002 1081 1003 stw r12,VCPU_TRAP(r9) 1082 1004 1005 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1006 + addi r3, r9, VCPU_TB_RMINTR 1007 + mr r4, r9 1008 + bl kvmhv_accumulate_time 1009 + ld r5, VCPU_GPR(R5)(r9) 1010 + ld r6, VCPU_GPR(R6)(r9) 1011 + ld r7, VCPU_GPR(R7)(r9) 1012 + ld r8, VCPU_GPR(R8)(r9) 1013 + #endif 1014 + 1083 1015 /* Save HEIR (HV emulation assist reg) in emul_inst 1084 1016 if this is an HEI (HV emulation interrupt, e40) */ 1085 1017 li r3,KVM_INST_FETCH_FAILED ··· 1116 1028 bne 2f 1117 1029 mfspr r3,SPRN_HDEC 1118 1030 cmpwi r3,0 1119 - bge ignore_hdec 1031 + mr r4,r9 1032 + bge fast_guest_return 1120 1033 2: 1121 1034 /* See if this is an hcall we can handle in real mode */ 1122 1035 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 1123 1036 beq hcall_try_real_mode 1124 1037 1038 + /* Hypervisor doorbell - exit only if host IPI flag set */ 1039 + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL 1040 + bne 3f 1041 + lbz r0, HSTATE_HOST_IPI(r13) 1042 + beq 4f 1043 + b guest_exit_cont 1044 + 3: 1125 1045 /* External interrupt ? */ 1126 1046 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1127 - bne+ ext_interrupt_to_host 1047 + bne+ guest_exit_cont 1128 1048 1129 1049 /* External interrupt, first check for host_ipi. 
If this is 1130 1050 * set, we know the host wants us out so let's do it now 1131 1051 */ 1132 1052 bl kvmppc_read_intr 1133 1053 cmpdi r3, 0 1134 - bgt ext_interrupt_to_host 1054 + bgt guest_exit_cont 1135 1055 1136 1056 /* Check if any CPU is heading out to the host, if so head out too */ 1137 - ld r5, HSTATE_KVM_VCORE(r13) 1057 + 4: ld r5, HSTATE_KVM_VCORE(r13) 1138 1058 lwz r0, VCORE_ENTRY_EXIT(r5) 1139 1059 cmpwi r0, 0x100 1140 - bge ext_interrupt_to_host 1141 - 1142 - /* Return to guest after delivering any pending interrupt */ 1143 1060 mr r4, r9 1144 - b deliver_guest_interrupt 1145 - 1146 - ext_interrupt_to_host: 1061 + blt deliver_guest_interrupt 1147 1062 1148 1063 guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 1149 1064 /* Save more register state */ ··· 1156 1065 stw r7, VCPU_DSISR(r9) 1157 1066 /* don't overwrite fault_dar/fault_dsisr if HDSI */ 1158 1067 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE 1159 - beq 6f 1068 + beq mc_cont 1160 1069 std r6, VCPU_FAULT_DAR(r9) 1161 1070 stw r7, VCPU_FAULT_DSISR(r9) 1162 1071 ··· 1164 1073 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1165 1074 beq machine_check_realmode 1166 1075 mc_cont: 1076 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1077 + addi r3, r9, VCPU_TB_RMEXIT 1078 + mr r4, r9 1079 + bl kvmhv_accumulate_time 1080 + #endif 1081 + 1082 + /* Increment exit count, poke other threads to exit */ 1083 + bl kvmhv_commence_exit 1084 + nop 1085 + ld r9, HSTATE_KVM_VCPU(r13) 1086 + lwz r12, VCPU_TRAP(r9) 1167 1087 1168 1088 /* Save guest CTRL register, set runlatch to 1 */ 1169 - 6: mfspr r6,SPRN_CTRLF 1089 + mfspr r6,SPRN_CTRLF 1170 1090 stw r6,VCPU_CTRL(r9) 1171 1091 andi. r0,r6,1 1172 1092 bne 4f ··· 1519 1417 slbia 1520 1418 ptesync 1521 1419 1522 - hdec_soon: /* r12 = trap, r13 = paca */ 1523 1420 /* 1524 1421 * POWER7/POWER8 guest -> host partition switch code. 1525 1422 * We don't have to lock against tlbies but we do 1526 1423 * have to coordinate the hardware threads. 
1527 1424 */ 1528 - /* Increment the threads-exiting-guest count in the 0xff00 1529 - bits of vcore->entry_exit_count */ 1530 - ld r5,HSTATE_KVM_VCORE(r13) 1531 - addi r6,r5,VCORE_ENTRY_EXIT 1532 - 41: lwarx r3,0,r6 1533 - addi r0,r3,0x100 1534 - stwcx. r0,0,r6 1535 - bne 41b 1536 - isync /* order stwcx. vs. reading napping_threads */ 1537 - 1538 - /* 1539 - * At this point we have an interrupt that we have to pass 1540 - * up to the kernel or qemu; we can't handle it in real mode. 1541 - * Thus we have to do a partition switch, so we have to 1542 - * collect the other threads, if we are the first thread 1543 - * to take an interrupt. To do this, we set the HDEC to 0, 1544 - * which causes an HDEC interrupt in all threads within 2ns 1545 - * because the HDEC register is shared between all 4 threads. 1546 - * However, we don't need to bother if this is an HDEC 1547 - * interrupt, since the other threads will already be on their 1548 - * way here in that case. 1549 - */ 1550 - cmpwi r3,0x100 /* Are we the first here? */ 1551 - bge 43f 1552 - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 1553 - beq 40f 1554 - li r0,0 1555 - mtspr SPRN_HDEC,r0 1556 - 40: 1557 - /* 1558 - * Send an IPI to any napping threads, since an HDEC interrupt 1559 - * doesn't wake CPUs up from nap. 1560 - */ 1561 - lwz r3,VCORE_NAPPING_THREADS(r5) 1562 - lbz r4,HSTATE_PTID(r13) 1563 - li r0,1 1564 - sld r0,r0,r4 1565 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ 1566 - beq 43f 1567 - /* Order entry/exit update vs. IPIs */ 1568 - sync 1569 - mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ 1570 - subf r6,r4,r13 1571 - 42: andi. r0,r3,1 1572 - beq 44f 1573 - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ 1574 - li r0,IPI_PRIORITY 1575 - li r7,XICS_MFRR 1576 - stbcix r0,r7,r8 /* trigger the IPI */ 1577 - 44: srdi. 
r3,r3,1 1578 - addi r6,r6,PACA_SIZE 1579 - bne 42b 1580 - 1581 - secondary_too_late: 1425 + kvmhv_switch_to_host: 1582 1426 /* Secondary threads wait for primary to do partition switch */ 1583 - 43: ld r5,HSTATE_KVM_VCORE(r13) 1427 + ld r5,HSTATE_KVM_VCORE(r13) 1584 1428 ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ 1585 1429 lbz r3,HSTATE_PTID(r13) 1586 1430 cmpwi r3,0 ··· 1610 1562 1: addi r8,r8,16 1611 1563 .endr 1612 1564 1565 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1566 + /* Finish timing, if we have a vcpu */ 1567 + ld r4, HSTATE_KVM_VCPU(r13) 1568 + cmpdi r4, 0 1569 + li r3, 0 1570 + beq 2f 1571 + bl kvmhv_accumulate_time 1572 + 2: 1573 + #endif 1613 1574 /* Unset guest mode */ 1614 1575 li r0, KVM_GUEST_MODE_NONE 1615 1576 stb r0, HSTATE_IN_GUEST(r13) ··· 1753 1696 * Returns to the guest if we handle it, or continues on up to 1754 1697 * the kernel if we can't (i.e. if we don't have a handler for 1755 1698 * it, or if the handler returns H_TOO_HARD). 1699 + * 1700 + * r5 - r8 contain hcall args, 1701 + * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca 1756 1702 */ 1757 - .globl hcall_try_real_mode 1758 1703 hcall_try_real_mode: 1759 1704 ld r3,VCPU_GPR(R3)(r9) 1760 1705 andi. 
r0,r11,MSR_PR ··· 1898 1839 .long 0 /* 0x12c */ 1899 1840 .long 0 /* 0x130 */ 1900 1841 .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table 1842 + .long 0 /* 0x138 */ 1843 + .long 0 /* 0x13c */ 1844 + .long 0 /* 0x140 */ 1845 + .long 0 /* 0x144 */ 1846 + .long 0 /* 0x148 */ 1847 + .long 0 /* 0x14c */ 1848 + .long 0 /* 0x150 */ 1849 + .long 0 /* 0x154 */ 1850 + .long 0 /* 0x158 */ 1851 + .long 0 /* 0x15c */ 1852 + .long 0 /* 0x160 */ 1853 + .long 0 /* 0x164 */ 1854 + .long 0 /* 0x168 */ 1855 + .long 0 /* 0x16c */ 1856 + .long 0 /* 0x170 */ 1857 + .long 0 /* 0x174 */ 1858 + .long 0 /* 0x178 */ 1859 + .long 0 /* 0x17c */ 1860 + .long 0 /* 0x180 */ 1861 + .long 0 /* 0x184 */ 1862 + .long 0 /* 0x188 */ 1863 + .long 0 /* 0x18c */ 1864 + .long 0 /* 0x190 */ 1865 + .long 0 /* 0x194 */ 1866 + .long 0 /* 0x198 */ 1867 + .long 0 /* 0x19c */ 1868 + .long 0 /* 0x1a0 */ 1869 + .long 0 /* 0x1a4 */ 1870 + .long 0 /* 0x1a8 */ 1871 + .long 0 /* 0x1ac */ 1872 + .long 0 /* 0x1b0 */ 1873 + .long 0 /* 0x1b4 */ 1874 + .long 0 /* 0x1b8 */ 1875 + .long 0 /* 0x1bc */ 1876 + .long 0 /* 0x1c0 */ 1877 + .long 0 /* 0x1c4 */ 1878 + .long 0 /* 0x1c8 */ 1879 + .long 0 /* 0x1cc */ 1880 + .long 0 /* 0x1d0 */ 1881 + .long 0 /* 0x1d4 */ 1882 + .long 0 /* 0x1d8 */ 1883 + .long 0 /* 0x1dc */ 1884 + .long 0 /* 0x1e0 */ 1885 + .long 0 /* 0x1e4 */ 1886 + .long 0 /* 0x1e8 */ 1887 + .long 0 /* 0x1ec */ 1888 + .long 0 /* 0x1f0 */ 1889 + .long 0 /* 0x1f4 */ 1890 + .long 0 /* 0x1f8 */ 1891 + .long 0 /* 0x1fc */ 1892 + .long 0 /* 0x200 */ 1893 + .long 0 /* 0x204 */ 1894 + .long 0 /* 0x208 */ 1895 + .long 0 /* 0x20c */ 1896 + .long 0 /* 0x210 */ 1897 + .long 0 /* 0x214 */ 1898 + .long 0 /* 0x218 */ 1899 + .long 0 /* 0x21c */ 1900 + .long 0 /* 0x220 */ 1901 + .long 0 /* 0x224 */ 1902 + .long 0 /* 0x228 */ 1903 + .long 0 /* 0x22c */ 1904 + .long 0 /* 0x230 */ 1905 + .long 0 /* 0x234 */ 1906 + .long 0 /* 0x238 */ 1907 + .long 0 /* 0x23c */ 1908 + .long 0 /* 0x240 */ 1909 + .long 0 /* 0x244 */ 1910 + .long 0 /* 0x248 
*/ 1911 + .long 0 /* 0x24c */ 1912 + .long 0 /* 0x250 */ 1913 + .long 0 /* 0x254 */ 1914 + .long 0 /* 0x258 */ 1915 + .long 0 /* 0x25c */ 1916 + .long 0 /* 0x260 */ 1917 + .long 0 /* 0x264 */ 1918 + .long 0 /* 0x268 */ 1919 + .long 0 /* 0x26c */ 1920 + .long 0 /* 0x270 */ 1921 + .long 0 /* 0x274 */ 1922 + .long 0 /* 0x278 */ 1923 + .long 0 /* 0x27c */ 1924 + .long 0 /* 0x280 */ 1925 + .long 0 /* 0x284 */ 1926 + .long 0 /* 0x288 */ 1927 + .long 0 /* 0x28c */ 1928 + .long 0 /* 0x290 */ 1929 + .long 0 /* 0x294 */ 1930 + .long 0 /* 0x298 */ 1931 + .long 0 /* 0x29c */ 1932 + .long 0 /* 0x2a0 */ 1933 + .long 0 /* 0x2a4 */ 1934 + .long 0 /* 0x2a8 */ 1935 + .long 0 /* 0x2ac */ 1936 + .long 0 /* 0x2b0 */ 1937 + .long 0 /* 0x2b4 */ 1938 + .long 0 /* 0x2b8 */ 1939 + .long 0 /* 0x2bc */ 1940 + .long 0 /* 0x2c0 */ 1941 + .long 0 /* 0x2c4 */ 1942 + .long 0 /* 0x2c8 */ 1943 + .long 0 /* 0x2cc */ 1944 + .long 0 /* 0x2d0 */ 1945 + .long 0 /* 0x2d4 */ 1946 + .long 0 /* 0x2d8 */ 1947 + .long 0 /* 0x2dc */ 1948 + .long 0 /* 0x2e0 */ 1949 + .long 0 /* 0x2e4 */ 1950 + .long 0 /* 0x2e8 */ 1951 + .long 0 /* 0x2ec */ 1952 + .long 0 /* 0x2f0 */ 1953 + .long 0 /* 0x2f4 */ 1954 + .long 0 /* 0x2f8 */ 1955 + .long 0 /* 0x2fc */ 1956 + .long DOTSYM(kvmppc_h_random) - hcall_real_table 1901 1957 .globl hcall_real_table_end 1902 1958 hcall_real_table_end: 1903 - 1904 - ignore_hdec: 1905 - mr r4,r9 1906 - b fast_guest_return 1907 1959 1908 1960 _GLOBAL(kvmppc_h_set_xdabr) 1909 1961 andi. 
r0, r5, DABRX_USER | DABRX_KERNEL ··· 2054 1884 li r3, 0 2055 1885 blr 2056 1886 2057 - _GLOBAL(kvmppc_h_cede) 1887 + _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ 2058 1888 ori r11,r11,MSR_EE 2059 1889 std r11,VCPU_MSR(r3) 2060 1890 li r0,1 ··· 2063 1893 lbz r5,VCPU_PRODDED(r3) 2064 1894 cmpwi r5,0 2065 1895 bne kvm_cede_prodded 2066 - li r0,0 /* set trap to 0 to say hcall is handled */ 2067 - stw r0,VCPU_TRAP(r3) 1896 + li r12,0 /* set trap to 0 to say hcall is handled */ 1897 + stw r12,VCPU_TRAP(r3) 2068 1898 li r0,H_SUCCESS 2069 1899 std r0,VCPU_GPR(R3)(r3) 2070 1900 ··· 2082 1912 addi r6,r5,VCORE_NAPPING_THREADS 2083 1913 31: lwarx r4,0,r6 2084 1914 or r4,r4,r0 2085 - PPC_POPCNTW(R7,R4) 2086 - cmpw r7,r8 2087 - bge kvm_cede_exit 1915 + cmpw r4,r8 1916 + beq kvm_cede_exit 2088 1917 stwcx. r4,0,r6 2089 1918 bne 31b 2090 - /* order napping_threads update vs testing entry_exit_count */ 1919 + /* order napping_threads update vs testing entry_exit_map */ 2091 1920 isync 2092 1921 li r0,NAPPING_CEDE 2093 1922 stb r0,HSTATE_NAPPING(r13) ··· 2124 1955 bl kvmppc_save_fp 2125 1956 2126 1957 /* 1958 + * Set DEC to the smaller of DEC and HDEC, so that we wake 1959 + * no later than the end of our timeslice (HDEC interrupts 1960 + * don't wake us from nap). 
1961 + */ 1962 + mfspr r3, SPRN_DEC 1963 + mfspr r4, SPRN_HDEC 1964 + mftb r5 1965 + cmpw r3, r4 1966 + ble 67f 1967 + mtspr SPRN_DEC, r4 1968 + 67: 1969 + /* save expiry time of guest decrementer */ 1970 + extsw r3, r3 1971 + add r3, r3, r5 1972 + ld r4, HSTATE_KVM_VCPU(r13) 1973 + ld r5, HSTATE_KVM_VCORE(r13) 1974 + ld r6, VCORE_TB_OFFSET(r5) 1975 + subf r3, r6, r3 /* convert to host TB value */ 1976 + std r3, VCPU_DEC_EXPIRES(r4) 1977 + 1978 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1979 + ld r4, HSTATE_KVM_VCPU(r13) 1980 + addi r3, r4, VCPU_TB_CEDE 1981 + bl kvmhv_accumulate_time 1982 + #endif 1983 + 1984 + lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ 1985 + 1986 + /* 2127 1987 * Take a nap until a decrementer or external or doobell interrupt 2128 - * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the 2129 - * runlatch bit before napping. 1988 + * occurs, with PECE1 and PECE0 set in LPCR. 1989 + * On POWER8, set PECEDH, and if we are ceding, also set PECEDP. 1990 + * Also clear the runlatch bit before napping. 
2130 1991 */ 2131 1992 kvm_do_nap: 2132 - mfspr r2, SPRN_CTRLF 2133 - clrrdi r2, r2, 1 2134 - mtspr SPRN_CTRLT, r2 1993 + mfspr r0, SPRN_CTRLF 1994 + clrrdi r0, r0, 1 1995 + mtspr SPRN_CTRLT, r0 2135 1996 2136 1997 li r0,1 2137 1998 stb r0,HSTATE_HWTHREAD_REQ(r13) 2138 1999 mfspr r5,SPRN_LPCR 2139 2000 ori r5,r5,LPCR_PECE0 | LPCR_PECE1 2140 2001 BEGIN_FTR_SECTION 2141 - oris r5,r5,LPCR_PECEDP@h 2002 + ori r5, r5, LPCR_PECEDH 2003 + rlwimi r5, r3, 0, LPCR_PECEDP 2142 2004 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 2143 2005 mtspr SPRN_LPCR,r5 2144 2006 isync ··· 2194 1994 /* Woken by external or decrementer interrupt */ 2195 1995 ld r1, HSTATE_HOST_R1(r13) 2196 1996 1997 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1998 + addi r3, r4, VCPU_TB_RMINTR 1999 + bl kvmhv_accumulate_time 2000 + #endif 2001 + 2197 2002 /* load up FP state */ 2198 2003 bl kvmppc_load_fp 2004 + 2005 + /* Restore guest decrementer */ 2006 + ld r3, VCPU_DEC_EXPIRES(r4) 2007 + ld r5, HSTATE_KVM_VCORE(r13) 2008 + ld r6, VCORE_TB_OFFSET(r5) 2009 + add r3, r3, r6 /* convert host TB to guest TB value */ 2010 + mftb r7 2011 + subf r3, r7, r3 2012 + mtspr SPRN_DEC, r3 2199 2013 2200 2014 /* Load NV GPRS */ 2201 2015 ld r14, VCPU_GPR(R14)(r4) ··· 2271 2057 2272 2058 /* we've ceded but we want to give control to the host */ 2273 2059 kvm_cede_exit: 2274 - b hcall_real_fallback 2060 + ld r9, HSTATE_KVM_VCPU(r13) 2061 + b guest_exit_cont 2275 2062 2276 2063 /* Try to handle a machine check in real mode */ 2277 2064 machine_check_realmode: ··· 2304 2089 2305 2090 /* 2306 2091 * Check the reason we woke from nap, and take appropriate action. 
2307 - * Returns: 2092 + * Returns (in r3): 2308 2093 * 0 if nothing needs to be done 2309 2094 * 1 if something happened that needs to be handled by the host 2310 - * -1 if there was a guest wakeup (IPI) 2095 + * -1 if there was a guest wakeup (IPI or msgsnd) 2311 2096 * 2312 2097 * Also sets r12 to the interrupt vector for any interrupt that needs 2313 2098 * to be handled now by the host (0x500 for external interrupt), or zero. 2099 + * Modifies r0, r6, r7, r8. 2314 2100 */ 2315 2101 kvmppc_check_wake_reason: 2316 2102 mfspr r6, SPRN_SRR1 ··· 2338 2122 2339 2123 /* hypervisor doorbell */ 2340 2124 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL 2125 + /* see if it's a host IPI */ 2341 2126 li r3, 1 2127 + lbz r0, HSTATE_HOST_IPI(r13) 2128 + cmpwi r0, 0 2129 + bnelr 2130 + /* if not, clear it and return -1 */ 2131 + lis r6, (PPC_DBELL_SERVER << (63-36))@h 2132 + PPC_MSGCLR(6) 2133 + li r3, -1 2342 2134 blr 2343 2135 2344 2136 /* ··· 2355 2131 * 0 if no interrupt is pending 2356 2132 * 1 if an interrupt is pending that needs to be handled by the host 2357 2133 * -1 if there was a guest wakeup IPI (which has now been cleared) 2134 + * Modifies r0, r6, r7, r8, returns value in r3. 
2358 2135 */ 2359 2136 kvmppc_read_intr: 2360 2137 /* see if a host IPI is pending */ ··· 2410 2185 bne- 43f 2411 2186 2412 2187 /* OK, it's an IPI for us */ 2188 + li r12, 0 2413 2189 li r3, -1 2414 2190 1: blr 2415 2191 ··· 2540 2314 mtspr SPRN_PMC6, r3 2541 2315 isync 2542 2316 blr 2317 + 2318 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 2319 + /* 2320 + * Start timing an activity 2321 + * r3 = pointer to time accumulation struct, r4 = vcpu 2322 + */ 2323 + kvmhv_start_timing: 2324 + ld r5, HSTATE_KVM_VCORE(r13) 2325 + lbz r6, VCORE_IN_GUEST(r5) 2326 + cmpwi r6, 0 2327 + beq 5f /* if in guest, need to */ 2328 + ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ 2329 + 5: mftb r5 2330 + subf r5, r6, r5 2331 + std r3, VCPU_CUR_ACTIVITY(r4) 2332 + std r5, VCPU_ACTIVITY_START(r4) 2333 + blr 2334 + 2335 + /* 2336 + * Accumulate time to one activity and start another. 2337 + * r3 = pointer to new time accumulation struct, r4 = vcpu 2338 + */ 2339 + kvmhv_accumulate_time: 2340 + ld r5, HSTATE_KVM_VCORE(r13) 2341 + lbz r8, VCORE_IN_GUEST(r5) 2342 + cmpwi r8, 0 2343 + beq 4f /* if in guest, need to */ 2344 + ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ 2345 + 4: ld r5, VCPU_CUR_ACTIVITY(r4) 2346 + ld r6, VCPU_ACTIVITY_START(r4) 2347 + std r3, VCPU_CUR_ACTIVITY(r4) 2348 + mftb r7 2349 + subf r7, r8, r7 2350 + std r7, VCPU_ACTIVITY_START(r4) 2351 + cmpdi r5, 0 2352 + beqlr 2353 + subf r3, r6, r7 2354 + ld r8, TAS_SEQCOUNT(r5) 2355 + cmpdi r8, 0 2356 + addi r8, r8, 1 2357 + std r8, TAS_SEQCOUNT(r5) 2358 + lwsync 2359 + ld r7, TAS_TOTAL(r5) 2360 + add r7, r7, r3 2361 + std r7, TAS_TOTAL(r5) 2362 + ld r6, TAS_MIN(r5) 2363 + ld r7, TAS_MAX(r5) 2364 + beq 3f 2365 + cmpd r3, r6 2366 + bge 1f 2367 + 3: std r3, TAS_MIN(r5) 2368 + 1: cmpd r3, r7 2369 + ble 2f 2370 + std r3, TAS_MAX(r5) 2371 + 2: lwsync 2372 + addi r8, r8, 1 2373 + std r8, TAS_SEQCOUNT(r5) 2374 + blr 2375 + #endif
+28
arch/powerpc/kvm/book3s_pr_papr.c
··· 258 258 return EMULATE_DONE; 259 259 } 260 260 261 + static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu) 262 + { 263 + long rc; 264 + 265 + rc = kvmppc_h_logical_ci_load(vcpu); 266 + if (rc == H_TOO_HARD) 267 + return EMULATE_FAIL; 268 + kvmppc_set_gpr(vcpu, 3, rc); 269 + return EMULATE_DONE; 270 + } 271 + 272 + static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) 273 + { 274 + long rc; 275 + 276 + rc = kvmppc_h_logical_ci_store(vcpu); 277 + if (rc == H_TOO_HARD) 278 + return EMULATE_FAIL; 279 + kvmppc_set_gpr(vcpu, 3, rc); 280 + return EMULATE_DONE; 281 + } 282 + 261 283 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) 262 284 { 263 285 long rc = kvmppc_xics_hcall(vcpu, cmd); ··· 312 290 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 313 291 vcpu->stat.halt_wakeup++; 314 292 return EMULATE_DONE; 293 + case H_LOGICAL_CI_LOAD: 294 + return kvmppc_h_pr_logical_ci_load(vcpu); 295 + case H_LOGICAL_CI_STORE: 296 + return kvmppc_h_pr_logical_ci_store(vcpu); 315 297 case H_XIRR: 316 298 case H_CPPR: 317 299 case H_EOI: ··· 349 323 case H_BULK_REMOVE: 350 324 case H_PUT_TCE: 351 325 case H_CEDE: 326 + case H_LOGICAL_CI_LOAD: 327 + case H_LOGICAL_CI_STORE: 352 328 #ifdef CONFIG_KVM_XICS 353 329 case H_XIRR: 354 330 case H_CPPR:
+80 -25
arch/powerpc/kvm/book3s_xics.c
··· 20 20 #include <asm/xics.h> 21 21 #include <asm/debug.h> 22 22 #include <asm/time.h> 23 + #include <asm/spinlock.h> 23 24 24 25 #include <linux/debugfs.h> 25 26 #include <linux/seq_file.h> ··· 40 39 * LOCKING 41 40 * ======= 42 41 * 43 - * Each ICS has a mutex protecting the information about the IRQ 42 + * Each ICS has a spin lock protecting the information about the IRQ 44 43 * sources and avoiding simultaneous deliveries if the same interrupt. 45 44 * 46 45 * ICP operations are done via a single compare & swap transaction ··· 110 109 { 111 110 int i; 112 111 113 - mutex_lock(&ics->lock); 112 + unsigned long flags; 113 + 114 + local_irq_save(flags); 115 + arch_spin_lock(&ics->lock); 114 116 115 117 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 116 118 struct ics_irq_state *state = &ics->irq_state[i]; ··· 124 120 XICS_DBG("resend %#x prio %#x\n", state->number, 125 121 state->priority); 126 122 127 - mutex_unlock(&ics->lock); 123 + arch_spin_unlock(&ics->lock); 124 + local_irq_restore(flags); 128 125 icp_deliver_irq(xics, icp, state->number); 129 - mutex_lock(&ics->lock); 126 + local_irq_save(flags); 127 + arch_spin_lock(&ics->lock); 130 128 } 131 129 132 - mutex_unlock(&ics->lock); 130 + arch_spin_unlock(&ics->lock); 131 + local_irq_restore(flags); 133 132 } 134 133 135 134 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, ··· 140 133 u32 server, u32 priority, u32 saved_priority) 141 134 { 142 135 bool deliver; 136 + unsigned long flags; 143 137 144 - mutex_lock(&ics->lock); 138 + local_irq_save(flags); 139 + arch_spin_lock(&ics->lock); 145 140 146 141 state->server = server; 147 142 state->priority = priority; ··· 154 145 deliver = true; 155 146 } 156 147 157 - mutex_unlock(&ics->lock); 148 + arch_spin_unlock(&ics->lock); 149 + local_irq_restore(flags); 158 150 159 151 return deliver; 160 152 } ··· 196 186 struct kvmppc_ics *ics; 197 187 struct ics_irq_state *state; 198 188 u16 src; 189 + unsigned long flags; 199 190 200 191 if 
(!xics) 201 192 return -ENODEV; ··· 206 195 return -EINVAL; 207 196 state = &ics->irq_state[src]; 208 197 209 - mutex_lock(&ics->lock); 198 + local_irq_save(flags); 199 + arch_spin_lock(&ics->lock); 210 200 *server = state->server; 211 201 *priority = state->priority; 212 - mutex_unlock(&ics->lock); 202 + arch_spin_unlock(&ics->lock); 203 + local_irq_restore(flags); 213 204 214 205 return 0; 215 206 } ··· 378 365 struct kvmppc_ics *ics; 379 366 u32 reject; 380 367 u16 src; 368 + unsigned long flags; 381 369 382 370 /* 383 371 * This is used both for initial delivery of an interrupt and ··· 405 391 state = &ics->irq_state[src]; 406 392 407 393 /* Get a lock on the ICS */ 408 - mutex_lock(&ics->lock); 394 + local_irq_save(flags); 395 + arch_spin_lock(&ics->lock); 409 396 410 397 /* Get our server */ 411 398 if (!icp || state->server != icp->server_num) { ··· 449 434 * 450 435 * Note that if successful, the new delivery might have itself 451 436 * rejected an interrupt that was "delivered" before we took the 452 - * icp mutex. 437 + * ics spin lock. 453 438 * 454 439 * In this case we do the whole sequence all over again for the 455 440 * new guy. We cannot assume that the rejected interrupt is less ··· 463 448 * Delivery was successful, did we reject somebody else ? 
464 449 */ 465 450 if (reject && reject != XICS_IPI) { 466 - mutex_unlock(&ics->lock); 451 + arch_spin_unlock(&ics->lock); 452 + local_irq_restore(flags); 467 453 new_irq = reject; 468 454 goto again; 469 455 } ··· 484 468 */ 485 469 smp_mb(); 486 470 if (!icp->state.need_resend) { 487 - mutex_unlock(&ics->lock); 471 + arch_spin_unlock(&ics->lock); 472 + local_irq_restore(flags); 488 473 goto again; 489 474 } 490 475 } 491 476 out: 492 - mutex_unlock(&ics->lock); 477 + arch_spin_unlock(&ics->lock); 478 + local_irq_restore(flags); 493 479 } 494 480 495 481 static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, ··· 820 802 XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", 821 803 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); 822 804 823 - if (icp->rm_action & XICS_RM_KICK_VCPU) 805 + if (icp->rm_action & XICS_RM_KICK_VCPU) { 806 + icp->n_rm_kick_vcpu++; 824 807 kvmppc_fast_vcpu_kick(icp->rm_kick_target); 825 - if (icp->rm_action & XICS_RM_CHECK_RESEND) 808 + } 809 + if (icp->rm_action & XICS_RM_CHECK_RESEND) { 810 + icp->n_rm_check_resend++; 826 811 icp_check_resend(xics, icp->rm_resend_icp); 827 - if (icp->rm_action & XICS_RM_REJECT) 812 + } 813 + if (icp->rm_action & XICS_RM_REJECT) { 814 + icp->n_rm_reject++; 828 815 icp_deliver_irq(xics, icp, icp->rm_reject); 829 - if (icp->rm_action & XICS_RM_NOTIFY_EOI) 816 + } 817 + if (icp->rm_action & XICS_RM_NOTIFY_EOI) { 818 + icp->n_rm_notify_eoi++; 830 819 kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); 820 + } 831 821 832 822 icp->rm_action = 0; 833 823 ··· 898 872 struct kvm *kvm = xics->kvm; 899 873 struct kvm_vcpu *vcpu; 900 874 int icsid, i; 875 + unsigned long flags; 876 + unsigned long t_rm_kick_vcpu, t_rm_check_resend; 877 + unsigned long t_rm_reject, t_rm_notify_eoi; 878 + unsigned long t_reject, t_check_resend; 901 879 902 880 if (!kvm) 903 881 return 0; 882 + 883 + t_rm_kick_vcpu = 0; 884 + t_rm_notify_eoi = 0; 885 + t_rm_check_resend = 0; 886 + 
t_rm_reject = 0; 887 + t_check_resend = 0; 888 + t_reject = 0; 904 889 905 890 seq_printf(m, "=========\nICP state\n=========\n"); 906 891 ··· 927 890 icp->server_num, state.xisr, 928 891 state.pending_pri, state.cppr, state.mfrr, 929 892 state.out_ee, state.need_resend); 893 + t_rm_kick_vcpu += icp->n_rm_kick_vcpu; 894 + t_rm_notify_eoi += icp->n_rm_notify_eoi; 895 + t_rm_check_resend += icp->n_rm_check_resend; 896 + t_rm_reject += icp->n_rm_reject; 897 + t_check_resend += icp->n_check_resend; 898 + t_reject += icp->n_reject; 930 899 } 931 900 901 + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", 902 + t_rm_kick_vcpu, t_rm_check_resend, 903 + t_rm_reject, t_rm_notify_eoi); 904 + seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", 905 + t_check_resend, t_reject); 932 906 for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { 933 907 struct kvmppc_ics *ics = xics->ics[icsid]; 934 908 ··· 949 901 seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", 950 902 icsid); 951 903 952 - mutex_lock(&ics->lock); 904 + local_irq_save(flags); 905 + arch_spin_lock(&ics->lock); 953 906 954 907 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 955 908 struct ics_irq_state *irq = &ics->irq_state[i]; ··· 961 912 irq->resend, irq->masked_pending); 962 913 963 914 } 964 - mutex_unlock(&ics->lock); 915 + arch_spin_unlock(&ics->lock); 916 + local_irq_restore(flags); 965 917 } 966 918 return 0; 967 919 } ··· 1015 965 if (!ics) 1016 966 goto out; 1017 967 1018 - mutex_init(&ics->lock); 1019 968 ics->icsid = icsid; 1020 969 1021 970 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { ··· 1156 1107 u64 __user *ubufp = (u64 __user *) addr; 1157 1108 u16 idx; 1158 1109 u64 val, prio; 1110 + unsigned long flags; 1159 1111 1160 1112 ics = kvmppc_xics_find_ics(xics, irq, &idx); 1161 1113 if (!ics) 1162 1114 return -ENOENT; 1163 1115 1164 1116 irqp = &ics->irq_state[idx]; 1165 - mutex_lock(&ics->lock); 1117 + 
local_irq_save(flags); 1118 + arch_spin_lock(&ics->lock); 1166 1119 ret = -ENOENT; 1167 1120 if (irqp->exists) { 1168 1121 val = irqp->server; ··· 1180 1129 val |= KVM_XICS_PENDING; 1181 1130 ret = 0; 1182 1131 } 1183 - mutex_unlock(&ics->lock); 1132 + arch_spin_unlock(&ics->lock); 1133 + local_irq_restore(flags); 1184 1134 1185 1135 if (!ret && put_user(val, ubufp)) 1186 1136 ret = -EFAULT; ··· 1198 1146 u64 val; 1199 1147 u8 prio; 1200 1148 u32 server; 1149 + unsigned long flags; 1201 1150 1202 1151 if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) 1203 1152 return -ENOENT; ··· 1219 1166 kvmppc_xics_find_server(xics->kvm, server) == NULL) 1220 1167 return -EINVAL; 1221 1168 1222 - mutex_lock(&ics->lock); 1169 + local_irq_save(flags); 1170 + arch_spin_lock(&ics->lock); 1223 1171 irqp->server = server; 1224 1172 irqp->saved_priority = prio; 1225 1173 if (val & KVM_XICS_MASKED) ··· 1232 1178 if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) 1233 1179 irqp->asserted = 1; 1234 1180 irqp->exists = 1; 1235 - mutex_unlock(&ics->lock); 1181 + arch_spin_unlock(&ics->lock); 1182 + local_irq_restore(flags); 1236 1183 1237 1184 if (val & KVM_XICS_PENDING) 1238 1185 icp_deliver_irq(xics, NULL, irqp->number);
+12 -1
arch/powerpc/kvm/book3s_xics.h
··· 78 78 u32 rm_reject; 79 79 u32 rm_eoied_irq; 80 80 81 + /* Counters for each reason we exited real mode */ 82 + unsigned long n_rm_kick_vcpu; 83 + unsigned long n_rm_check_resend; 84 + unsigned long n_rm_reject; 85 + unsigned long n_rm_notify_eoi; 86 + /* Counters for handling ICP processing in real mode */ 87 + unsigned long n_check_resend; 88 + unsigned long n_reject; 89 + 81 90 /* Debug stuff for real mode */ 82 91 union kvmppc_icp_state rm_dbgstate; 83 92 struct kvm_vcpu *rm_dbgtgt; 84 93 }; 85 94 86 95 struct kvmppc_ics { 87 - struct mutex lock; 96 + arch_spinlock_t lock; 88 97 u16 icsid; 89 98 struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; 90 99 }; ··· 105 96 u32 max_icsid; 106 97 bool real_mode; 107 98 bool real_mode_dbg; 99 + u32 err_noics; 100 + u32 err_noicp; 108 101 struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; 109 102 }; 110 103
+3
arch/powerpc/kvm/powerpc.c
··· 529 529 case KVM_CAP_PPC_RMA: 530 530 r = 0; 531 531 break; 532 + case KVM_CAP_PPC_HWRNG: 533 + r = kvmppc_hwrng_present(); 534 + break; 532 535 #endif 533 536 case KVM_CAP_SYNC_MMU: 534 537 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+29
arch/powerpc/platforms/powernv/rng.c
··· 24 24 25 25 struct powernv_rng { 26 26 void __iomem *regs; 27 + void __iomem *regs_real; 27 28 unsigned long mask; 28 29 }; 29 30 30 31 static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); 31 32 33 + 34 + int powernv_hwrng_present(void) 35 + { 36 + struct powernv_rng *rng; 37 + 38 + rng = get_cpu_var(powernv_rng); 39 + put_cpu_var(rng); 40 + return rng != NULL; 41 + } 32 42 33 43 static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) 34 44 { ··· 54 44 rng->mask = (rng->mask << 1) | (parity & 1); 55 45 56 46 return val; 47 + } 48 + 49 + int powernv_get_random_real_mode(unsigned long *v) 50 + { 51 + struct powernv_rng *rng; 52 + 53 + rng = raw_cpu_read(powernv_rng); 54 + 55 + *v = rng_whiten(rng, in_rm64(rng->regs_real)); 56 + 57 + return 1; 57 58 } 58 59 59 60 int powernv_get_random_long(unsigned long *v) ··· 101 80 static __init int rng_create(struct device_node *dn) 102 81 { 103 82 struct powernv_rng *rng; 83 + struct resource res; 104 84 unsigned long val; 105 85 106 86 rng = kzalloc(sizeof(*rng), GFP_KERNEL); 107 87 if (!rng) 108 88 return -ENOMEM; 89 + 90 + if (of_address_to_resource(dn, 0, &res)) { 91 + kfree(rng); 92 + return -ENXIO; 93 + } 94 + 95 + rng->regs_real = (void __iomem *)res.start; 109 96 110 97 rng->regs = of_iomap(dn, 0); 111 98 if (!rng->regs) {
+1 -1
arch/s390/kvm/kvm-s390.c
··· 110 110 /* upper facilities limit for kvm */ 111 111 unsigned long kvm_s390_fac_list_mask[] = { 112 112 0xffe6fffbfcfdfc40UL, 113 - 0x205c800000000000UL, 113 + 0x005c800000000000UL, 114 114 }; 115 115 116 116 unsigned long kvm_s390_fac_list_mask_size(void)
+6 -5
arch/x86/kvm/lapic.c
··· 683 683 unsigned long bitmap = 1; 684 684 struct kvm_lapic **dst; 685 685 int i; 686 - bool ret = false; 687 - bool x2apic_ipi = src && apic_x2apic_mode(src); 686 + bool ret, x2apic_ipi; 688 687 689 688 *r = -1; 690 689 ··· 695 696 if (irq->shorthand) 696 697 return false; 697 698 699 + x2apic_ipi = src && apic_x2apic_mode(src); 698 700 if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) 699 701 return false; 700 702 703 + ret = true; 701 704 rcu_read_lock(); 702 705 map = rcu_dereference(kvm->arch.apic_map); 703 706 704 - if (!map) 707 + if (!map) { 708 + ret = false; 705 709 goto out; 706 - 707 - ret = true; 710 + } 708 711 709 712 if (irq->dest_mode == APIC_DEST_PHYSICAL) { 710 713 if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+7 -13
arch/x86/kvm/mmu.c
··· 4481 4481 pfn = spte_to_pfn(*sptep); 4482 4482 4483 4483 /* 4484 - * Only EPT supported for now; otherwise, one would need to 4485 - * find out efficiently whether the guest page tables are 4486 - * also using huge pages. 4484 + * We cannot do huge page mapping for indirect shadow pages, 4485 + * which are found on the last rmap (level = 1) when not using 4486 + * tdp; such shadow pages are synced with the page table in 4487 + * the guest, and the guest page table is using 4K page size 4488 + * mapping if the indirect sp has level = 1. 4487 4489 */ 4488 4490 if (sp->role.direct && 4489 4491 !kvm_is_reserved_pfn(pfn) && ··· 4506 4504 bool flush = false; 4507 4505 unsigned long *rmapp; 4508 4506 unsigned long last_index, index; 4509 - gfn_t gfn_start, gfn_end; 4510 4507 4511 4508 spin_lock(&kvm->mmu_lock); 4512 4509 4513 - gfn_start = memslot->base_gfn; 4514 - gfn_end = memslot->base_gfn + memslot->npages - 1; 4515 - 4516 - if (gfn_start >= gfn_end) 4517 - goto out; 4518 - 4519 4510 rmapp = memslot->arch.rmap[0]; 4520 - last_index = gfn_to_index(gfn_end, memslot->base_gfn, 4521 - PT_PAGE_TABLE_LEVEL); 4511 + last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, 4512 + memslot->base_gfn, PT_PAGE_TABLE_LEVEL); 4522 4513 4523 4514 for (index = 0; index <= last_index; ++index, ++rmapp) { 4524 4515 if (*rmapp) ··· 4529 4534 if (flush) 4530 4535 kvm_flush_remote_tlbs(kvm); 4531 4536 4532 - out: 4533 4537 spin_unlock(&kvm->mmu_lock); 4534 4538 } 4535 4539
+10 -2
arch/x86/kvm/vmx.c
··· 3622 3622 3623 3623 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3624 3624 { 3625 - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? 3626 - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3625 + /* 3626 + * Pass through host's Machine Check Enable value to hw_cr4, which 3627 + * is in force while we are in guest mode. Do not let guests control 3628 + * this bit, even if host CR4.MCE == 0. 3629 + */ 3630 + unsigned long hw_cr4 = 3631 + (cr4_read_shadow() & X86_CR4_MCE) | 3632 + (cr4 & ~X86_CR4_MCE) | 3633 + (to_vmx(vcpu)->rmode.vm86_active ? 3634 + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3627 3635 3628 3636 if (cr4 & X86_CR4_VMXE) { 3629 3637 /*
+8 -2
arch/x86/kvm/x86.c
··· 5799 5799 kvm_set_mmio_spte_mask(); 5800 5800 5801 5801 kvm_x86_ops = ops; 5802 - kvm_init_msr_list(); 5803 5802 5804 5803 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 5805 5804 PT_DIRTY_MASK, PT64_NX_MASK, 0); ··· 7252 7253 7253 7254 int kvm_arch_hardware_setup(void) 7254 7255 { 7255 - return kvm_x86_ops->hardware_setup(); 7256 + int r; 7257 + 7258 + r = kvm_x86_ops->hardware_setup(); 7259 + if (r != 0) 7260 + return r; 7261 + 7262 + kvm_init_msr_list(); 7263 + return 0; 7256 7264 } 7257 7265 7258 7266 void kvm_arch_hardware_unsetup(void)
+1
include/uapi/linux/kvm.h
··· 813 813 #define KVM_CAP_MIPS_MSA 112 814 814 #define KVM_CAP_S390_INJECT_IRQ 113 815 815 #define KVM_CAP_S390_IRQ_STATE 114 816 + #define KVM_CAP_PPC_HWRNG 115 816 817 817 818 #ifdef KVM_CAP_IRQ_ROUTING 818 819
+4 -1
virt/kvm/arm/vgic.c
··· 1561 1561 goto out; 1562 1562 } 1563 1563 1564 + if (irq_num >= kvm->arch.vgic.nr_irqs) 1565 + return -EINVAL; 1566 + 1564 1567 vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level); 1565 1568 if (vcpu_id >= 0) { 1566 1569 /* kick the specified vcpu */ ··· 2144 2141 struct kvm_kernel_irq_routing_entry *entries, 2145 2142 int gsi) 2146 2143 { 2147 - return gsi; 2144 + return 0; 2148 2145 } 2149 2146 2150 2147 int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
+1
virt/kvm/kvm_main.c
··· 89 89 static __read_mostly struct preempt_ops kvm_preempt_ops; 90 90 91 91 struct dentry *kvm_debugfs_dir; 92 + EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 92 93 93 94 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 94 95 unsigned long arg);