Merge branch 'linus' into x86/cleanups, to pick up dependent commits

+2 -2

Documentation/virt/kvm/api.rst

··· 4803 4803 4.126 KVM_X86_SET_MSR_FILTER 4804 4804 ---------------------------- 4805 4805 4806 - :Capability: KVM_X86_SET_MSR_FILTER 4806 + :Capability: KVM_CAP_X86_MSR_FILTER 4807 4807 :Architectures: x86 4808 4808 :Type: vm ioctl 4809 4809 :Parameters: struct kvm_msr_filter ··· 6715 6715 instead get bounced to user space through the KVM_EXIT_X86_RDMSR and 6716 6716 KVM_EXIT_X86_WRMSR exit notifications. 6717 6717 6718 - 8.27 KVM_X86_SET_MSR_FILTER 6718 + 8.27 KVM_CAP_X86_MSR_FILTER 6719 6719 --------------------------- 6720 6720 6721 6721 :Architectures: x86

+12 -3

arch/x86/include/asm/kvm_host.h

··· 113 113 #define VALID_PAGE(x) ((x) != INVALID_PAGE) 114 114 115 115 #define UNMAPPED_GVA (~(gpa_t)0) 116 + #define INVALID_GPA (~(gpa_t)0) 116 117 117 118 /* KVM Hugepage definitions for x86 */ 118 119 #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G ··· 200 199 201 200 #define KVM_NR_DB_REGS 4 202 201 202 + #define DR6_BUS_LOCK (1 << 11) 203 203 #define DR6_BD (1 << 13) 204 204 #define DR6_BS (1 << 14) 205 205 #define DR6_BT (1 << 15) ··· 214 212 * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. 215 213 */ 216 214 #define DR6_ACTIVE_LOW 0xffff0ff0 217 - #define DR6_VOLATILE 0x0001e00f 215 + #define DR6_VOLATILE 0x0001e80f 218 216 #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) 219 217 220 218 #define DR7_BP_EN_MASK 0x000000ff ··· 409 407 u32 pkru_mask; 410 408 411 409 u64 *pae_root; 412 - u64 *lm_root; 410 + u64 *pml4_root; 413 411 414 412 /* 415 413 * check zero bits on shadow page table entries, these ··· 1419 1417 bool direct_map; 1420 1418 }; 1421 1419 1420 + extern u32 __read_mostly kvm_nr_uret_msrs; 1422 1421 extern u64 __read_mostly host_efer; 1423 1422 extern bool __read_mostly allow_smaller_maxphyaddr; 1424 1423 extern struct kvm_x86_ops kvm_x86_ops; ··· 1778 1775 unsigned long ipi_bitmap_high, u32 min, 1779 1776 unsigned long icr, int op_64_bit); 1780 1777 1781 - void kvm_define_user_return_msr(unsigned index, u32 msr); 1778 + int kvm_add_user_return_msr(u32 msr); 1779 + int kvm_find_user_return_msr(u32 msr); 1782 1780 int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); 1781 + 1782 + static inline bool kvm_is_supported_user_return_msr(u32 msr) 1783 + { 1784 + return kvm_find_user_return_msr(msr) >= 0; 1785 + } 1783 1786 1784 1787 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); 1785 1788 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);

+2 -8

arch/x86/include/asm/kvm_para.h

··· 7 7 #include <linux/interrupt.h> 8 8 #include <uapi/asm/kvm_para.h> 9 9 10 - extern void kvmclock_init(void); 11 - 12 10 #ifdef CONFIG_KVM_GUEST 13 11 bool kvm_check_and_clear_guest_paused(void); 14 12 #else ··· 84 86 } 85 87 86 88 #ifdef CONFIG_KVM_GUEST 89 + void kvmclock_init(void); 90 + void kvmclock_disable(void); 87 91 bool kvm_para_available(void); 88 92 unsigned int kvm_arch_para_features(void); 89 93 unsigned int kvm_arch_para_hints(void); 90 94 void kvm_async_pf_task_wait_schedule(u32 token); 91 95 void kvm_async_pf_task_wake(u32 token); 92 96 u32 kvm_read_and_reset_apf_flags(void); 93 - void kvm_disable_steal_time(void); 94 97 bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); 95 98 96 99 DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); ··· 134 135 static inline u32 kvm_read_and_reset_apf_flags(void) 135 136 { 136 137 return 0; 137 - } 138 - 139 - static inline void kvm_disable_steal_time(void) 140 - { 141 - return; 142 138 } 143 139 144 140 static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)

+2

arch/x86/include/uapi/asm/kvm.h

··· 437 437 __u16 flags; 438 438 } smm; 439 439 440 + __u16 pad; 441 + 440 442 __u32 flags; 441 443 __u64 preemption_timer_deadline; 442 444 };

+88 -51

arch/x86/kernel/kvm.c

··· 26 26 #include <linux/kprobes.h> 27 27 #include <linux/nmi.h> 28 28 #include <linux/swait.h> 29 + #include <linux/syscore_ops.h> 29 30 #include <asm/timer.h> 30 31 #include <asm/cpu.h> 31 32 #include <asm/traps.h> ··· 38 37 #include <asm/tlb.h> 39 38 #include <asm/cpuidle_haltpoll.h> 40 39 #include <asm/ptrace.h> 40 + #include <asm/reboot.h> 41 41 #include <asm/svm.h> 42 42 43 43 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); ··· 347 345 348 346 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); 349 347 __this_cpu_write(apf_reason.enabled, 1); 350 - pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); 348 + pr_info("setup async PF for cpu %d\n", smp_processor_id()); 351 349 } 352 350 353 351 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { ··· 373 371 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); 374 372 __this_cpu_write(apf_reason.enabled, 0); 375 373 376 - pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); 374 + pr_info("disable async PF for cpu %d\n", smp_processor_id()); 377 375 } 378 376 379 - static void kvm_pv_guest_cpu_reboot(void *unused) 377 + static void kvm_disable_steal_time(void) 380 378 { 381 - /* 382 - * We disable PV EOI before we load a new kernel by kexec, 383 - * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. 384 - * New kernel can re-enable when it boots. 
385 - */ 386 - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 387 - wrmsrl(MSR_KVM_PV_EOI_EN, 0); 388 - kvm_pv_disable_apf(); 389 - kvm_disable_steal_time(); 390 - } 379 + if (!has_steal_clock) 380 + return; 391 381 392 - static int kvm_pv_reboot_notify(struct notifier_block *nb, 393 - unsigned long code, void *unused) 394 - { 395 - if (code == SYS_RESTART) 396 - on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); 397 - return NOTIFY_DONE; 382 + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); 398 383 } 399 - 400 - static struct notifier_block kvm_pv_reboot_nb = { 401 - .notifier_call = kvm_pv_reboot_notify, 402 - }; 403 384 404 385 static u64 kvm_steal_clock(int cpu) 405 386 { ··· 399 414 } while ((version & 1) || (version != src->version)); 400 415 401 416 return steal; 402 - } 403 - 404 - void kvm_disable_steal_time(void) 405 - { 406 - if (!has_steal_clock) 407 - return; 408 - 409 - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); 410 417 } 411 418 412 419 static inline void __set_percpu_decrypted(void *ptr, unsigned long size) ··· 426 449 __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); 427 450 __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); 428 451 } 452 + } 453 + 454 + static void kvm_guest_cpu_offline(bool shutdown) 455 + { 456 + kvm_disable_steal_time(); 457 + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 458 + wrmsrl(MSR_KVM_PV_EOI_EN, 0); 459 + kvm_pv_disable_apf(); 460 + if (!shutdown) 461 + apf_task_wake_all(); 462 + kvmclock_disable(); 463 + } 464 + 465 + static int kvm_cpu_online(unsigned int cpu) 466 + { 467 + unsigned long flags; 468 + 469 + local_irq_save(flags); 470 + kvm_guest_cpu_init(); 471 + local_irq_restore(flags); 472 + return 0; 429 473 } 430 474 431 475 #ifdef CONFIG_SMP ··· 633 635 kvm_spinlock_init(); 634 636 } 635 637 636 - static void kvm_guest_cpu_offline(void) 637 - { 638 - kvm_disable_steal_time(); 639 - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 640 - wrmsrl(MSR_KVM_PV_EOI_EN, 0); 641 - kvm_pv_disable_apf(); 642 - 
apf_task_wake_all(); 643 - } 644 - 645 - static int kvm_cpu_online(unsigned int cpu) 646 - { 647 - local_irq_disable(); 648 - kvm_guest_cpu_init(); 649 - local_irq_enable(); 650 - return 0; 651 - } 652 - 653 638 static int kvm_cpu_down_prepare(unsigned int cpu) 654 639 { 655 - local_irq_disable(); 656 - kvm_guest_cpu_offline(); 657 - local_irq_enable(); 640 + unsigned long flags; 641 + 642 + local_irq_save(flags); 643 + kvm_guest_cpu_offline(false); 644 + local_irq_restore(flags); 658 645 return 0; 659 646 } 660 647 648 + #endif 649 + 650 + static int kvm_suspend(void) 651 + { 652 + kvm_guest_cpu_offline(false); 653 + 654 + return 0; 655 + } 656 + 657 + static void kvm_resume(void) 658 + { 659 + kvm_cpu_online(raw_smp_processor_id()); 660 + } 661 + 662 + static struct syscore_ops kvm_syscore_ops = { 663 + .suspend = kvm_suspend, 664 + .resume = kvm_resume, 665 + }; 666 + 667 + static void kvm_pv_guest_cpu_reboot(void *unused) 668 + { 669 + kvm_guest_cpu_offline(true); 670 + } 671 + 672 + static int kvm_pv_reboot_notify(struct notifier_block *nb, 673 + unsigned long code, void *unused) 674 + { 675 + if (code == SYS_RESTART) 676 + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); 677 + return NOTIFY_DONE; 678 + } 679 + 680 + static struct notifier_block kvm_pv_reboot_nb = { 681 + .notifier_call = kvm_pv_reboot_notify, 682 + }; 683 + 684 + /* 685 + * After a PV feature is registered, the host will keep writing to the 686 + * registered memory location. If the guest happens to shutdown, this memory 687 + * won't be valid. In cases like kexec, in which you install a new kernel, this 688 + * means a random memory location will be kept being written. 
689 + */ 690 + #ifdef CONFIG_KEXEC_CORE 691 + static void kvm_crash_shutdown(struct pt_regs *regs) 692 + { 693 + kvm_guest_cpu_offline(true); 694 + native_machine_crash_shutdown(regs); 695 + } 661 696 #endif 662 697 663 698 static void __init kvm_guest_init(void) ··· 734 703 sev_map_percpu_data(); 735 704 kvm_guest_cpu_init(); 736 705 #endif 706 + 707 + #ifdef CONFIG_KEXEC_CORE 708 + machine_ops.crash_shutdown = kvm_crash_shutdown; 709 + #endif 710 + 711 + register_syscore_ops(&kvm_syscore_ops); 737 712 738 713 /* 739 714 * Hard lockup detection is enabled by default. Disable it, as guests

+1 -25

arch/x86/kernel/kvmclock.c

··· 20 20 #include <asm/hypervisor.h> 21 21 #include <asm/mem_encrypt.h> 22 22 #include <asm/x86_init.h> 23 - #include <asm/reboot.h> 24 23 #include <asm/kvmclock.h> 25 24 26 25 static int kvmclock __initdata = 1; ··· 202 203 } 203 204 #endif 204 205 205 - /* 206 - * After the clock is registered, the host will keep writing to the 207 - * registered memory location. If the guest happens to shutdown, this memory 208 - * won't be valid. In cases like kexec, in which you install a new kernel, this 209 - * means a random memory location will be kept being written. So before any 210 - * kind of shutdown from our side, we unregister the clock by writing anything 211 - * that does not have the 'enable' bit set in the msr 212 - */ 213 - #ifdef CONFIG_KEXEC_CORE 214 - static void kvm_crash_shutdown(struct pt_regs *regs) 206 + void kvmclock_disable(void) 215 207 { 216 208 native_write_msr(msr_kvm_system_time, 0, 0); 217 - kvm_disable_steal_time(); 218 - native_machine_crash_shutdown(regs); 219 - } 220 - #endif 221 - 222 - static void kvm_shutdown(void) 223 - { 224 - native_write_msr(msr_kvm_system_time, 0, 0); 225 - kvm_disable_steal_time(); 226 - native_machine_shutdown(); 227 209 } 228 210 229 211 static void __init kvmclock_init_mem(void) ··· 331 351 #endif 332 352 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; 333 353 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; 334 - machine_ops.shutdown = kvm_shutdown; 335 - #ifdef CONFIG_KEXEC_CORE 336 - machine_ops.crash_shutdown = kvm_crash_shutdown; 337 - #endif 338 354 kvm_get_preset_lpj(); 339 355 340 356 /*

+18 -2

arch/x86/kvm/cpuid.c

··· 458 458 F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | 459 459 F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | 460 460 F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | 461 - F(SGX_LC) 461 + F(SGX_LC) | F(BUS_LOCK_DETECT) 462 462 ); 463 463 /* Set LA57 based on hardware capability. */ 464 464 if (cpuid_ecx(7) & F(LA57)) ··· 567 567 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | 568 568 F(PMM) | F(PMM_EN) 569 569 ); 570 + 571 + /* 572 + * Hide RDTSCP and RDPID if either feature is reported as supported but 573 + * probing MSR_TSC_AUX failed. This is purely a sanity check and 574 + * should never happen, but the guest will likely crash if RDTSCP or 575 + * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in 576 + * the past. For example, the sanity check may fire if this instance of 577 + * KVM is running as L1 on top of an older, broken KVM. 578 + */ 579 + if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) || 580 + kvm_cpu_cap_has(X86_FEATURE_RDPID)) && 581 + !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) { 582 + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 583 + kvm_cpu_cap_clear(X86_FEATURE_RDPID); 584 + } 570 585 } 571 586 EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); 572 587 ··· 652 637 case 7: 653 638 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 654 639 entry->eax = 0; 655 - entry->ecx = F(RDPID); 640 + if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) 641 + entry->ecx = F(RDPID); 656 642 ++array->nent; 657 643 default: 658 644 break;

+1 -1

arch/x86/kvm/emulate.c

··· 4502 4502 * from the register case of group9. 4503 4503 */ 4504 4504 static const struct gprefix pfx_0f_c7_7 = { 4505 - N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), 4505 + N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid), 4506 4506 }; 4507 4507 4508 4508

+1

arch/x86/kvm/kvm_emulate.h

··· 468 468 x86_intercept_clgi, 469 469 x86_intercept_skinit, 470 470 x86_intercept_rdtscp, 471 + x86_intercept_rdpid, 471 472 x86_intercept_icebp, 472 473 x86_intercept_wbinvd, 473 474 x86_intercept_monitor,

+1 -1

arch/x86/kvm/lapic.c

··· 1913 1913 if (!apic->lapic_timer.hv_timer_in_use) 1914 1914 goto out; 1915 1915 WARN_ON(rcuwait_active(&vcpu->wait)); 1916 - cancel_hv_timer(apic); 1917 1916 apic_timer_expired(apic, false); 1917 + cancel_hv_timer(apic); 1918 1918 1919 1919 if (apic_lvtt_period(apic) && apic->lapic_timer.period) { 1920 1920 advance_periodic_target_expiration(apic);

+10 -10

arch/x86/kvm/mmu/mmu.c

··· 3310 3310 if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { 3311 3311 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3312 3312 3313 - if (WARN_ON_ONCE(!mmu->lm_root)) { 3313 + if (WARN_ON_ONCE(!mmu->pml4_root)) { 3314 3314 r = -EIO; 3315 3315 goto out_unlock; 3316 3316 } 3317 3317 3318 - mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; 3318 + mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; 3319 3319 } 3320 3320 3321 3321 for (i = 0; i < 4; ++i) { ··· 3335 3335 } 3336 3336 3337 3337 if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) 3338 - mmu->root_hpa = __pa(mmu->lm_root); 3338 + mmu->root_hpa = __pa(mmu->pml4_root); 3339 3339 else 3340 3340 mmu->root_hpa = __pa(mmu->pae_root); 3341 3341 ··· 3350 3350 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) 3351 3351 { 3352 3352 struct kvm_mmu *mmu = vcpu->arch.mmu; 3353 - u64 *lm_root, *pae_root; 3353 + u64 *pml4_root, *pae_root; 3354 3354 3355 3355 /* 3356 3356 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP ··· 3369 3369 if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) 3370 3370 return -EIO; 3371 3371 3372 - if (mmu->pae_root && mmu->lm_root) 3372 + if (mmu->pae_root && mmu->pml4_root) 3373 3373 return 0; 3374 3374 3375 3375 /* 3376 3376 * The special roots should always be allocated in concert. Yell and 3377 3377 * bail if KVM ends up in a state where only one of the roots is valid. 
3378 3378 */ 3379 - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) 3379 + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) 3380 3380 return -EIO; 3381 3381 3382 3382 /* ··· 3387 3387 if (!pae_root) 3388 3388 return -ENOMEM; 3389 3389 3390 - lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3391 - if (!lm_root) { 3390 + pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3391 + if (!pml4_root) { 3392 3392 free_page((unsigned long)pae_root); 3393 3393 return -ENOMEM; 3394 3394 } 3395 3395 3396 3396 mmu->pae_root = pae_root; 3397 - mmu->lm_root = lm_root; 3397 + mmu->pml4_root = pml4_root; 3398 3398 3399 3399 return 0; 3400 3400 } ··· 5261 5261 if (!tdp_enabled && mmu->pae_root) 5262 5262 set_memory_encrypted((unsigned long)mmu->pae_root, 1); 5263 5263 free_page((unsigned long)mmu->pae_root); 5264 - free_page((unsigned long)mmu->lm_root); 5264 + free_page((unsigned long)mmu->pml4_root); 5265 5265 } 5266 5266 5267 5267 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)

+16 -1

arch/x86/kvm/mmu/tdp_mmu.c

··· 388 388 } 389 389 390 390 /** 391 - * handle_changed_spte - handle bookkeeping associated with an SPTE change 391 + * __handle_changed_spte - handle bookkeeping associated with an SPTE change 392 392 * @kvm: kvm instance 393 393 * @as_id: the address space of the paging structure the SPTE was a part of 394 394 * @gfn: the base GFN that was mapped by the SPTE ··· 443 443 return; 444 444 445 445 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); 446 + 447 + if (is_large_pte(old_spte) != is_large_pte(new_spte)) { 448 + if (is_large_pte(old_spte)) 449 + atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); 450 + else 451 + atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); 452 + } 446 453 447 454 /* 448 455 * The only times a SPTE should be changed from a non-present to ··· 1016 1009 } 1017 1010 1018 1011 if (!is_shadow_present_pte(iter.old_spte)) { 1012 + /* 1013 + * If SPTE has been frozen by another thread, just 1014 + * give up and retry, avoiding unnecessary page table 1015 + * allocation and free. 1016 + */ 1017 + if (is_removed_spte(iter.old_spte)) 1018 + break; 1019 + 1019 1020 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); 1020 1021 child_pt = sp->spt; 1021 1022

+19 -4

arch/x86/kvm/svm/nested.c

··· 764 764 nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); 765 765 766 766 svm_switch_vmcb(svm, &svm->vmcb01); 767 - WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN); 768 767 769 768 /* 770 769 * On vmexit the GIF is set to false and ··· 871 872 __free_page(virt_to_page(svm->nested.vmcb02.ptr)); 872 873 svm->nested.vmcb02.ptr = NULL; 873 874 875 + /* 876 + * When last_vmcb12_gpa matches the current vmcb12 gpa, 877 + * some vmcb12 fields are not loaded if they are marked clean 878 + * in the vmcb12, since in this case they are up to date already. 879 + * 880 + * When the vmcb02 is freed, this optimization becomes invalid. 881 + */ 882 + svm->nested.last_vmcb12_gpa = INVALID_GPA; 883 + 874 884 svm->nested.initialized = false; 875 885 } 876 886 ··· 892 884 893 885 if (is_guest_mode(vcpu)) { 894 886 svm->nested.nested_run_pending = 0; 887 + svm->nested.vmcb12_gpa = INVALID_GPA; 888 + 895 889 leave_guest_mode(vcpu); 896 890 897 - svm_switch_vmcb(svm, &svm->nested.vmcb02); 891 + svm_switch_vmcb(svm, &svm->vmcb01); 898 892 899 893 nested_svm_uninit_mmu_context(vcpu); 900 894 vmcb_mark_all_dirty(svm->vmcb); ··· 1308 1298 * L2 registers if needed are moved from the current VMCB to VMCB02. 1309 1299 */ 1310 1300 1301 + if (is_guest_mode(vcpu)) 1302 + svm_leave_nested(svm); 1303 + else 1304 + svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; 1305 + 1306 + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); 1307 + 1311 1308 svm->nested.nested_run_pending = 1312 1309 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 1313 1310 1314 1311 svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; 1315 - if (svm->current_vmcb == &svm->vmcb01) 1316 - svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; 1317 1312 1318 1313 svm->vmcb01.ptr->save.es = save->es; 1319 1314 svm->vmcb01.ptr->save.cs = save->cs;

+14 -18

arch/x86/kvm/svm/sev.c

··· 763 763 } 764 764 765 765 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, 766 - unsigned long __user dst_uaddr, 766 + void __user *dst_uaddr, 767 767 unsigned long dst_paddr, 768 768 int size, int *err) 769 769 { ··· 787 787 788 788 if (tpage) { 789 789 offset = paddr & 15; 790 - if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, 791 - page_address(tpage) + offset, size)) 790 + if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size)) 792 791 ret = -EFAULT; 793 792 } 794 793 ··· 799 800 } 800 801 801 802 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, 802 - unsigned long __user vaddr, 803 + void __user *vaddr, 803 804 unsigned long dst_paddr, 804 - unsigned long __user dst_vaddr, 805 + void __user *dst_vaddr, 805 806 int size, int *error) 806 807 { 807 808 struct page *src_tpage = NULL; ··· 809 810 int ret, len = size; 810 811 811 812 /* If source buffer is not aligned then use an intermediate buffer */ 812 - if (!IS_ALIGNED(vaddr, 16)) { 813 + if (!IS_ALIGNED((unsigned long)vaddr, 16)) { 813 814 src_tpage = alloc_page(GFP_KERNEL); 814 815 if (!src_tpage) 815 816 return -ENOMEM; 816 817 817 - if (copy_from_user(page_address(src_tpage), 818 - (void __user *)(uintptr_t)vaddr, size)) { 818 + if (copy_from_user(page_address(src_tpage), vaddr, size)) { 819 819 __free_page(src_tpage); 820 820 return -EFAULT; 821 821 } ··· 828 830 * - copy the source buffer in an intermediate buffer 829 831 * - use the intermediate buffer as source buffer 830 832 */ 831 - if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { 833 + if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { 832 834 int dst_offset; 833 835 834 836 dst_tpage = alloc_page(GFP_KERNEL); ··· 853 855 page_address(src_tpage), size); 854 856 else { 855 857 if (copy_from_user(page_address(dst_tpage) + dst_offset, 856 - (void __user *)(uintptr_t)vaddr, size)) { 858 + vaddr, size)) { 857 859 ret = -EFAULT; 858 860 goto e_free; 859 861 } 
··· 933 935 if (dec) 934 936 ret = __sev_dbg_decrypt_user(kvm, 935 937 __sme_page_pa(src_p[0]) + s_off, 936 - dst_vaddr, 938 + (void __user *)dst_vaddr, 937 939 __sme_page_pa(dst_p[0]) + d_off, 938 940 len, &argp->error); 939 941 else 940 942 ret = __sev_dbg_encrypt_user(kvm, 941 943 __sme_page_pa(src_p[0]) + s_off, 942 - vaddr, 944 + (void __user *)vaddr, 943 945 __sme_page_pa(dst_p[0]) + d_off, 944 - dst_vaddr, 946 + (void __user *)dst_vaddr, 945 947 len, &argp->error); 946 948 947 949 sev_unpin_memory(kvm, src_p, n); ··· 1762 1764 e_source_unlock: 1763 1765 mutex_unlock(&source_kvm->lock); 1764 1766 e_source_put: 1765 - fput(source_kvm_file); 1767 + if (source_kvm_file) 1768 + fput(source_kvm_file); 1766 1769 return ret; 1767 1770 } 1768 1771 ··· 2197 2198 return -EINVAL; 2198 2199 } 2199 2200 2200 - static void pre_sev_es_run(struct vcpu_svm *svm) 2201 + void sev_es_unmap_ghcb(struct vcpu_svm *svm) 2201 2202 { 2202 2203 if (!svm->ghcb) 2203 2204 return; ··· 2232 2233 { 2233 2234 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 2234 2235 int asid = sev_get_asid(svm->vcpu.kvm); 2235 - 2236 - /* Perform any SEV-ES pre-run actions */ 2237 - pre_sev_es_run(svm); 2238 2236 2239 2237 /* Assign the asid allocated with this SEV guest */ 2240 2238 svm->asid = asid;

+28 -34

arch/x86/kvm/svm/svm.c

··· 212 212 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to 213 213 * defer the restoration of TSC_AUX until the CPU returns to userspace. 214 214 */ 215 - #define TSC_AUX_URET_SLOT 0 215 + static int tsc_aux_uret_slot __read_mostly = -1; 216 216 217 217 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 218 218 ··· 444 444 445 445 if (sev_active()) { 446 446 pr_info("KVM is unsupported when running as an SEV guest\n"); 447 + return 0; 448 + } 449 + 450 + if (pgtable_l5_enabled()) { 451 + pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); 447 452 return 0; 448 453 } 449 454 ··· 964 959 kvm_tsc_scaling_ratio_frac_bits = 32; 965 960 } 966 961 967 - if (boot_cpu_has(X86_FEATURE_RDTSCP)) 968 - kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX); 962 + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 969 963 970 964 /* Check for pause filtering support */ 971 965 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { ··· 1104 1100 return svm->vmcb->control.tsc_offset; 1105 1101 } 1106 1102 1107 - static void svm_check_invpcid(struct vcpu_svm *svm) 1103 + /* Evaluate instruction intercepts that depend on guest CPUID features. 
*/ 1104 + static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, 1105 + struct vcpu_svm *svm) 1108 1106 { 1109 1107 /* 1110 1108 * Intercept INVPCID if shadow paging is enabled to sync/free shadow ··· 1118 1112 svm_set_intercept(svm, INTERCEPT_INVPCID); 1119 1113 else 1120 1114 svm_clr_intercept(svm, INTERCEPT_INVPCID); 1115 + } 1116 + 1117 + if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { 1118 + if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 1119 + svm_clr_intercept(svm, INTERCEPT_RDTSCP); 1120 + else 1121 + svm_set_intercept(svm, INTERCEPT_RDTSCP); 1121 1122 } 1122 1123 } 1123 1124 ··· 1248 1235 svm->current_vmcb->asid_generation = 0; 1249 1236 svm->asid = 0; 1250 1237 1251 - svm->nested.vmcb12_gpa = 0; 1252 - svm->nested.last_vmcb12_gpa = 0; 1238 + svm->nested.vmcb12_gpa = INVALID_GPA; 1239 + svm->nested.last_vmcb12_gpa = INVALID_GPA; 1253 1240 vcpu->arch.hflags = 0; 1254 1241 1255 1242 if (!kvm_pause_in_guest(vcpu->kvm)) { ··· 1261 1248 svm_clr_intercept(svm, INTERCEPT_PAUSE); 1262 1249 } 1263 1250 1264 - svm_check_invpcid(svm); 1251 + svm_recalc_instruction_intercepts(vcpu, svm); 1265 1252 1266 1253 /* 1267 1254 * If the host supports V_SPEC_CTRL then disable the interception ··· 1437 1424 struct vcpu_svm *svm = to_svm(vcpu); 1438 1425 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 1439 1426 1427 + if (sev_es_guest(vcpu->kvm)) 1428 + sev_es_unmap_ghcb(svm); 1429 + 1440 1430 if (svm->guest_state_loaded) 1441 1431 return; 1442 1432 ··· 1461 1445 } 1462 1446 } 1463 1447 1464 - if (static_cpu_has(X86_FEATURE_RDTSCP)) 1465 - kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull); 1448 + if (likely(tsc_aux_uret_slot >= 0)) 1449 + kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); 1466 1450 1467 1451 svm->guest_state_loaded = true; 1468 1452 } ··· 2671 2655 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2672 2656 break; 2673 2657 case MSR_TSC_AUX: 2674 - if (!boot_cpu_has(X86_FEATURE_RDTSCP)) 2675 - return 1; 2676 - if 
(!msr_info->host_initiated && 2677 - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 2678 - return 1; 2679 2658 msr_info->data = svm->tsc_aux; 2680 2659 break; 2681 2660 /* ··· 2887 2876 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 2888 2877 break; 2889 2878 case MSR_TSC_AUX: 2890 - if (!boot_cpu_has(X86_FEATURE_RDTSCP)) 2891 - return 1; 2892 - 2893 - if (!msr->host_initiated && 2894 - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 2895 - return 1; 2896 - 2897 - /* 2898 - * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has 2899 - * incomplete and conflicting architectural behavior. Current 2900 - * AMD CPUs completely ignore bits 63:32, i.e. they aren't 2901 - * reserved and always read as zeros. Emulate AMD CPU behavior 2902 - * to avoid explosions if the vCPU is migrated from an AMD host 2903 - * to an Intel host. 2904 - */ 2905 - data = (u32)data; 2906 - 2907 2879 /* 2908 2880 * TSC_AUX is usually changed only during boot and never read 2909 2881 * directly. Intercept TSC_AUX instead of exposing it to the 2910 2882 * guest via direct_access_msrs, and switch it via user return. 
2911 2883 */ 2912 2884 preempt_disable(); 2913 - r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull); 2885 + r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 2914 2886 preempt_enable(); 2915 2887 if (r) 2916 2888 return 1; ··· 3078 3084 [SVM_EXIT_STGI] = stgi_interception, 3079 3085 [SVM_EXIT_CLGI] = clgi_interception, 3080 3086 [SVM_EXIT_SKINIT] = skinit_interception, 3087 + [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3081 3088 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3082 3089 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3083 3090 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, ··· 3967 3972 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && 3968 3973 guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); 3969 3974 3970 - /* Check again if INVPCID interception if required */ 3971 - svm_check_invpcid(svm); 3975 + svm_recalc_instruction_intercepts(vcpu, svm); 3972 3976 3973 3977 /* For sev guests, the memory encryption bit is not reserved in CR3. */ 3974 3978 if (sev_guest(vcpu->kvm)) {

+1

arch/x86/kvm/svm/svm.h

··· 581 581 void sev_es_create_vcpu(struct vcpu_svm *svm); 582 582 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); 583 583 void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); 584 + void sev_es_unmap_ghcb(struct vcpu_svm *svm); 584 585 585 586 /* vmenter.S */ 586 587

+3

arch/x86/kvm/vmx/capabilities.h

··· 398 398 { 399 399 u64 debugctl = 0; 400 400 401 + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) 402 + debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 403 + 401 404 if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) 402 405 debugctl |= DEBUGCTLMSR_LBR_MASK; 403 406

+19 -10

arch/x86/kvm/vmx/nested.c

··· 3098 3098 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3099 3099 3100 3100 if (evmptrld_status == EVMPTRLD_VMFAIL || 3101 - evmptrld_status == EVMPTRLD_ERROR) { 3102 - pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3103 - __func__); 3104 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3105 - vcpu->run->internal.suberror = 3106 - KVM_INTERNAL_ERROR_EMULATION; 3107 - vcpu->run->internal.ndata = 0; 3101 + evmptrld_status == EVMPTRLD_ERROR) 3108 3102 return false; 3109 - } 3110 3103 } 3111 3104 3112 3105 return true; ··· 3187 3194 3188 3195 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3189 3196 { 3190 - if (!nested_get_evmcs_page(vcpu)) 3197 + if (!nested_get_evmcs_page(vcpu)) { 3198 + pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3199 + __func__); 3200 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3201 + vcpu->run->internal.suberror = 3202 + KVM_INTERNAL_ERROR_EMULATION; 3203 + vcpu->run->internal.ndata = 0; 3204 + 3191 3205 return false; 3206 + } 3192 3207 3193 3208 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3194 3209 return false; ··· 4436 4435 /* Similarly, triple faults in L2 should never escape. */ 4437 4436 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4438 4437 4439 - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 4438 + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4439 + /* 4440 + * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4441 + * Enlightened VMCS after migration and we still need to 4442 + * do that when something is forcing L2->L1 exit prior to 4443 + * the first L2 run. 4444 + */ 4445 + (void)nested_get_evmcs_page(vcpu); 4446 + } 4440 4447 4441 4448 /* Service the TLB flush request for L2 before switching to L1. */ 4442 4449 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))

+110 -110

arch/x86/kvm/vmx/vmx.c

··· 455 455 456 456 static unsigned long host_idt_base; 457 457 458 - /* 459 - * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 460 - * will emulate SYSCALL in legacy mode if the vendor string in guest 461 - * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 462 - * support this emulation, IA32_STAR must always be included in 463 - * vmx_uret_msrs_list[], even in i386 builds. 464 - */ 465 - static const u32 vmx_uret_msrs_list[] = { 466 - #ifdef CONFIG_X86_64 467 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 468 - #endif 469 - MSR_EFER, MSR_TSC_AUX, MSR_STAR, 470 - MSR_IA32_TSX_CTRL, 471 - }; 472 - 473 458 #if IS_ENABLED(CONFIG_HYPERV) 474 459 static bool __read_mostly enlightened_vmcs = true; 475 460 module_param(enlightened_vmcs, bool, 0444); ··· 682 697 return r; 683 698 } 684 699 685 - static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 686 - { 687 - int i; 688 - 689 - for (i = 0; i < vmx->nr_uret_msrs; ++i) 690 - if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) 691 - return i; 692 - return -1; 693 - } 694 - 695 700 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 696 701 { 697 702 int i; 698 703 699 - i = __vmx_find_uret_msr(vmx, msr); 704 + i = kvm_find_user_return_msr(msr); 700 705 if (i >= 0) 701 706 return &vmx->guest_uret_msrs[i]; 702 707 return NULL; ··· 695 720 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 696 721 struct vmx_uret_msr *msr, u64 data) 697 722 { 723 + unsigned int slot = msr - vmx->guest_uret_msrs; 698 724 int ret = 0; 699 725 700 726 u64 old_msr_data = msr->data; 701 727 msr->data = data; 702 - if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { 728 + if (msr->load_into_hardware) { 703 729 preempt_disable(); 704 - ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); 730 + ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); 705 731 preempt_enable(); 706 732 if (ret) 707 733 msr->data = old_msr_data; ··· 1054 1078 return false; 
1055 1079 } 1056 1080 1057 - i = __vmx_find_uret_msr(vmx, MSR_EFER); 1081 + i = kvm_find_user_return_msr(MSR_EFER); 1058 1082 if (i < 0) 1059 1083 return false; 1060 1084 ··· 1216 1240 */ 1217 1241 if (!vmx->guest_uret_msrs_loaded) { 1218 1242 vmx->guest_uret_msrs_loaded = true; 1219 - for (i = 0; i < vmx->nr_active_uret_msrs; ++i) 1220 - kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, 1243 + for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1244 + if (!vmx->guest_uret_msrs[i].load_into_hardware) 1245 + continue; 1246 + 1247 + kvm_set_user_return_msr(i, 1221 1248 vmx->guest_uret_msrs[i].data, 1222 1249 vmx->guest_uret_msrs[i].mask); 1223 - 1250 + } 1224 1251 } 1225 1252 1226 1253 if (vmx->nested.need_vmcs12_to_shadow_sync) ··· 1730 1751 vmx_clear_hlt(vcpu); 1731 1752 } 1732 1753 1733 - static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) 1754 + static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1755 + bool load_into_hardware) 1734 1756 { 1735 - struct vmx_uret_msr tmp; 1736 - int from, to; 1757 + struct vmx_uret_msr *uret_msr; 1737 1758 1738 - from = __vmx_find_uret_msr(vmx, msr); 1739 - if (from < 0) 1759 + uret_msr = vmx_find_uret_msr(vmx, msr); 1760 + if (!uret_msr) 1740 1761 return; 1741 - to = vmx->nr_active_uret_msrs++; 1742 1762 1743 - tmp = vmx->guest_uret_msrs[to]; 1744 - vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; 1745 - vmx->guest_uret_msrs[from] = tmp; 1763 + uret_msr->load_into_hardware = load_into_hardware; 1746 1764 } 1747 1765 1748 1766 /* ··· 1749 1773 */ 1750 1774 static void setup_msrs(struct vcpu_vmx *vmx) 1751 1775 { 1752 - vmx->guest_uret_msrs_loaded = false; 1753 - vmx->nr_active_uret_msrs = 0; 1754 1776 #ifdef CONFIG_X86_64 1777 + bool load_syscall_msrs; 1778 + 1755 1779 /* 1756 1780 * The SYSCALL MSRs are only needed on long mode guests, and only 1757 1781 * when EFER.SCE is set. 
1758 1782 */ 1759 - if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { 1760 - vmx_setup_uret_msr(vmx, MSR_STAR); 1761 - vmx_setup_uret_msr(vmx, MSR_LSTAR); 1762 - vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); 1763 - } 1783 + load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1784 + (vmx->vcpu.arch.efer & EFER_SCE); 1785 + 1786 + vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1787 + vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1788 + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1764 1789 #endif 1765 - if (update_transition_efer(vmx)) 1766 - vmx_setup_uret_msr(vmx, MSR_EFER); 1790 + vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1767 1791 1768 - if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) 1769 - vmx_setup_uret_msr(vmx, MSR_TSC_AUX); 1792 + vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1793 + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1794 + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1770 1795 1771 - vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); 1796 + /* 1797 + * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1798 + * kernel and old userspace. If those guests run on a tsx=off host, do 1799 + * allow guests to use TSX_CTRL, but don't change the value in hardware 1800 + * so that TSX remains always disabled. 1801 + */ 1802 + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1772 1803 1773 1804 if (cpu_has_vmx_msr_bitmap()) 1774 1805 vmx_update_msr_bitmap(&vmx->vcpu); 1806 + 1807 + /* 1808 + * The set of MSRs to load may have changed, reload MSRs before the 1809 + * next VM-Enter. 
1810 + */ 1811 + vmx->guest_uret_msrs_loaded = false; 1775 1812 } 1776 1813 1777 1814 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) ··· 1982 1993 else 1983 1994 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 1984 1995 break; 1985 - case MSR_TSC_AUX: 1986 - if (!msr_info->host_initiated && 1987 - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 1988 - return 1; 1989 - goto find_uret_msr; 1990 1996 case MSR_IA32_DEBUGCTLMSR: 1991 1997 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 1992 1998 break; ··· 2014 2030 2015 2031 if (!intel_pmu_lbr_is_enabled(vcpu)) 2016 2032 debugctl &= ~DEBUGCTLMSR_LBR_MASK; 2033 + 2034 + if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) 2035 + debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; 2017 2036 2018 2037 return debugctl; 2019 2038 } ··· 2300 2313 else 2301 2314 vmx->pt_desc.guest.addr_a[index / 2] = data; 2302 2315 break; 2303 - case MSR_TSC_AUX: 2304 - if (!msr_info->host_initiated && 2305 - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 2306 - return 1; 2307 - /* Check reserved bit, higher 32 bits should be zero */ 2308 - if ((data >> 32) != 0) 2309 - return 1; 2310 - goto find_uret_msr; 2311 2316 case MSR_IA32_PERF_CAPABILITIES: 2312 2317 if (data && !vcpu_to_pmu(vcpu)->version) 2313 2318 return 1; ··· 4348 4369 xsaves_enabled, false); 4349 4370 } 4350 4371 4351 - vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); 4372 + /* 4373 + * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4374 + * feature is exposed to the guest. This creates a virtualization hole 4375 + * if both are supported in hardware but only one is exposed to the 4376 + * guest, but letting the guest execute RDTSCP or RDPID when either one 4377 + * is advertised is preferable to emulating the advertised instruction 4378 + * in KVM on #UD, and obviously better than incorrectly injecting #UD. 
4379 + */ 4380 + if (cpu_has_vmx_rdtscp()) { 4381 + bool rdpid_or_rdtscp_enabled = 4382 + guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || 4383 + guest_cpuid_has(vcpu, X86_FEATURE_RDPID); 4384 + 4385 + vmx_adjust_secondary_exec_control(vmx, &exec_control, 4386 + SECONDARY_EXEC_ENABLE_RDTSCP, 4387 + rdpid_or_rdtscp_enabled, false); 4388 + } 4352 4389 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4353 4390 4354 4391 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); ··· 6850 6855 6851 6856 static int vmx_create_vcpu(struct kvm_vcpu *vcpu) 6852 6857 { 6858 + struct vmx_uret_msr *tsx_ctrl; 6853 6859 struct vcpu_vmx *vmx; 6854 6860 int i, cpu, err; 6855 6861 ··· 6873 6877 goto free_vpid; 6874 6878 } 6875 6879 6876 - BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 6877 - 6878 - for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { 6879 - u32 index = vmx_uret_msrs_list[i]; 6880 - u32 data_low, data_high; 6881 - int j = vmx->nr_uret_msrs; 6882 - 6883 - if (rdmsr_safe(index, &data_low, &data_high) < 0) 6884 - continue; 6885 - if (wrmsr_safe(index, data_low, data_high) < 0) 6886 - continue; 6887 - 6888 - vmx->guest_uret_msrs[j].slot = i; 6889 - vmx->guest_uret_msrs[j].data = 0; 6890 - switch (index) { 6891 - case MSR_IA32_TSX_CTRL: 6892 - /* 6893 - * TSX_CTRL_CPUID_CLEAR is handled in the CPUID 6894 - * interception. Keep the host value unchanged to avoid 6895 - * changing CPUID bits under the host kernel's feet. 6896 - * 6897 - * hle=0, rtm=0, tsx_ctrl=1 can be found with some 6898 - * combinations of new kernel and old userspace. If 6899 - * those guests run on a tsx=off host, do allow guests 6900 - * to use TSX_CTRL, but do not change the value on the 6901 - * host so that TSX remains always disabled. 
6902 - */ 6903 - if (boot_cpu_has(X86_FEATURE_RTM)) 6904 - vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 6905 - else 6906 - vmx->guest_uret_msrs[j].mask = 0; 6907 - break; 6908 - default: 6909 - vmx->guest_uret_msrs[j].mask = -1ull; 6910 - break; 6911 - } 6912 - ++vmx->nr_uret_msrs; 6880 + for (i = 0; i < kvm_nr_uret_msrs; ++i) { 6881 + vmx->guest_uret_msrs[i].data = 0; 6882 + vmx->guest_uret_msrs[i].mask = -1ull; 6883 + } 6884 + if (boot_cpu_has(X86_FEATURE_RTM)) { 6885 + /* 6886 + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 6887 + * Keep the host value unchanged to avoid changing CPUID bits 6888 + * under the host kernel's feet. 6889 + */ 6890 + tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 6891 + if (tsx_ctrl) 6892 + vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 6913 6893 } 6914 6894 6915 6895 err = alloc_loaded_vmcs(&vmx->vmcs01); ··· 7316 7344 if (!cpu_has_vmx_xsaves()) 7317 7345 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7318 7346 7319 - /* CPUID 0x80000001 */ 7320 - if (!cpu_has_vmx_rdtscp()) 7347 + /* CPUID 0x80000001 and 0x7 (RDPID) */ 7348 + if (!cpu_has_vmx_rdtscp()) { 7321 7349 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7350 + kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7351 + } 7322 7352 7323 7353 if (cpu_has_vmx_waitpkg()) 7324 7354 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); ··· 7376 7402 /* 7377 7403 * RDPID causes #UD if disabled through secondary execution controls. 7378 7404 * Because it is marked as EmulateOnUD, we need to intercept it here. 7405 + * Note, RDPID is hidden behind ENABLE_RDTSCP. 
7379 7406 */ 7380 - case x86_intercept_rdtscp: 7407 + case x86_intercept_rdpid: 7381 7408 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 7382 7409 exception->vector = UD_VECTOR; 7383 7410 exception->error_code_valid = false; ··· 7744 7769 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, 7745 7770 }; 7746 7771 7772 + static __init void vmx_setup_user_return_msrs(void) 7773 + { 7774 + 7775 + /* 7776 + * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 7777 + * will emulate SYSCALL in legacy mode if the vendor string in guest 7778 + * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 7779 + * support this emulation, MSR_STAR is included in the list for i386, 7780 + * but is never loaded into hardware. MSR_CSTAR is also never loaded 7781 + * into hardware and is here purely for emulation purposes. 7782 + */ 7783 + const u32 vmx_uret_msrs_list[] = { 7784 + #ifdef CONFIG_X86_64 7785 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 7786 + #endif 7787 + MSR_EFER, MSR_TSC_AUX, MSR_STAR, 7788 + MSR_IA32_TSX_CTRL, 7789 + }; 7790 + int i; 7791 + 7792 + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 7793 + 7794 + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 7795 + kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 7796 + } 7797 + 7747 7798 static __init int hardware_setup(void) 7748 7799 { 7749 7800 unsigned long host_bndcfgs; 7750 7801 struct desc_ptr dt; 7751 - int r, i, ept_lpage_level; 7802 + int r, ept_lpage_level; 7752 7803 7753 7804 store_idt(&dt); 7754 7805 host_idt_base = dt.address; 7755 7806 7756 - for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 7757 - kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]); 7807 + vmx_setup_user_return_msrs(); 7758 7808 7759 7809 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 7760 7810 return -EIO;

+10 -2

arch/x86/kvm/vmx/vmx.h

··· 36 36 }; 37 37 38 38 struct vmx_uret_msr { 39 - unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ 39 + bool load_into_hardware; 40 40 u64 data; 41 41 u64 mask; 42 42 }; ··· 245 245 u32 idt_vectoring_info; 246 246 ulong rflags; 247 247 248 + /* 249 + * User return MSRs are always emulated when enabled in the guest, but 250 + * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside 251 + * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to 252 + * be loaded into hardware if those conditions aren't met. 253 + * nr_active_uret_msrs tracks the number of MSRs that need to be loaded 254 + * into hardware when running the guest. guest_uret_msrs[] is resorted 255 + * whenever the number of "active" uret MSRs is modified. 256 + */ 248 257 struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; 249 - int nr_uret_msrs; 250 258 int nr_active_uret_msrs; 251 259 bool guest_uret_msrs_loaded; 252 260 #ifdef CONFIG_X86_64

+112 -41

arch/x86/kvm/x86.c

··· 184 184 */ 185 185 #define KVM_MAX_NR_USER_RETURN_MSRS 16 186 186 187 - struct kvm_user_return_msrs_global { 188 - int nr; 189 - u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; 190 - }; 191 - 192 187 struct kvm_user_return_msrs { 193 188 struct user_return_notifier urn; 194 189 bool registered; ··· 193 198 } values[KVM_MAX_NR_USER_RETURN_MSRS]; 194 199 }; 195 200 196 - static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; 201 + u32 __read_mostly kvm_nr_uret_msrs; 202 + EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); 203 + static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; 197 204 static struct kvm_user_return_msrs __percpu *user_return_msrs; 198 205 199 206 #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ ··· 327 330 user_return_notifier_unregister(urn); 328 331 } 329 332 local_irq_restore(flags); 330 - for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { 333 + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { 331 334 values = &msrs->values[slot]; 332 335 if (values->host != values->curr) { 333 - wrmsrl(user_return_msrs_global.msrs[slot], values->host); 336 + wrmsrl(kvm_uret_msrs_list[slot], values->host); 334 337 values->curr = values->host; 335 338 } 336 339 } 337 340 } 338 341 339 - void kvm_define_user_return_msr(unsigned slot, u32 msr) 342 + static int kvm_probe_user_return_msr(u32 msr) 340 343 { 341 - BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); 342 - user_return_msrs_global.msrs[slot] = msr; 343 - if (slot >= user_return_msrs_global.nr) 344 - user_return_msrs_global.nr = slot + 1; 344 + u64 val; 345 + int ret; 346 + 347 + preempt_disable(); 348 + ret = rdmsrl_safe(msr, &val); 349 + if (ret) 350 + goto out; 351 + ret = wrmsrl_safe(msr, val); 352 + out: 353 + preempt_enable(); 354 + return ret; 345 355 } 346 - EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); 356 + 357 + int kvm_add_user_return_msr(u32 msr) 358 + { 359 + BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); 360 + 361 + if 
(kvm_probe_user_return_msr(msr)) 362 + return -1; 363 + 364 + kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; 365 + return kvm_nr_uret_msrs++; 366 + } 367 + EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); 368 + 369 + int kvm_find_user_return_msr(u32 msr) 370 + { 371 + int i; 372 + 373 + for (i = 0; i < kvm_nr_uret_msrs; ++i) { 374 + if (kvm_uret_msrs_list[i] == msr) 375 + return i; 376 + } 377 + return -1; 378 + } 379 + EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); 347 380 348 381 static void kvm_user_return_msr_cpu_online(void) 349 382 { ··· 382 355 u64 value; 383 356 int i; 384 357 385 - for (i = 0; i < user_return_msrs_global.nr; ++i) { 386 - rdmsrl_safe(user_return_msrs_global.msrs[i], &value); 358 + for (i = 0; i < kvm_nr_uret_msrs; ++i) { 359 + rdmsrl_safe(kvm_uret_msrs_list[i], &value); 387 360 msrs->values[i].host = value; 388 361 msrs->values[i].curr = value; 389 362 } ··· 398 371 value = (value & mask) | (msrs->values[slot].host & ~mask); 399 372 if (value == msrs->values[slot].curr) 400 373 return 0; 401 - err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); 374 + err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); 402 375 if (err) 403 376 return 1; 404 377 ··· 1176 1149 1177 1150 if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) 1178 1151 fixed |= DR6_RTM; 1152 + 1153 + if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) 1154 + fixed |= DR6_BUS_LOCK; 1179 1155 return fixed; 1180 1156 } 1181 1157 ··· 1645 1615 * invokes 64-bit SYSENTER. 1646 1616 */ 1647 1617 data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); 1618 + break; 1619 + case MSR_TSC_AUX: 1620 + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) 1621 + return 1; 1622 + 1623 + if (!host_initiated && 1624 + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && 1625 + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) 1626 + return 1; 1627 + 1628 + /* 1629 + * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has 1630 + * incomplete and conflicting architectural behavior. 
Current 1631 + * AMD CPUs completely ignore bits 63:32, i.e. they aren't 1632 + * reserved and always read as zeros. Enforce Intel's reserved 1633 + * bits check if and only if the guest CPU is Intel, and clear 1634 + * the bits in all other cases. This ensures cross-vendor 1635 + * migration will provide consistent behavior for the guest. 1636 + */ 1637 + if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) 1638 + return 1; 1639 + 1640 + data = (u32)data; 1641 + break; 1648 1642 } 1649 1643 1650 1644 msr.data = data; ··· 1704 1650 1705 1651 if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1706 1652 return KVM_MSR_RET_FILTERED; 1653 + 1654 + switch (index) { 1655 + case MSR_TSC_AUX: 1656 + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) 1657 + return 1; 1658 + 1659 + if (!host_initiated && 1660 + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && 1661 + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) 1662 + return 1; 1663 + break; 1664 + } 1707 1665 1708 1666 msr.index = index; 1709 1667 msr.host_initiated = host_initiated; ··· 5534 5468 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, 5535 5469 struct kvm_msr_filter_range *user_range) 5536 5470 { 5537 - struct msr_bitmap_range range; 5538 5471 unsigned long *bitmap = NULL; 5539 5472 size_t bitmap_size; 5540 - int r; 5541 5473 5542 5474 if (!user_range->nmsrs) 5543 5475 return 0; 5476 + 5477 + if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) 5478 + return -EINVAL; 5479 + 5480 + if (!user_range->flags) 5481 + return -EINVAL; 5544 5482 5545 5483 bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); 5546 5484 if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) ··· 5554 5484 if (IS_ERR(bitmap)) 5555 5485 return PTR_ERR(bitmap); 5556 5486 5557 - range = (struct msr_bitmap_range) { 5487 + msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { 5558 5488 .flags = user_range->flags, 5559 5489 .base = user_range->base, 5560 5490 
.nmsrs = user_range->nmsrs, 5561 5491 .bitmap = bitmap, 5562 5492 }; 5563 5493 5564 - if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { 5565 - r = -EINVAL; 5566 - goto err; 5567 - } 5568 - 5569 - if (!range.flags) { 5570 - r = -EINVAL; 5571 - goto err; 5572 - } 5573 - 5574 - /* Everything ok, add this range identifier. */ 5575 - msr_filter->ranges[msr_filter->count] = range; 5576 5494 msr_filter->count++; 5577 - 5578 5495 return 0; 5579 - err: 5580 - kfree(bitmap); 5581 - return r; 5582 5496 } 5583 5497 5584 5498 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) ··· 5991 5937 continue; 5992 5938 break; 5993 5939 case MSR_TSC_AUX: 5994 - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) 5940 + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && 5941 + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) 5995 5942 continue; 5996 5943 break; 5997 5944 case MSR_IA32_UMWAIT_CONTROL: ··· 8095 8040 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); 8096 8041 8097 8042 /* 8043 + * Indirection to move queue_work() out of the tk_core.seq write held 8044 + * region to prevent possible deadlocks against time accessors which 8045 + * are invoked with work related locks held. 8046 + */ 8047 + static void pvclock_irq_work_fn(struct irq_work *w) 8048 + { 8049 + queue_work(system_long_wq, &pvclock_gtod_work); 8050 + } 8051 + 8052 + static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn); 8053 + 8054 + /* 8098 8055 * Notification about pvclock gtod data update. 8099 8056 */ 8100 8057 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, ··· 8117 8050 8118 8051 update_pvclock_gtod(tk); 8119 8052 8120 - /* disable master clock if host does not trust, or does not 8121 - * use, TSC based clocksource. 8053 + /* 8054 + * Disable master clock if host does not trust, or does not use, 8055 + * TSC based clocksource. Delegate queue_work() to irq_work as 8056 + * this is invoked with tk_core.seq write held. 
8122 8057 */ 8123 8058 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && 8124 8059 atomic_read(&kvm_guest_has_master_clock) != 0) 8125 - queue_work(system_long_wq, &pvclock_gtod_work); 8126 - 8060 + irq_work_queue(&pvclock_irq_work); 8127 8061 return 0; 8128 8062 } 8129 8063 ··· 8186 8118 printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); 8187 8119 goto out_free_x86_emulator_cache; 8188 8120 } 8121 + kvm_nr_uret_msrs = 0; 8189 8122 8190 8123 r = kvm_mmu_module_init(); 8191 8124 if (r) ··· 8237 8168 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); 8238 8169 #ifdef CONFIG_X86_64 8239 8170 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); 8171 + irq_work_sync(&pvclock_irq_work); 8172 + cancel_work_sync(&pvclock_gtod_work); 8240 8173 #endif 8241 8174 kvm_x86_ops.hardware_enable = NULL; 8242 8175 kvm_mmu_module_exit();

+1 -1

fs/btrfs/ctree.h

··· 3127 3127 struct btrfs_inode *inode, u64 new_size, 3128 3128 u32 min_type); 3129 3129 3130 - int btrfs_start_delalloc_snapshot(struct btrfs_root *root); 3130 + int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); 3131 3131 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, 3132 3132 bool in_reclaim_context); 3133 3133 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,

+5 -1

fs/btrfs/extent-tree.c

··· 1340 1340 stripe = bbio->stripes; 1341 1341 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1342 1342 u64 bytes; 1343 + struct btrfs_device *device = stripe->dev; 1343 1344 1344 - if (!stripe->dev->bdev) { 1345 + if (!device->bdev) { 1345 1346 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 1346 1347 continue; 1347 1348 } 1349 + 1350 + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 1351 + continue; 1348 1352 1349 1353 ret = do_discard_extent(stripe, &bytes); 1350 1354 if (!ret) {

+25 -10

fs/btrfs/file.c

··· 2067 2067 return ret; 2068 2068 } 2069 2069 2070 + static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) 2071 + { 2072 + struct btrfs_inode *inode = BTRFS_I(ctx->inode); 2073 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 2074 + 2075 + if (btrfs_inode_in_log(inode, fs_info->generation) && 2076 + list_empty(&ctx->ordered_extents)) 2077 + return true; 2078 + 2079 + /* 2080 + * If we are doing a fast fsync we can not bail out if the inode's 2081 + * last_trans is <= than the last committed transaction, because we only 2082 + * update the last_trans of the inode during ordered extent completion, 2083 + * and for a fast fsync we don't wait for that, we only wait for the 2084 + * writeback to complete. 2085 + */ 2086 + if (inode->last_trans <= fs_info->last_trans_committed && 2087 + (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || 2088 + list_empty(&ctx->ordered_extents))) 2089 + return true; 2090 + 2091 + return false; 2092 + } 2093 + 2070 2094 /* 2071 2095 * fsync call for both files and directories. This logs the inode into 2072 2096 * the tree log instead of forcing full commits whenever possible. ··· 2209 2185 2210 2186 atomic_inc(&root->log_batch); 2211 2187 2212 - /* 2213 - * If we are doing a fast fsync we can not bail out if the inode's 2214 - * last_trans is <= than the last committed transaction, because we only 2215 - * update the last_trans of the inode during ordered extent completion, 2216 - * and for a fast fsync we don't wait for that, we only wait for the 2217 - * writeback to complete. 2218 - */ 2219 2188 smp_mb(); 2220 - if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || 2221 - (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && 2222 - (full_sync || list_empty(&ctx.ordered_extents)))) { 2189 + if (skip_inode_logging(&ctx)) { 2190 + /* 2191 + * We've had everything committed since the last time we were 2192 + * modified so clear this flag in case it was set for whatever

+1 -1

fs/btrfs/free-space-cache.c

··· 3949 3949 { 3950 3950 struct btrfs_block_group *block_group; 3951 3951 struct rb_node *node; 3952 - int ret; 3952 + int ret = 0; 3953 3953 3954 3954 btrfs_info(fs_info, "cleaning free space cache v1"); 3955 3955

+2 -2

fs/btrfs/inode.c

··· 9678 9678 return ret; 9679 9679 } 9680 9680 9681 - int btrfs_start_delalloc_snapshot(struct btrfs_root *root) 9681 + int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) 9682 9682 { 9683 9683 struct writeback_control wbc = { 9684 9684 .nr_to_write = LONG_MAX, ··· 9691 9691 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 9692 9692 return -EROFS; 9693 9693 9694 - return start_delalloc_inodes(root, &wbc, true, false); 9694 + return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); 9695 9695 } 9696 9696 9697 9697 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,

+3 -1

fs/btrfs/ioctl.c

··· 259 259 if (!fa->flags_valid) { 260 260 /* 1 item for the inode */ 261 261 trans = btrfs_start_transaction(root, 1); 262 + if (IS_ERR(trans)) 263 + return PTR_ERR(trans); 262 264 goto update_flags; 263 265 } 264 266 ··· 909 907 */ 910 908 btrfs_drew_read_lock(&root->snapshot_lock); 911 909 912 - ret = btrfs_start_delalloc_snapshot(root); 910 + ret = btrfs_start_delalloc_snapshot(root, false); 913 911 if (ret) 914 912 goto out; 915 913

+1 -1

fs/btrfs/ordered-data.c

··· 984 984 985 985 if (pre) 986 986 ret = clone_ordered_extent(ordered, 0, pre); 987 - if (post) 987 + if (ret == 0 && post) 988 988 ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, 989 989 post); 990 990

+10 -6

fs/btrfs/qgroup.c

··· 3545 3545 struct btrfs_trans_handle *trans; 3546 3546 int ret; 3547 3547 3548 - /* Can't hold an open transaction or we run the risk of deadlocking */ 3549 - ASSERT(current->journal_info == NULL || 3550 - current->journal_info == BTRFS_SEND_TRANS_STUB); 3551 - if (WARN_ON(current->journal_info && 3552 - current->journal_info != BTRFS_SEND_TRANS_STUB)) 3548 + /* 3549 + * Can't hold an open transaction or we run the risk of deadlocking, 3550 + * and can't either be under the context of a send operation (where 3551 + * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that 3552 + * would result in a crash when starting a transaction and does not 3553 + * make sense either (send is a read-only operation). 3554 + */ 3555 + ASSERT(current->journal_info == NULL); 3556 + if (WARN_ON(current->journal_info)) 3553 3557 return 0; 3554 3558 3555 3559 /* ··· 3566 3562 return 0; 3567 3563 } 3568 3564 3569 - ret = btrfs_start_delalloc_snapshot(root); 3565 + ret = btrfs_start_delalloc_snapshot(root, true); 3570 3566 if (ret < 0) 3571 3567 goto out; 3572 3568 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

+2 -2

fs/btrfs/send.c

··· 7170 7170 int i; 7171 7171 7172 7172 if (root) { 7173 - ret = btrfs_start_delalloc_snapshot(root); 7173 + ret = btrfs_start_delalloc_snapshot(root, false); 7174 7174 if (ret) 7175 7175 return ret; 7176 7176 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); ··· 7178 7178 7179 7179 for (i = 0; i < sctx->clone_roots_cnt; i++) { 7180 7180 root = sctx->clone_roots[i].root; 7181 - ret = btrfs_start_delalloc_snapshot(root); 7181 + ret = btrfs_start_delalloc_snapshot(root, false); 7182 7182 if (ret) 7183 7183 return ret; 7184 7184 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);

+2 -1

fs/btrfs/tree-log.c

··· 6061 6061 * (since logging them is pointless, a link count of 0 means they 6062 6062 * will never be accessible). 6063 6063 */ 6064 - if (btrfs_inode_in_log(inode, trans->transid) || 6064 + if ((btrfs_inode_in_log(inode, trans->transid) && 6065 + list_empty(&ctx->ordered_extents)) || 6065 6066 inode->vfs_inode.i_nlink == 0) { 6066 6067 ret = BTRFS_NO_LOG_SYNC; 6067 6068 goto end_no_trans;

+5

fs/btrfs/zoned.c

··· 1126 1126 goto out; 1127 1127 } 1128 1128 1129 + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { 1130 + ret = -EIO; 1131 + goto out; 1132 + } 1133 + 1129 1134 switch (zone.cond) { 1130 1135 case BLK_ZONE_COND_OFFLINE: 1131 1136 case BLK_ZONE_COND_READONLY:

+1

tools/arch/powerpc/include/uapi/asm/errno.h

··· 2 2 #ifndef _ASM_POWERPC_ERRNO_H 3 3 #define _ASM_POWERPC_ERRNO_H 4 4 5 + #undef EDEADLOCK 5 6 #include <asm-generic/errno.h> 6 7 7 8 #undef EDEADLOCK

+8 -1

tools/arch/x86/include/asm/cpufeatures.h

··· 84 84 85 85 /* CPU types for specific tunings: */ 86 86 #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ 87 - #define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ 87 + /* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ 88 88 #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ 89 89 #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ 90 90 #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ ··· 236 236 #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ 237 237 #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ 238 238 #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ 239 + #define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ 240 + #define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ 239 241 240 242 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ 241 243 #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ ··· 292 290 #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ 293 291 #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ 294 292 #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ 293 + #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ 294 + #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ 295 295 296 296 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ 297 297 #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ ··· 340 336 #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ 341 337 #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ 342 338 #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ 339 + #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL 
*/ 343 340 #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ 344 341 345 342 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ ··· 359 354 #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ 360 355 #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ 361 356 #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ 357 + #define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ 362 358 #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ 363 359 #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ 364 360 #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ ··· 380 374 #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ 381 375 #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ 382 376 #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ 377 + #define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ 383 378 #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ 384 379 #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ 385 380 #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */

+4

tools/arch/x86/include/asm/msr-index.h

··· 185 185 #define MSR_PEBS_DATA_CFG 0x000003f2 186 186 #define MSR_IA32_DS_AREA 0x00000600 187 187 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 188 + #define PERF_CAP_METRICS_IDX 15 189 + #define PERF_CAP_PT_IDX 16 190 + 188 191 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 189 192 190 193 #define MSR_IA32_RTIT_CTL 0x00000570 ··· 268 265 #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ 269 266 #define DEBUGCTLMSR_BTF_SHIFT 1 270 267 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ 268 + #define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) 271 269 #define DEBUGCTLMSR_TR (1UL << 6) 272 270 #define DEBUGCTLMSR_BTS (1UL << 7) 273 271 #define DEBUGCTLMSR_BTINT (1UL << 8)

+1

tools/arch/x86/include/uapi/asm/vmx.h

··· 27 27 28 28 29 29 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 30 + #define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 30 31 31 32 #define EXIT_REASON_EXCEPTION_NMI 0 32 33 #define EXIT_REASON_EXTERNAL_INTERRUPT 1

+1 -1

tools/arch/x86/lib/memcpy_64.S

··· 4 4 #include <linux/linkage.h> 5 5 #include <asm/errno.h> 6 6 #include <asm/cpufeatures.h> 7 - #include <asm/alternative-asm.h> 7 + #include <asm/alternative.h> 8 8 #include <asm/export.h> 9 9 10 10 .pushsection .noinstr.text, "ax"

+1 -1

tools/arch/x86/lib/memset_64.S

··· 3 3 4 4 #include <linux/linkage.h> 5 5 #include <asm/cpufeatures.h> 6 - #include <asm/alternative-asm.h> 6 + #include <asm/alternative.h> 7 7 #include <asm/export.h> 8 8 9 9 /*

tools/include/asm/alternative-asm.h tools/include/asm/alternative.h

+10 -1

tools/include/uapi/asm-generic/unistd.h

··· 863 863 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) 864 864 #define __NR_mount_setattr 442 865 865 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) 866 + #define __NR_quotactl_path 443 867 + __SYSCALL(__NR_quotactl_path, sys_quotactl_path) 868 + 869 + #define __NR_landlock_create_ruleset 444 870 + __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) 871 + #define __NR_landlock_add_rule 445 872 + __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) 873 + #define __NR_landlock_restrict_self 446 874 + __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) 866 875 867 876 #undef __NR_syscalls 868 - #define __NR_syscalls 443 877 + #define __NR_syscalls 447 869 878 870 879 /* 871 880 * 32 bit systems traditionally used different

+121 -4

tools/include/uapi/drm/drm.h

··· 625 625 __u64 size; 626 626 }; 627 627 628 + /** 629 + * DRM_CAP_DUMB_BUFFER 630 + * 631 + * If set to 1, the driver supports creating dumb buffers via the 632 + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. 633 + */ 628 634 #define DRM_CAP_DUMB_BUFFER 0x1 635 + /** 636 + * DRM_CAP_VBLANK_HIGH_CRTC 637 + * 638 + * If set to 1, the kernel supports specifying a CRTC index in the high bits of 639 + * &drm_wait_vblank_request.type. 640 + * 641 + * Starting kernel version 2.6.39, this capability is always set to 1. 642 + */ 629 643 #define DRM_CAP_VBLANK_HIGH_CRTC 0x2 644 + /** 645 + * DRM_CAP_DUMB_PREFERRED_DEPTH 646 + * 647 + * The preferred bit depth for dumb buffers. 648 + * 649 + * The bit depth is the number of bits used to indicate the color of a single 650 + * pixel excluding any padding. This is different from the number of bits per 651 + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per 652 + * pixel. 653 + * 654 + * Note that this preference only applies to dumb buffers, it's irrelevant for 655 + * other types of buffers. 656 + */ 630 657 #define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 658 + /** 659 + * DRM_CAP_DUMB_PREFER_SHADOW 660 + * 661 + * If set to 1, the driver prefers userspace to render to a shadow buffer 662 + * instead of directly rendering to a dumb buffer. For best speed, userspace 663 + * should do streaming ordered memory copies into the dumb buffer and never 664 + * read from it. 665 + * 666 + * Note that this preference only applies to dumb buffers, it's irrelevant for 667 + * other types of buffers. 668 + */ 631 669 #define DRM_CAP_DUMB_PREFER_SHADOW 0x4 670 + /** 671 + * DRM_CAP_PRIME 672 + * 673 + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT 674 + * and &DRM_PRIME_CAP_EXPORT. 675 + * 676 + * PRIME buffers are exposed as dma-buf file descriptors. See 677 + * Documentation/gpu/drm-mm.rst, section "PRIME Buffer Sharing". 
678 + */ 632 679 #define DRM_CAP_PRIME 0x5 680 + /** 681 + * DRM_PRIME_CAP_IMPORT 682 + * 683 + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME 684 + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. 685 + */ 633 686 #define DRM_PRIME_CAP_IMPORT 0x1 687 + /** 688 + * DRM_PRIME_CAP_EXPORT 689 + * 690 + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME 691 + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. 692 + */ 634 693 #define DRM_PRIME_CAP_EXPORT 0x2 694 + /** 695 + * DRM_CAP_TIMESTAMP_MONOTONIC 696 + * 697 + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in 698 + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with 699 + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these 700 + * clocks. 701 + * 702 + * Starting from kernel version 2.6.39, the default value for this capability 703 + * is 1. Starting kernel version 4.15, this capability is always set to 1. 704 + */ 635 705 #define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 706 + /** 707 + * DRM_CAP_ASYNC_PAGE_FLIP 708 + * 709 + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC. 710 + */ 636 711 #define DRM_CAP_ASYNC_PAGE_FLIP 0x7 637 - /* 638 - * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight 639 - * combination for the hardware cursor. The intention is that a hardware 640 - * agnostic userspace can query a cursor plane size to use. 712 + /** 713 + * DRM_CAP_CURSOR_WIDTH 714 + * 715 + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid 716 + * width x height combination for the hardware cursor. The intention is that a 717 + * hardware agnostic userspace can query a cursor plane size to use. 641 718 * 642 719 * Note that the cross-driver contract is to merely return a valid size; 643 720 * drivers are free to attach another meaning on top, eg. i915 returns the 644 721 * maximum plane size. 
645 722 */ 646 723 #define DRM_CAP_CURSOR_WIDTH 0x8 724 + /** 725 + * DRM_CAP_CURSOR_HEIGHT 726 + * 727 + * See &DRM_CAP_CURSOR_WIDTH. 728 + */ 647 729 #define DRM_CAP_CURSOR_HEIGHT 0x9 730 + /** 731 + * DRM_CAP_ADDFB2_MODIFIERS 732 + * 733 + * If set to 1, the driver supports supplying modifiers in the 734 + * &DRM_IOCTL_MODE_ADDFB2 ioctl. 735 + */ 648 736 #define DRM_CAP_ADDFB2_MODIFIERS 0x10 737 + /** 738 + * DRM_CAP_PAGE_FLIP_TARGET 739 + * 740 + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and 741 + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in 742 + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP 743 + * ioctl. 744 + */ 649 745 #define DRM_CAP_PAGE_FLIP_TARGET 0x11 746 + /** 747 + * DRM_CAP_CRTC_IN_VBLANK_EVENT 748 + * 749 + * If set to 1, the kernel supports reporting the CRTC ID in 750 + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and 751 + * &DRM_EVENT_FLIP_COMPLETE events. 752 + * 753 + * Starting kernel version 4.12, this capability is always set to 1. 754 + */ 650 755 #define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 756 + /** 757 + * DRM_CAP_SYNCOBJ 758 + * 759 + * If set to 1, the driver supports sync objects. See 760 + * Documentation/gpu/drm-mm.rst, section "DRM Sync Objects". 761 + */ 651 762 #define DRM_CAP_SYNCOBJ 0x13 763 + /** 764 + * DRM_CAP_SYNCOBJ_TIMELINE 765 + * 766 + * If set to 1, the driver supports timeline operations on sync objects. See 767 + * Documentation/gpu/drm-mm.rst, section "DRM Sync Objects". 768 + */ 652 769 #define DRM_CAP_SYNCOBJ_TIMELINE 0x14 653 770 654 771 /* DRM_IOCTL_GET_CAP ioctl argument type */

+1

tools/include/uapi/drm/i915_drm.h

··· 943 943 __u64 offset; 944 944 }; 945 945 946 + /* DRM_IOCTL_I915_GEM_EXECBUFFER was removed in Linux 5.13 */ 946 947 struct drm_i915_gem_execbuffer { 947 948 /** 948 949 * List of buffers to be validated with their relocations to be

+45

tools/include/uapi/linux/kvm.h

··· 1078 1078 #define KVM_CAP_DIRTY_LOG_RING 192 1079 1079 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 1080 1080 #define KVM_CAP_PPC_DAWR1 194 1081 + #define KVM_CAP_SET_GUEST_DEBUG2 195 1082 + #define KVM_CAP_SGX_ATTRIBUTE 196 1083 + #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 1084 + #define KVM_CAP_PTP_KVM 198 1081 1085 1082 1086 #ifdef KVM_CAP_IRQ_ROUTING 1083 1087 ··· 1675 1671 KVM_SEV_CERT_EXPORT, 1676 1672 /* Attestation report */ 1677 1673 KVM_SEV_GET_ATTESTATION_REPORT, 1674 + /* Guest Migration Extension */ 1675 + KVM_SEV_SEND_CANCEL, 1678 1676 1679 1677 KVM_SEV_NR_MAX, 1680 1678 }; ··· 1733 1727 __u8 mnonce[16]; 1734 1728 __u64 uaddr; 1735 1729 __u32 len; 1730 + }; 1731 + 1732 + struct kvm_sev_send_start { 1733 + __u32 policy; 1734 + __u64 pdh_cert_uaddr; 1735 + __u32 pdh_cert_len; 1736 + __u64 plat_certs_uaddr; 1737 + __u32 plat_certs_len; 1738 + __u64 amd_certs_uaddr; 1739 + __u32 amd_certs_len; 1740 + __u64 session_uaddr; 1741 + __u32 session_len; 1742 + }; 1743 + 1744 + struct kvm_sev_send_update_data { 1745 + __u64 hdr_uaddr; 1746 + __u32 hdr_len; 1747 + __u64 guest_uaddr; 1748 + __u32 guest_len; 1749 + __u64 trans_uaddr; 1750 + __u32 trans_len; 1751 + }; 1752 + 1753 + struct kvm_sev_receive_start { 1754 + __u32 handle; 1755 + __u32 policy; 1756 + __u64 pdh_uaddr; 1757 + __u32 pdh_len; 1758 + __u64 session_uaddr; 1759 + __u32 session_len; 1760 + }; 1761 + 1762 + struct kvm_sev_receive_update_data { 1763 + __u64 hdr_uaddr; 1764 + __u32 hdr_len; 1765 + __u64 guest_uaddr; 1766 + __u32 guest_len; 1767 + __u64 trans_uaddr; 1768 + __u32 trans_len; 1736 1769 }; 1737 1770 1738 1771 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)

+21 -5

tools/include/uapi/linux/perf_event.h

··· 127 127 PERF_COUNT_SW_EMULATION_FAULTS = 8, 128 128 PERF_COUNT_SW_DUMMY = 9, 129 129 PERF_COUNT_SW_BPF_OUTPUT = 10, 130 + PERF_COUNT_SW_CGROUP_SWITCHES = 11, 130 131 131 132 PERF_COUNT_SW_MAX, /* non-ABI */ 132 133 }; ··· 327 326 #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ 328 327 #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ 329 328 #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ 329 + #define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ 330 330 331 331 /* 332 332 * Hardware event_id to monitor via a performance monitoring event: ··· 406 404 cgroup : 1, /* include cgroup events */ 407 405 text_poke : 1, /* include text poke events */ 408 406 build_id : 1, /* use build id in mmap2 events */ 409 - __reserved_1 : 29; 407 + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 408 + remove_on_exec : 1, /* event is removed from task on exec */ 409 + sigtrap : 1, /* send synchronous SIGTRAP on event */ 410 + __reserved_1 : 26; 410 411 411 412 union { 412 413 __u32 wakeup_events; /* wakeup every n events */ ··· 461 456 __u16 __reserved_2; 462 457 __u32 aux_sample_size; 463 458 __u32 __reserved_3; 459 + 460 + /* 461 + * User provided data if sigtrap=1, passed back to user via 462 + * siginfo_t::si_perf, e.g. to permit user to identify the event. 
463 + */ 464 + __u64 sig_data; 464 465 }; 465 466 466 467 /* ··· 1182 1171 /** 1183 1172 * PERF_RECORD_AUX::flags bits 1184 1173 */ 1185 - #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ 1186 - #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ 1187 - #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ 1188 - #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ 1174 + #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ 1175 + #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ 1176 + #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ 1177 + #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ 1178 + #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ 1179 + 1180 + /* CoreSight PMU AUX buffer formats */ 1181 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ 1182 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ 1189 1183 1190 1184 #define PERF_FLAG_FD_NO_GROUP (1UL << 0) 1191 1185 #define PERF_FLAG_FD_OUTPUT (1UL << 1)

+4

tools/include/uapi/linux/prctl.h

··· 255 255 # define SYSCALL_DISPATCH_FILTER_ALLOW 0 256 256 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 257 257 258 + /* Set/get enabled arm64 pointer authentication keys */ 259 + #define PR_PAC_SET_ENABLED_KEYS 60 260 + #define PR_PAC_GET_ENABLED_KEYS 61 261 + 258 262 #endif /* _LINUX_PRCTL_H */

+1 -1

tools/kvm/kvm_stat/kvm_stat.txt

··· 111 111 --tracepoints:: 112 112 retrieve statistics from tracepoints 113 113 114 - *z*:: 114 + -z:: 115 115 --skip-zero-records:: 116 116 omit records with all zeros in logging mode 117 117

+1

tools/perf/Makefile.config

··· 540 540 ifdef LIBBPF_DYNAMIC 541 541 ifeq ($(feature-libbpf), 1) 542 542 EXTLIBS += -lbpf 543 + $(call detected,CONFIG_LIBBPF_DYNAMIC) 543 544 else 544 545 dummy := $(error Error: No libbpf devel library found, please install libbpf-devel); 545 546 endif

+1 -1

tools/perf/arch/arm64/util/kvm-stat.c

··· 71 71 .name = "vmexit", 72 72 .ops = &exit_events, 73 73 }, 74 - { NULL }, 74 + { NULL, NULL }, 75 75 }; 76 76 77 77 const char * const kvm_skip_events[] = {

+5

tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl

··· 356 356 439 n64 faccessat2 sys_faccessat2 357 357 440 n64 process_madvise sys_process_madvise 358 358 441 n64 epoll_pwait2 sys_epoll_pwait2 359 + 442 n64 mount_setattr sys_mount_setattr 360 + 443 n64 quotactl_path sys_quotactl_path 361 + 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 362 + 445 n64 landlock_add_rule sys_landlock_add_rule 363 + 446 n64 landlock_restrict_self sys_landlock_restrict_self

+4

tools/perf/arch/powerpc/entry/syscalls/syscall.tbl

··· 522 522 440 common process_madvise sys_process_madvise 523 523 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 524 524 442 common mount_setattr sys_mount_setattr 525 + 443 common quotactl_path sys_quotactl_path 526 + 444 common landlock_create_ruleset sys_landlock_create_ruleset 527 + 445 common landlock_add_rule sys_landlock_add_rule 528 + 446 common landlock_restrict_self sys_landlock_restrict_self

+4

tools/perf/arch/s390/entry/syscalls/syscall.tbl

··· 445 445 440 common process_madvise sys_process_madvise sys_process_madvise 446 446 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 447 447 442 common mount_setattr sys_mount_setattr sys_mount_setattr 448 + 443 common quotactl_path sys_quotactl_path sys_quotactl_path 449 + 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 450 + 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 451 + 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self

+4

tools/perf/arch/x86/entry/syscalls/syscall_64.tbl

··· 364 364 440 common process_madvise sys_process_madvise 365 365 441 common epoll_pwait2 sys_epoll_pwait2 366 366 442 common mount_setattr sys_mount_setattr 367 + 443 common quotactl_path sys_quotactl_path 368 + 444 common landlock_create_ruleset sys_landlock_create_ruleset 369 + 445 common landlock_add_rule sys_landlock_add_rule 370 + 446 common landlock_restrict_self sys_landlock_restrict_self 367 371 368 372 # 369 373 # Due to a historical design error, certain syscalls are numbered differently

+4 -2

tools/perf/pmu-events/jevents.c

··· 1123 1123 mapfile = strdup(fpath); 1124 1124 return 0; 1125 1125 } 1126 - 1127 - pr_info("%s: Ignoring file %s\n", prog, fpath); 1126 + if (is_json_file(bname)) 1127 + pr_debug("%s: ArchStd json is preprocessed %s\n", prog, fpath); 1128 + else 1129 + pr_info("%s: Ignoring file %s\n", prog, fpath); 1128 1130 return 0; 1129 1131 } 1130 1132

+1 -1

tools/perf/tests/attr/base-record

··· 5 5 flags=0|8 6 6 cpu=* 7 7 type=0|1 8 - size=120 8 + size=128 9 9 config=0 10 10 sample_period=* 11 11 sample_type=263

+1 -1

tools/perf/tests/attr/base-stat

··· 5 5 flags=0|8 6 6 cpu=* 7 7 type=0 8 - size=120 8 + size=128 9 9 config=0 10 10 sample_period=0 11 11 sample_type=65536

+1 -1

tools/perf/tests/attr/system-wide-dummy

··· 7 7 pid=-1 8 8 flags=8 9 9 type=1 10 - size=120 10 + size=128 11 11 config=9 12 12 sample_period=4000 13 13 sample_type=455

+7

tools/perf/util/Build

··· 145 145 perf-$(CONFIG_LIBELF) += probe-file.o 146 146 perf-$(CONFIG_LIBELF) += probe-event.o 147 147 148 + ifdef CONFIG_LIBBPF_DYNAMIC 149 + hashmap := 1 150 + endif 148 151 ifndef CONFIG_LIBBPF 152 + hashmap := 1 153 + endif 154 + 155 + ifdef hashmap 149 156 perf-y += hashmap.o 150 157 endif 151 158

+7 -1

tools/perf/util/record.c

··· 157 157 static int record_opts__config_freq(struct record_opts *opts) 158 158 { 159 159 bool user_freq = opts->user_freq != UINT_MAX; 160 + bool user_interval = opts->user_interval != ULLONG_MAX; 160 161 unsigned int max_rate; 161 162 162 - if (opts->user_interval != ULLONG_MAX) 163 + if (user_interval && user_freq) { 164 + pr_err("cannot set frequency and period at the same time\n"); 165 + return -1; 166 + } 167 + 168 + if (user_interval) 163 169 opts->default_interval = opts->user_interval; 164 170 if (user_freq) 165 171 opts->freq = opts->user_freq;

+2 -2

tools/perf/util/session.c

··· 904 904 struct perf_record_record_cpu_map *mask; 905 905 unsigned i; 906 906 907 - data->type = bswap_64(data->type); 907 + data->type = bswap_16(data->type); 908 908 909 909 switch (data->type) { 910 910 case PERF_CPU_MAP__CPUS: ··· 937 937 { 938 938 u64 size; 939 939 940 - size = event->stat_config.nr * sizeof(event->stat_config.data[0]); 940 + size = bswap_64(event->stat_config.nr) * sizeof(event->stat_config.data[0]); 941 941 size += 1; /* nr item itself */ 942 942 mem_bswap_64(&event->stat_config.nr, size); 943 943 }

+2 -2

tools/testing/selftests/kvm/lib/x86_64/handlers.S

··· 54 54 .align 8 55 55 56 56 /* Fetch current address and append it to idt_handlers. */ 57 - current_handler = . 57 + 666 : 58 58 .pushsection .rodata 59 - .quad current_handler 59 + .quad 666b 60 60 .popsection 61 61 62 62 .if ! \has_error

+71 -19

tools/testing/selftests/kvm/x86_64/evmcs_test.c

··· 18 18 #include "vmx.h" 19 19 20 20 #define VCPU_ID 5 21 + #define NMI_VECTOR 2 22 + 23 + static int ud_count; 24 + 25 + void enable_x2apic(void) 26 + { 27 + uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4); 28 + 29 + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | 30 + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); 31 + wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED); 32 + } 33 + 34 + static void guest_ud_handler(struct ex_regs *regs) 35 + { 36 + ud_count++; 37 + regs->rip += 3; /* VMLAUNCH */ 38 + } 39 + 40 + static void guest_nmi_handler(struct ex_regs *regs) 41 + { 42 + } 21 43 22 44 void l2_guest_code(void) 23 45 { ··· 47 25 48 26 GUEST_SYNC(8); 49 27 28 + /* Forced exit to L1 upon restore */ 29 + GUEST_SYNC(9); 30 + 50 31 /* Done, exit to L1 and never come back. */ 51 32 vmcall(); 52 33 } 53 34 54 - void l1_guest_code(struct vmx_pages *vmx_pages) 35 + void guest_code(struct vmx_pages *vmx_pages) 55 36 { 56 37 #define L2_GUEST_STACK_SIZE 64 57 38 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 39 + 40 + enable_x2apic(); 41 + 42 + GUEST_SYNC(1); 43 + GUEST_SYNC(2); 58 44 59 45 enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); 60 46 ··· 85 55 current_evmcs->revision_id = EVMCS_VERSION; 86 56 GUEST_SYNC(6); 87 57 58 + current_evmcs->pin_based_vm_exec_control |= 59 + PIN_BASED_NMI_EXITING; 88 60 GUEST_ASSERT(!vmlaunch()); 89 61 GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); 90 - GUEST_SYNC(9); 62 + 63 + /* 64 + * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is 65 + * up-to-date (RIP points where it should and not at the beginning 66 + * of l2_guest_code()). GUEST_SYNC(9) checks that. 
67 + */ 91 68 GUEST_ASSERT(!vmresume()); 92 - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 69 + 93 70 GUEST_SYNC(10); 94 - } 95 71 96 - void guest_code(struct vmx_pages *vmx_pages) 97 - { 98 - GUEST_SYNC(1); 99 - GUEST_SYNC(2); 100 - 101 - if (vmx_pages) 102 - l1_guest_code(vmx_pages); 103 - 104 - GUEST_DONE(); 72 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 73 + GUEST_SYNC(11); 105 74 106 75 /* Try enlightened vmptrld with an incorrect GPA */ 107 76 evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); 108 77 GUEST_ASSERT(vmlaunch()); 78 + GUEST_ASSERT(ud_count == 1); 79 + GUEST_DONE(); 80 + } 81 + 82 + void inject_nmi(struct kvm_vm *vm) 83 + { 84 + struct kvm_vcpu_events events; 85 + 86 + vcpu_events_get(vm, VCPU_ID, &events); 87 + 88 + events.nmi.pending = 1; 89 + events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; 90 + 91 + vcpu_events_set(vm, VCPU_ID, &events); 109 92 } 110 93 111 94 int main(int argc, char *argv[]) ··· 152 109 vcpu_alloc_vmx(vm, &vmx_pages_gva); 153 110 vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); 154 111 112 + vm_init_descriptor_tables(vm); 113 + vcpu_init_descriptor_tables(vm, VCPU_ID); 114 + vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); 115 + vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler); 116 + 117 + pr_info("Running L1 which uses EVMCS to run L2\n"); 118 + 155 119 for (stage = 1;; stage++) { 156 120 _vcpu_run(vm, VCPU_ID); 157 121 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, ··· 174 124 case UCALL_SYNC: 175 125 break; 176 126 case UCALL_DONE: 177 - goto part1_done; 127 + goto done; 178 128 default: 179 129 TEST_FAIL("Unknown ucall %lu", uc.cmd); 180 130 } ··· 204 154 TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), 205 155 "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", 206 156 (ulong) regs2.rdi, (ulong) regs2.rsi); 157 + 158 + /* Force immediate L2->L1 exit before resuming */ 159 + if (stage == 8) { 160 + pr_info("Injecting NMI into L1 before L2 had a chance to 
run after restore\n"); 161 + inject_nmi(vm); 162 + } 207 163 } 208 164 209 - part1_done: 210 - _vcpu_run(vm, VCPU_ID); 211 - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, 212 - "Unexpected successful VMEnter with invalid eVMCS pointer!"); 213 - 165 + done: 214 166 kvm_vm_free(vm); 215 167 }

+4 -3

virt/kvm/kvm_main.c

··· 2893 2893 if (val < grow_start) 2894 2894 val = grow_start; 2895 2895 2896 - if (val > halt_poll_ns) 2897 - val = halt_poll_ns; 2896 + if (val > vcpu->kvm->max_halt_poll_ns) 2897 + val = vcpu->kvm->max_halt_poll_ns; 2898 2898 2899 2899 vcpu->halt_poll_ns = val; 2900 2900 out: ··· 2973 2973 goto out; 2974 2974 } 2975 2975 poll_end = cur = ktime_get(); 2976 - } while (single_task_running() && ktime_before(cur, stop)); 2976 + } while (single_task_running() && !need_resched() && 2977 + ktime_before(cur, stop)); 2977 2978 } 2978 2979 2979 2980 prepare_to_rcuwait(&vcpu->wait);