Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"ARM64:

- Fix confusion with implicitly-shifted MDCR_EL2 masks breaking
SPE/TRBE initialization

- Align nested page table walker with the intended memory attribute
combining rules of the architecture

- Prevent userspace from constraining the advertised ASID width,
avoiding horrors of guest TLBIs not matching the intended context
in hardware

- Don't leak references on LPIs when insertion into the translation
cache fails

RISC-V:

- Replace csr_write() with csr_set() for HVIEN PMU overflow bit

x86:

- Cache CPUID.0xD XSTATE offsets+sizes during module init

On Intel's Emerald Rapids, CPUID costs hundreds of cycles and there
are a lot of leaves under 0xD. Getting rid of the CPUIDs during
nested VM-Enter and VM-Exit is planned for the next release; for
now, just cache them: even on Skylake that is 40% faster"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86: Cache CPUID.0xD XSTATE offsets+sizes during module init
RISC-V: KVM: Fix csr_write -> csr_set for HVIEN PMU overflow bit
KVM: arm64: vgic-its: Add error handling in vgic_its_cache_translation
KVM: arm64: Do not allow ID_AA64MMFR0_EL1.ASIDbits to be overridden
KVM: arm64: Fix S1/S2 combination when FWB==1 and S2 has Device memory type
arm64: Fix usage of new shifted MDCR_EL2 values

+58 -16
+2 -2
arch/arm64/include/asm/el2_setup.h
@@ -87,7 +87,7 @@
               1 << PMSCR_EL2_PA_SHIFT)
        msr_s   SYS_PMSCR_EL2, x0               // addresses and physical counter
 .Lskip_spe_el2_\@:
-       mov     x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+       mov     x0, #MDCR_EL2_E2PB_MASK
        orr     x2, x2, x0                      // If we don't have VHE, then
                                                // use EL1&0 translation.

@@ -100,7 +100,7 @@
        and     x0, x0, TRBIDR_EL1_P
        cbnz    x0, .Lskip_trace_\@             // If TRBE is available at EL2

-       mov     x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+       mov     x0, #MDCR_EL2_E2TB_MASK
        orr     x2, x2, x0                      // allow the EL1&0 translation
                                                // to own it.

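The common thread in the two hunks above (and in the hyp-stub.S and pkvm.c changes below) is that the MDCR_EL2 E2PB/E2TB field masks are now generated already shifted into their bit positions, so keeping the old "shift the mask by *_SHIFT" idiom moved the value out of the field and the intended E2PB/E2TB bits were never actually set, which is what broke SPE/TRBE initialization. A minimal userspace C sketch of the double-shift pitfall; the constants below are illustrative stand-ins, not the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins: E2PB occupies MDCR_EL2[13:12]. */
#define E2PB_SHIFT              12
#define E2PB_MASK_UNSHIFTED     0x3ULL                  /* old-style field mask   */
#define E2PB_MASK_PRESHIFTED    (0x3ULL << E2PB_SHIFT)  /* new-style, pre-shifted */

int main(void)
{
        /* Old idiom with the old mask: bits 13:12 are set, as intended. */
        uint64_t ok = E2PB_MASK_UNSHIFTED << E2PB_SHIFT;

        /*
         * Old idiom with the new mask: the value is shifted twice and
         * lands in bits 25:24, leaving E2PB itself zero.
         */
        uint64_t broken = E2PB_MASK_PRESHIFTED << E2PB_SHIFT;

        /* New idiom: the pre-shifted mask is used as-is. */
        uint64_t fixed = E2PB_MASK_PRESHIFTED;

        printf("ok=%#llx broken=%#llx fixed=%#llx\n",
               (unsigned long long)ok, (unsigned long long)broken,
               (unsigned long long)fixed);
        return 0;
}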
+2 -2
arch/arm64/kernel/hyp-stub.S
@@ -114,8 +114,8 @@

        // Use EL2 translations for SPE & TRBE and disable access from EL1
        mrs     x0, mdcr_el2
-       bic     x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
-       bic     x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+       bic     x0, x0, #MDCR_EL2_E2PB_MASK
+       bic     x0, x0, #MDCR_EL2_E2TB_MASK
        msr     mdcr_el2, x0

        // Transfer the MM state from EL1 to EL2
+9 -2
arch/arm64/kvm/at.c
@@ -739,8 +739,15 @@
                        final_attr = s1_parattr;
                        break;
                default:
-                       /* MemAttr[2]=0, Device from S2 */
-                       final_attr = s2_memattr & GENMASK(1,0) << 2;
+                       /*
+                        * MemAttr[2]=0, Device from S2.
+                        *
+                        * FWB does not influence the way that stage 1
+                        * memory types and attributes are combined
+                        * with stage 2 Device type and attributes.
+                        */
+                       final_attr = min(s2_memattr_to_attr(s2_memattr),
+                                        s1_parattr);
                }
        } else {
                /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
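Why min() is the right combiner here: the replaced expression ignored the stage 1 attribute entirely (and, as a C precedence detail, the shift binds tighter than the bitwise AND, so it reduced to s2_memattr & 0xc rather than moving MemAttr[1:0] into place). The architecture wants the more restrictive Device type of the two stages, and in the MAIR-style encoding the more restrictive Device type is the numerically smaller value, so taking the minimum of the converted stage 2 attribute and the stage 1 attribute captures that rule. A self-contained sketch; s2_memattr_to_attr_sketch() is an assumption standing in for the in-kernel helper:

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch only: MAIR-style Device attribute encodings, where a numerically
 * smaller value is the more restrictive Device type.
 */
#define ATTR_DEVICE_nGnRnE      0x00
#define ATTR_DEVICE_nGnRE       0x04
#define ATTR_DEVICE_nGRE        0x08
#define ATTR_DEVICE_GRE         0x0c

/*
 * Assumed behaviour of the in-kernel s2_memattr_to_attr(): move the stage 2
 * MemAttr[1:0] Device code into the MAIR-style position used above.
 */
static uint8_t s2_memattr_to_attr_sketch(uint8_t s2_memattr)
{
        return (uint8_t)((s2_memattr & 0x3) << 2);
}

/* For Device memory, the more restrictive (smaller) attribute wins. */
static uint8_t combine_device(uint8_t s1_parattr, uint8_t s2_memattr)
{
        uint8_t s2_attr = s2_memattr_to_attr_sketch(s2_memattr);

        return s2_attr < s1_parattr ? s2_attr : s1_parattr;
}

int main(void)
{
        /*
         * S1 says Device-GRE, S2 says Device-nGnRE (MemAttr[1:0] = 0b01):
         * the combination is the stricter Device-nGnRE (0x04).
         */
        printf("%#x\n", combine_device(ATTR_DEVICE_GRE, 0x1));
        return 0;
}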
+2 -2
arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -126,7 +126,7 @@
        /* Trap SPE */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) {
                mdcr_set |= MDCR_EL2_TPMS;
-               mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+               mdcr_clear |= MDCR_EL2_E2PB_MASK;
        }

        /* Trap Trace Filter */
@@ -143,7 +143,7 @@

        /* Trap External Trace */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids))
-               mdcr_clear |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
+               mdcr_clear |= MDCR_EL2_E2TB_MASK;

        vcpu->arch.mdcr_el2 |= mdcr_set;
        vcpu->arch.mdcr_el2 &= ~mdcr_clear;
+2 -1
arch/arm64/kvm/sys_regs.c
@@ -2618,7 +2618,8 @@
        ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 |
                                        ID_AA64MMFR0_EL1_TGRAN4_2 |
                                        ID_AA64MMFR0_EL1_TGRAN64_2 |
-                                       ID_AA64MMFR0_EL1_TGRAN16_2)),
+                                       ID_AA64MMFR0_EL1_TGRAN16_2 |
+                                       ID_AA64MMFR0_EL1_ASIDBITS)),
        ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
                                        ID_AA64MMFR1_EL1_HCX |
                                        ID_AA64MMFR1_EL1_TWED |
+11 -1
arch/arm64/kvm/vgic/vgic-its.c
@@ -608,12 +608,22 @@
        lockdep_assert_held(&its->its_lock);
        vgic_get_irq_kref(irq);

+       old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
+
+       /*
+        * Put the reference taken on @irq if the store fails. Intentionally do
+        * not return the error as the translation cache is best effort.
+        */
+       if (xa_is_err(old)) {
+               vgic_put_irq(kvm, irq);
+               return;
+       }
+
        /*
         * We could have raced with another CPU caching the same
         * translation behind our back, ensure we don't leak a
         * reference if that is the case.
         */
-       old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
        if (old)
                vgic_put_irq(kvm, old);
 }
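The pattern being applied: xa_store() returns either the entry previously stored at that index or, if the store could not allocate memory, an error encoded in the returned pointer, which xa_is_err() detects (and xa_err() would decode); both outcomes carry a reference that has to be released. A kernel-style sketch of the same pattern with a hypothetical refcounted type; my_obj and my_obj_put() are placeholders, not KVM code:

#include <linux/xarray.h>

/* Hypothetical refcounted object and its reference-drop helper (not KVM code). */
struct my_obj;
void my_obj_put(struct my_obj *obj);

/*
 * Best-effort cache insertion. The caller is assumed to have taken a
 * reference on @obj on behalf of the cache before calling this.
 */
static void cache_insert(struct xarray *cache, unsigned long key,
                         struct my_obj *obj)
{
        struct my_obj *old;

        old = xa_store(cache, key, obj, GFP_KERNEL);

        /* Allocation failure: drop the reference meant for the cache entry. */
        if (xa_is_err(old)) {
                my_obj_put(obj);
                return;
        }

        /* Displaced a concurrent insertion: release the old entry's reference. */
        if (old)
                my_obj_put(old);
}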
+1 -1
arch/riscv/kvm/aia.c
@@ -590,7 +590,7 @@
        csr_set(CSR_HIE, BIT(IRQ_S_GEXT));
        /* Enable IRQ filtering for overflow interrupt only if sscofpmf is present */
        if (__riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSCOFPMF))
-               csr_write(CSR_HVIEN, BIT(IRQ_PMU_OVF));
+               csr_set(CSR_HVIEN, BIT(IRQ_PMU_OVF));
 }

 void kvm_riscv_aia_disable(void)
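The one-word change matters because csr_write() expands to a csrw, which replaces the whole CSR, while csr_set() expands to a csrs, which only ORs the requested bits in; a plain write to HVIEN would therefore discard any bits already delegated there. A toy userspace model of the difference; the bit positions are illustrative, not the real HVIEN layout:

#include <stdint.h>
#include <stdio.h>

/* Toy model of a CSR; bit positions are illustrative only. */
#define BIT_ALREADY_DELEGATED   (1ULL << 3)
#define BIT_PMU_OVF             (1ULL << 13)

static uint64_t hvien;

/* csr_write()-like: csrw replaces the entire register. */
static void toy_csr_write(uint64_t val) { hvien = val; }

/* csr_set()-like: csrs ORs the requested bits in, preserving the rest. */
static void toy_csr_set(uint64_t bits) { hvien |= bits; }

int main(void)
{
        hvien = BIT_ALREADY_DELEGATED;
        toy_csr_write(BIT_PMU_OVF);
        printf("after write: %#llx (delegated bit lost)\n",
               (unsigned long long)hvien);

        hvien = BIT_ALREADY_DELEGATED;
        toy_csr_set(BIT_PMU_OVF);
        printf("after set:   %#llx (both bits present)\n",
               (unsigned long long)hvien);
        return 0;
}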
+26 -5
arch/x86/kvm/cpuid.c
@@ -36,6 +36,26 @@
 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);

+struct cpuid_xstate_sizes {
+       u32 eax;
+       u32 ebx;
+       u32 ecx;
+};
+
+static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
+
+void __init kvm_init_xstate_sizes(void)
+{
+       u32 ign;
+       int i;
+
+       for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
+               struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+
+               cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign);
+       }
+}
+
 u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
        int feature_bit = 0;
@@ -44,14 +64,15 @@
        xstate_bv &= XFEATURE_MASK_EXTEND;
        while (xstate_bv) {
                if (xstate_bv & 0x1) {
-                       u32 eax, ebx, ecx, edx, offset;
-                       cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+                       struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit];
+                       u32 offset;
+
                        /* ECX[1]: 64B alignment in compacted form */
                        if (compacted)
-                               offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+                               offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
                        else
-                               offset = ebx;
-                       ret = max(ret, offset + eax);
+                               offset = xs->ebx;
+                       ret = max(ret, offset + xs->eax);
                }

                xstate_bv >>= 1;
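The shape of the change is "probe the expensive CPUID leaves once at init, then serve lookups from a table". For reference, the same structure can be reproduced in userspace with the compiler's __get_cpuid_count() helper; the loop bound and the fields cached below are illustrative choices, whereas KVM sizes its table by XFEATURE_MAX and also caches ECX for the alignment bit:

#include <cpuid.h>
#include <stdio.h>

#define NR_XSTATE_LEAVES 19     /* illustrative bound; KVM uses XFEATURE_MAX */

/* Cached size (EAX) and offset (EBX) of each CPUID.0xD sub-leaf. */
static unsigned int xstate_size[NR_XSTATE_LEAVES];
static unsigned int xstate_offset[NR_XSTATE_LEAVES];

/* Issue the expensive CPUID queries exactly once, at startup. */
static void init_xstate_table(void)
{
        unsigned int eax, ebx, ecx, edx;

        /*
         * Sub-leaves 0 and 1 describe the XSAVE area as a whole rather than
         * an individual feature; start at 2, as KVM does (XFEATURE_YMM).
         */
        for (unsigned int i = 2; i < NR_XSTATE_LEAVES; i++) {
                if (!__get_cpuid_count(0xD, i, &eax, &ebx, &ecx, &edx))
                        continue;
                xstate_size[i] = eax;
                xstate_offset[i] = ebx;
        }
}

int main(void)
{
        init_xstate_table();

        for (unsigned int i = 2; i < NR_XSTATE_LEAVES; i++)
                if (xstate_size[i])
                        printf("xstate %2u: offset %4u size %4u\n",
                               i, xstate_offset[i], xstate_size[i]);
        return 0;
}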
+1
arch/x86/kvm/cpuid.h
@@ -31,6 +31,7 @@
 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
               u32 *ecx, u32 *edx, bool exact_only);

+void __init kvm_init_xstate_sizes(void);
 u32 xstate_required_size(u64 xstate_bv, bool compacted);

 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+2
arch/x86/kvm/x86.c
··· 13997 13997 13998 13998 static int __init kvm_x86_init(void) 13999 13999 { 14000 + kvm_init_xstate_sizes(); 14001 + 14000 14002 kvm_mmu_x86_module_init(); 14001 14003 mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); 14002 14004 return 0;