Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2

This is a new version of KVM_GET_SREGS / KVM_SET_SREGS.

It has the following changes:
* Has flags for future extensions
* Has the vcpu's PDPTRs, allowing them to be saved/restored on migration.
* Lacks obsolete interrupt bitmap (done now via KVM_SET_VCPU_EVENTS)

A new capability, KVM_CAP_SREGS2, is added to signal to
userspace the availability of this ioctl.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20210607090203.133058-8-mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

authored by

Maxim Levitsky and committed by
Paolo Bonzini
6dba9403 329675dd

+185 -27
+48
Documentation/virt/kvm/api.rst
··· 5034 5034 The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used 5035 5035 with the KVM_XEN_VCPU_GET_ATTR ioctl. 5036 5036 5037 + 5038 + 4.131 KVM_GET_SREGS2 5039 + ------------------ 5040 + 5041 + :Capability: KVM_CAP_SREGS2 5042 + :Architectures: x86 5043 + :Type: vcpu ioctl 5044 + :Parameters: struct kvm_sregs2 (out) 5045 + :Returns: 0 on success, -1 on error 5046 + 5047 + Reads special registers from the vcpu. 5048 + This ioctl (when supported) replaces the KVM_GET_SREGS. 5049 + 5050 + :: 5051 + 5052 + struct kvm_sregs2 { 5053 + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ 5054 + struct kvm_segment cs, ds, es, fs, gs, ss; 5055 + struct kvm_segment tr, ldt; 5056 + struct kvm_dtable gdt, idt; 5057 + __u64 cr0, cr2, cr3, cr4, cr8; 5058 + __u64 efer; 5059 + __u64 apic_base; 5060 + __u64 flags; 5061 + __u64 pdptrs[4]; 5062 + }; 5063 + 5064 + flags values for ``kvm_sregs2``: 5065 + 5066 + ``KVM_SREGS2_FLAGS_PDPTRS_VALID`` 5067 + 5068 + Indicates that the struct contains valid PDPTR values. 5069 + 5070 + 5071 + 4.132 KVM_SET_SREGS2 5072 + ------------------ 5073 + 5074 + :Capability: KVM_CAP_SREGS2 5075 + :Architectures: x86 5076 + :Type: vcpu ioctl 5077 + :Parameters: struct kvm_sregs2 (in) 5078 + :Returns: 0 on success, -1 on error 5079 + 5080 + Writes special registers into the vcpu. 5081 + See KVM_GET_SREGS2 for the data structures. 5082 + This ioctl (when supported) replaces the KVM_SET_SREGS. 5083 + 5084 + 5037 5085 5. The kvm_run structure 5038 5086 ======================== 5039 5087
+13
arch/x86/include/uapi/asm/kvm.h
··· 159 159 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; 160 160 }; 161 161 162 + struct kvm_sregs2 { 163 + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ 164 + struct kvm_segment cs, ds, es, fs, gs, ss; 165 + struct kvm_segment tr, ldt; 166 + struct kvm_dtable gdt, idt; 167 + __u64 cr0, cr2, cr3, cr4, cr8; 168 + __u64 efer; 169 + __u64 apic_base; 170 + __u64 flags; 171 + __u64 pdptrs[4]; 172 + }; 173 + #define KVM_SREGS2_FLAGS_PDPTRS_VALID 1 174 + 162 175 /* for KVM_GET_FPU and KVM_SET_FPU */ 163 176 struct kvm_fpu { 164 177 __u8 fpr[8][16];
+5
arch/x86/kvm/kvm_cache_regs.h
··· 125 125 return vcpu->arch.walk_mmu->pdptrs[index]; 126 126 } 127 127 128 + static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value) 129 + { 130 + vcpu->arch.walk_mmu->pdptrs[index] = value; 131 + } 132 + 128 133 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 129 134 { 130 135 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+115 -27
arch/x86/kvm/x86.c
··· 114 114 static void store_regs(struct kvm_vcpu *vcpu); 115 115 static int sync_regs(struct kvm_vcpu *vcpu); 116 116 117 + static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); 118 + static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); 119 + 117 120 struct kvm_x86_ops kvm_x86_ops __read_mostly; 118 121 EXPORT_SYMBOL_GPL(kvm_x86_ops); 119 122 ··· 820 817 821 818 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); 822 819 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); 823 - 824 820 out: 825 821 826 822 return ret; ··· 3958 3956 case KVM_CAP_SGX_ATTRIBUTE: 3959 3957 #endif 3960 3958 case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: 3959 + case KVM_CAP_SREGS2: 3961 3960 r = 1; 3962 3961 break; 3963 3962 case KVM_CAP_SET_GUEST_DEBUG2: ··· 4873 4870 void __user *argp = (void __user *)arg; 4874 4871 int r; 4875 4872 union { 4873 + struct kvm_sregs2 *sregs2; 4876 4874 struct kvm_lapic_state *lapic; 4877 4875 struct kvm_xsave *xsave; 4878 4876 struct kvm_xcrs *xcrs; ··· 5246 5242 break; 5247 5243 } 5248 5244 #endif 5245 + case KVM_GET_SREGS2: { 5246 + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); 5247 + r = -ENOMEM; 5248 + if (!u.sregs2) 5249 + goto out; 5250 + __get_sregs2(vcpu, u.sregs2); 5251 + r = -EFAULT; 5252 + if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2))) 5253 + goto out; 5254 + r = 0; 5255 + break; 5256 + } 5257 + case KVM_SET_SREGS2: { 5258 + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); 5259 + if (IS_ERR(u.sregs2)) { 5260 + r = PTR_ERR(u.sregs2); 5261 + u.sregs2 = NULL; 5262 + goto out; 5263 + } 5264 + r = __set_sregs2(vcpu, u.sregs2); 5265 + break; 5266 + } 5249 5267 default: 5250 5268 r = -EINVAL; 5251 5269 } ··· 9963 9937 } 9964 9938 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 9965 9939 9966 - static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 9940 + static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 9967 9941 { 9968 9942 struct desc_ptr dt; 
9969 9943 ··· 9996 9970 sregs->cr8 = kvm_get_cr8(vcpu); 9997 9971 sregs->efer = vcpu->arch.efer; 9998 9972 sregs->apic_base = kvm_get_apic_base(vcpu); 9973 + } 9999 9974 10000 - memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); 9975 + static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 9976 + { 9977 + __get_sregs_common(vcpu, sregs); 9978 + 9979 + if (vcpu->arch.guest_state_protected) 9980 + return; 10001 9981 10002 9982 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) 10003 9983 set_bit(vcpu->arch.interrupt.nr, 10004 9984 (unsigned long *)sregs->interrupt_bitmap); 9985 + } 9986 + 9987 + static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) 9988 + { 9989 + int i; 9990 + 9991 + __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); 9992 + 9993 + if (vcpu->arch.guest_state_protected) 9994 + return; 9995 + 9996 + if (is_pae_paging(vcpu)) { 9997 + for (i = 0 ; i < 4 ; i++) 9998 + sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); 9999 + sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; 10000 + } 10005 10001 } 10006 10002 10007 10003 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, ··· 10144 10096 return kvm_is_valid_cr4(vcpu, sregs->cr4); 10145 10097 } 10146 10098 10147 - static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 10099 + static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, 10100 + int *mmu_reset_needed, bool update_pdptrs) 10148 10101 { 10149 10102 struct msr_data apic_base_msr; 10150 - int mmu_reset_needed = 0; 10151 - int pending_vec, max_bits, idx; 10103 + int idx; 10152 10104 struct desc_ptr dt; 10153 - int ret = -EINVAL; 10154 10105 10155 10106 if (!kvm_is_valid_sregs(vcpu, sregs)) 10156 - goto out; 10107 + return -EINVAL; 10157 10108 10158 10109 apic_base_msr.data = sregs->apic_base; 10159 10110 apic_base_msr.host_initiated = true; 10160 10111 if (kvm_set_apic_base(vcpu, &apic_base_msr)) 10161 - goto out; 10112 + return -EINVAL; 10162 
10113 10163 10114 if (vcpu->arch.guest_state_protected) 10164 - goto skip_protected_regs; 10115 + return 0; 10165 10116 10166 10117 dt.size = sregs->idt.limit; 10167 10118 dt.address = sregs->idt.base; ··· 10170 10123 static_call(kvm_x86_set_gdt)(vcpu, &dt); 10171 10124 10172 10125 vcpu->arch.cr2 = sregs->cr2; 10173 - mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; 10126 + *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; 10174 10127 vcpu->arch.cr3 = sregs->cr3; 10175 10128 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 10176 10129 10177 10130 kvm_set_cr8(vcpu, sregs->cr8); 10178 10131 10179 - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 10132 + *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 10180 10133 static_call(kvm_x86_set_efer)(vcpu, sregs->efer); 10181 10134 10182 - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 10135 + *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 10183 10136 static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); 10184 10137 vcpu->arch.cr0 = sregs->cr0; 10185 10138 10186 - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 10139 + *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 10187 10140 static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); 10188 10141 10189 - idx = srcu_read_lock(&vcpu->kvm->srcu); 10190 - if (is_pae_paging(vcpu)) { 10191 - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 10192 - mmu_reset_needed = 1; 10142 + if (update_pdptrs) { 10143 + idx = srcu_read_lock(&vcpu->kvm->srcu); 10144 + if (is_pae_paging(vcpu)) { 10145 + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 10146 + *mmu_reset_needed = 1; 10147 + } 10148 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 10193 10149 } 10194 - srcu_read_unlock(&vcpu->kvm->srcu, idx); 10195 - 10196 - if (mmu_reset_needed) 10197 - kvm_mmu_reset_context(vcpu); 10198 10150 10199 10151 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 10200 10152 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); ··· 10213 10167 !is_protmode(vcpu)) 
10214 10168 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 10215 10169 10216 - skip_protected_regs: 10170 + return 0; 10171 + } 10172 + 10173 + static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 10174 + { 10175 + int pending_vec, max_bits; 10176 + int mmu_reset_needed = 0; 10177 + int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); 10178 + 10179 + if (ret) 10180 + return ret; 10181 + 10182 + if (mmu_reset_needed) 10183 + kvm_mmu_reset_context(vcpu); 10184 + 10217 10185 max_bits = KVM_NR_INTERRUPTS; 10218 10186 pending_vec = find_first_bit( 10219 10187 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 10188 + 10220 10189 if (pending_vec < max_bits) { 10221 10190 kvm_queue_interrupt(vcpu, pending_vec, false); 10222 10191 pr_debug("Set back pending irq %d\n", pending_vec); 10192 + kvm_make_request(KVM_REQ_EVENT, vcpu); 10223 10193 } 10194 + return 0; 10195 + } 10224 10196 10225 - kvm_make_request(KVM_REQ_EVENT, vcpu); 10197 + static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) 10198 + { 10199 + int mmu_reset_needed = 0; 10200 + bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; 10201 + bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && 10202 + !(sregs2->efer & EFER_LMA); 10203 + int i, ret; 10226 10204 10227 - ret = 0; 10228 - out: 10229 - return ret; 10205 + if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) 10206 + return -EINVAL; 10207 + 10208 + if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) 10209 + return -EINVAL; 10210 + 10211 + ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2, 10212 + &mmu_reset_needed, !valid_pdptrs); 10213 + if (ret) 10214 + return ret; 10215 + 10216 + if (valid_pdptrs) { 10217 + for (i = 0; i < 4 ; i++) 10218 + kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); 10219 + 10220 + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); 10221 + mmu_reset_needed = 1; 10222 + } 10223 + if (mmu_reset_needed) 10224 + 
kvm_mmu_reset_context(vcpu); 10225 + return 0; 10230 10226 } 10231 10227 10232 10228 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+4
include/uapi/linux/kvm.h
··· 1084 1084 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 1085 1085 #define KVM_CAP_PTP_KVM 198 1086 1086 #define KVM_CAP_HYPERV_ENFORCE_CPUID 199 1087 + #define KVM_CAP_SREGS2 200 1087 1088 1088 1089 #ifdef KVM_CAP_IRQ_ROUTING 1089 1090 ··· 1622 1621 /* Per-vCPU Xen attributes */ 1623 1622 #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) 1624 1623 #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) 1624 + 1625 + #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) 1626 + #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) 1625 1627 1626 1628 struct kvm_xen_vcpu_attr { 1627 1629 __u16 type;