
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull x86 kvm fixes from Paolo Bonzini:

- Avoid freeing stack-allocated node in kvm_async_pf_queue_task

- Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1
  selftests: kvm: try getting XFD and XSAVE state out of sync
  selftests: kvm: replace numbered sync points with actions
  x86/fpu: Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1
  x86/kvm: Avoid freeing stack-allocated node in kvm_async_pf_queue_task

+139 -65
+29 -3
arch/x86/kernel/fpu/core.c
···
 #ifdef CONFIG_X86_64
 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
 {
+	struct fpstate *fpstate = guest_fpu->fpstate;
+
 	fpregs_lock();
-	guest_fpu->fpstate->xfd = xfd;
-	if (guest_fpu->fpstate->in_use)
-		xfd_update_state(guest_fpu->fpstate);
+
+	/*
+	 * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the
+	 * save state to its initial configuration. Likewise, KVM_GET_XSAVE does
+	 * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1.
+	 *
+	 * If the guest's FPU state is in hardware, just update XFD: the XSAVE
+	 * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1.
+	 *
+	 * If however the guest's FPU state is NOT resident in hardware, clear
+	 * disabled components in XSTATE_BV now, or a subsequent XRSTOR will
+	 * attempt to load disabled components and generate #NM _in the host_.
+	 */
+	if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD))
+		fpstate->regs.xsave.header.xfeatures &= ~xfd;
+
+	fpstate->xfd = xfd;
+	if (fpstate->in_use)
+		xfd_update_state(fpstate);
+
 	fpregs_unlock();
 }
 EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
···
 	}
 
 	if (ustate->xsave.header.xfeatures & ~xcr0)
+		return -EINVAL;
+
+	/*
+	 * Disabled features must be in their initial state, otherwise XRSTOR
+	 * causes an exception.
+	 */
+	if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd))
 		return -EINVAL;
 
 	/*
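Both hunks enforce the same invariant: a component may not be marked present in XSTATE_BV while its XFD bit is set, because a later XRSTOR of that component raises #NM. A minimal standalone sketch of the check (illustrative C, not kernel code; the function name is made up):

#include <stdbool.h>
#include <stdint.h>

/* A component i that is XFD-disabled (xfd bit i set) must be in its init
 * state, i.e. its XSTATE_BV bit must be clear, or XRSTOR of that buffer
 * faults with #NM. This mirrors the WARN_ON_ONCE() added above. */
static bool xstate_bv_consistent_with_xfd(uint64_t xstate_bv, uint64_t xfd)
{
	return (xstate_bv & xfd) == 0;
}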
+16 -3
arch/x86/kernel/kvm.c
···
 	struct swait_queue_head wq;
 	u32 token;
 	int cpu;
+	bool dummy;
 };
 
 static struct kvm_task_sleep_head {
···
 	raw_spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
 	if (e) {
-		/* dummy entry exist -> wake up was delivered ahead of PF */
-		hlist_del(&e->link);
+		struct kvm_task_sleep_node *dummy = NULL;
+
+		/*
+		 * The entry can either be a 'dummy' entry (which is put on the
+		 * list when wake-up happens ahead of APF handling completion)
+		 * or a token from another task which should not be touched.
+		 */
+		if (e->dummy) {
+			hlist_del(&e->link);
+			dummy = e;
+		}
+
 		raw_spin_unlock(&b->lock);
-		kfree(e);
+		kfree(dummy);
 		return false;
 	}
 
 	n->token = token;
 	n->cpu = smp_processor_id();
+	n->dummy = false;
 	init_swait_queue_head(&n->wq);
 	hlist_add_head(&n->link, &b->list);
 	raw_spin_unlock(&b->lock);
···
 	}
 	dummy->token = token;
 	dummy->cpu = smp_processor_id();
+	dummy->dummy = true;
 	init_swait_queue_head(&dummy->wq);
 	hlist_add_head(&dummy->link, &b->list);
 	dummy = NULL;
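For context: the node that kvm_async_pf_queue_task() links into the per-CPU hash bucket normally lives on the waiting task's stack, which is why the wake path may only kfree() nodes it allocated itself (the 'dummy' ones). A simplified, abridged sketch of the waiter side (not the exact kernel code):

void kvm_async_pf_task_wait_schedule(u32 token)
{
	/* On this task's stack -- must never be kfree()d by anyone. */
	struct kvm_task_sleep_node n;

	if (!kvm_async_pf_queue_task(token, &n))
		return;		/* wake-up was already delivered */

	/* ... sleep until apf_task_wake_one() unlinks n and wakes us ... */
}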
+9
arch/x86/kvm/x86.c
···
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 					struct kvm_xsave *guest_xsave)
 {
+	union fpregs_state *xstate = (union fpregs_state *)guest_xsave->region;
+
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+	/*
+	 * For backwards compatibility, do not expect disabled features to be in
+	 * their initial state. XSTATE_BV[i] must still be cleared whenever
+	 * XFD[i]=1, or XRSTOR would cause a #NM.
+	 */
+	xstate->xsave.header.xfeatures &= ~vcpu->arch.guest_fpu.fpstate->xfd;
 
 	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
 					      guest_xsave->region,
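Seen from userspace, the compatibility rule is: a VMM may hand back an XSAVE snapshot taken before the guest set XFD[18]=1, with XSTATE_BV[18] still set, and KVM masks the disabled bits itself rather than letting the eventual XRSTOR raise #NM. A hypothetical VMM-side sketch (vCPU fd setup elided; KVM_GET_XSAVE/KVM_SET_XSAVE are the real ioctls, the helper is made up):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: snapshot a vCPU's XSAVE area and hand it back later. */
static int save_then_restore_xsave(int vcpu_fd)
{
	struct kvm_xsave snap;

	/* Snapshot taken while AMX state is live (XSTATE_BV[18]=1). */
	if (ioctl(vcpu_fd, KVM_GET_XSAVE, &snap))
		return -1;

	/*
	 * The guest may set XFD[18]=1 in the meantime. Handing the stale
	 * buffer back is still fine: kvm_vcpu_ioctl_x86_set_xsave() now
	 * clears the XFD-disabled bits from XSTATE_BV for us.
	 */
	return ioctl(vcpu_fd, KVM_SET_XSAVE, &snap);
}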
+85 -59
tools/testing/selftests/kvm/x86/amx_test.c
···
 		     : : "a"(tile), "d"(0));
 }
 
+static inline int tileloadd_safe(void *tile)
+{
+	return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10",
+			    "a"(tile), "d"(0));
+}
+
 static inline void __tilerelease(void)
 {
 	asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
···
 	}
 }
 
+enum {
+	/* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */
+	TEST_SAVE_TILEDATA = 1,
+
+	/* Check TMM0 against tiledata */
+	TEST_COMPARE_TILEDATA = 2,
+
+	/* Restore TMM0 from earlier save */
+	TEST_RESTORE_TILEDATA = 4,
+
+	/* Full VM save/restore */
+	TEST_SAVE_RESTORE = 8,
+};
+
 static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 						    struct tile_data *tiledata,
 						    struct xstate *xstate)
 {
+	int vector;
+
 	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
 		     this_cpu_has(X86_FEATURE_OSXSAVE));
 	check_xtile_info();
-	GUEST_SYNC(1);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
-	GUEST_SYNC(2);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
-	GUEST_SYNC(3);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/* Check save/restore when trap to userspace */
 	__tileloadd(tiledata);
-	GUEST_SYNC(4);
+	GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+	/* xfd=0x40000, disable amx tiledata */
+	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
+
+	/* host tries setting tiledata while guest XFD is set */
+	GUEST_SYNC(TEST_RESTORE_TILEDATA);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
+
+	wrmsr(MSR_IA32_XFD, 0);
 	__tilerelease();
-	GUEST_SYNC(5);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/*
 	 * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
 	 * the xcomp_bv.
···
 	__xsavec(xstate, XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
 	GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);
+
+	/* #NM test */
 
 	/* xfd=0x40000, disable amx tiledata */
 	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
···
 	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
 	GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));
 
-	GUEST_SYNC(6);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
+
 	/* Trigger #NM exception */
-	__tileloadd(tiledata);
-	GUEST_SYNC(10);
+	vector = tileloadd_safe(tiledata);
+	__GUEST_ASSERT(vector == NM_VECTOR,
+		       "Wanted #NM on tileloadd with XFD[18]=1, got %s",
+		       ex_str(vector));
 
-	GUEST_DONE();
-}
-
-void guest_nm_handler(struct ex_regs *regs)
-{
-	/* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
-	GUEST_SYNC(7);
 	GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
-	GUEST_SYNC(8);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	/* Clear xfd_err */
 	wrmsr(MSR_IA32_XFD_ERR, 0);
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
-	GUEST_SYNC(9);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
+
+	__tileloadd(tiledata);
+	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+	GUEST_DONE();
 }
 
 int main(int argc, char *argv[])
···
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 	struct kvm_x86_state *state;
+	struct kvm_x86_state *tile_state = NULL;
 	int xsave_restore_size;
 	vm_vaddr_t amx_cfg, tiledata, xstate;
 	struct ucall uc;
-	u32 amx_offset;
 	int ret;
···
 
 	vcpu_regs_get(vcpu, &regs1);
 
-	/* Register #NM handler */
-	vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
-
 	/* amx cfg for guest_code */
 	amx_cfg = vm_vaddr_alloc_page(vm);
 	memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
···
 	memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
 	vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);
 
+	int iter = 0;
 	for (;;) {
 		vcpu_run(vcpu);
 		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
···
 			REPORT_GUEST_ASSERT(uc);
 			/* NOT REACHED */
 		case UCALL_SYNC:
-			switch (uc.args[1]) {
-			case 1:
-			case 2:
-			case 3:
-			case 5:
-			case 6:
-			case 7:
-			case 8:
-				fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
-				break;
-			case 4:
-			case 10:
-				fprintf(stderr,
-					"GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
+			++iter;
+			if (uc.args[1] & TEST_SAVE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter);
+				tile_state = vcpu_save_state(vcpu);
+			}
+			if (uc.args[1] & TEST_COMPARE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);
 
 				/* Compacted mode, get amx offset by xsave area
 				 * size subtract 8K amx size.
 				 */
-				amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
-				state = vcpu_save_state(vcpu);
-				void *amx_start = (void *)state->xsave + amx_offset;
+				u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
+				void *amx_start = (void *)tile_state->xsave + amx_offset;
 				void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
 				/* Only check TMM0 register, 1 tile */
 				ret = memcmp(amx_start, tiles_data, TILE_SIZE);
 				TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
+			}
+			if (uc.args[1] & TEST_RESTORE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter);
+				vcpu_xsave_set(vcpu, tile_state->xsave);
+				fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter);
+			}
+			if (uc.args[1] & TEST_SAVE_RESTORE) {
+				fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
+				state = vcpu_save_state(vcpu);
+				memset(&regs1, 0, sizeof(regs1));
+				vcpu_regs_get(vcpu, &regs1);
+
+				kvm_vm_release(vm);
+
+				/* Restore state in a new VM. */
+				vcpu = vm_recreate_with_one_vcpu(vm);
+				vcpu_load_state(vcpu, state);
 				kvm_x86_state_cleanup(state);
-				break;
-			case 9:
-				fprintf(stderr,
-					"GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
-				break;
+
+				memset(&regs2, 0, sizeof(regs2));
+				vcpu_regs_get(vcpu, &regs2);
+				TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+					    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+					    (ulong) regs2.rdi, (ulong) regs2.rsi);
 			}
 			break;
 		case UCALL_DONE:
···
 			TEST_FAIL("Unknown ucall %lu", uc.cmd);
 		}
 
-		state = vcpu_save_state(vcpu);
-		memset(&regs1, 0, sizeof(regs1));
-		vcpu_regs_get(vcpu, &regs1);
-
-		kvm_vm_release(vm);
-
-		/* Restore state in a new VM. */
-		vcpu = vm_recreate_with_one_vcpu(vm);
-		vcpu_load_state(vcpu, state);
-		kvm_x86_state_cleanup(state);
-
-		memset(&regs2, 0, sizeof(regs2));
-		vcpu_regs_get(vcpu, &regs2);
-		TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
-			    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
-			    (ulong) regs2.rdi, (ulong) regs2.rsi);
 	}
 done:
 	kvm_vm_free(vm);