Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"Bugfixes, including a TLB flush fix that affects processors without
nested page tables"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: fix previous commit for 32-bit builds
  kvm: avoid speculation-based attacks from out-of-range memslot accesses
  KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync
  KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message
  selftests: kvm: Add support for customized slot0 memory size
  KVM: selftests: introduce P47V64 for s390x
  KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior
  KVM: X86: MMU: Use the correct inherited permissions to get shadow page
  KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer
  KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca821cee

+105 -39
+2 -2
Documentation/virt/kvm/mmu.rst
··· 171 171 shadow pages) so role.quadrant takes values in the range 0..3. Each 172 172 quadrant maps 1GB virtual address space. 173 173 role.access: 174 - Inherited guest access permissions in the form uwx. Note execute 175 - permission is positive, not negative. 174 + Inherited guest access permissions from the parent ptes in the form uwx. 175 + Note execute permission is positive, not negative. 176 176 role.invalid: 177 177 The page is invalid and should not be used. It is a root page that is 178 178 currently pinned (by a cpu hardware register pointing to it); once it is
+11 -6
arch/x86/kvm/lapic.c
··· 1494 1494 1495 1495 static void cancel_hv_timer(struct kvm_lapic *apic); 1496 1496 1497 + static void cancel_apic_timer(struct kvm_lapic *apic) 1498 + { 1499 + hrtimer_cancel(&apic->lapic_timer.timer); 1500 + preempt_disable(); 1501 + if (apic->lapic_timer.hv_timer_in_use) 1502 + cancel_hv_timer(apic); 1503 + preempt_enable(); 1504 + } 1505 + 1497 1506 static void apic_update_lvtt(struct kvm_lapic *apic) 1498 1507 { 1499 1508 u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & ··· 1511 1502 if (apic->lapic_timer.timer_mode != timer_mode) { 1512 1503 if (apic_lvtt_tscdeadline(apic) != (timer_mode == 1513 1504 APIC_LVT_TIMER_TSCDEADLINE)) { 1514 - hrtimer_cancel(&apic->lapic_timer.timer); 1515 - preempt_disable(); 1516 - if (apic->lapic_timer.hv_timer_in_use) 1517 - cancel_hv_timer(apic); 1518 - preempt_enable(); 1505 + cancel_apic_timer(apic); 1519 1506 kvm_lapic_set_reg(apic, APIC_TMICT, 0); 1520 1507 apic->lapic_timer.period = 0; 1521 1508 apic->lapic_timer.tscdeadline = 0; ··· 2097 2092 if (apic_lvtt_tscdeadline(apic)) 2098 2093 break; 2099 2094 2100 - hrtimer_cancel(&apic->lapic_timer.timer); 2095 + cancel_apic_timer(apic); 2101 2096 kvm_lapic_set_reg(apic, APIC_TMICT, val); 2102 2097 start_apic_timer(apic); 2103 2098 break;
+9 -5
arch/x86/kvm/mmu/paging_tmpl.h
··· 90 90 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 91 91 pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; 92 92 bool pte_writable[PT_MAX_FULL_LEVELS]; 93 - unsigned pt_access; 94 - unsigned pte_access; 93 + unsigned int pt_access[PT_MAX_FULL_LEVELS]; 94 + unsigned int pte_access; 95 95 gfn_t gfn; 96 96 struct x86_exception fault; 97 97 }; ··· 418 418 } 419 419 420 420 walker->ptes[walker->level - 1] = pte; 421 + 422 + /* Convert to ACC_*_MASK flags for struct guest_walker. */ 423 + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); 421 424 } while (!is_last_gpte(mmu, walker->level, pte)); 422 425 423 426 pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); 424 427 accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; 425 428 426 429 /* Convert to ACC_*_MASK flags for struct guest_walker. */ 427 - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); 428 430 walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); 429 431 errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); 430 432 if (unlikely(errcode)) ··· 465 463 } 466 464 467 465 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 468 - __func__, (u64)pte, walker->pte_access, walker->pt_access); 466 + __func__, (u64)pte, walker->pte_access, 467 + walker->pt_access[walker->level - 1]); 469 468 return 1; 470 469 471 470 error: ··· 646 643 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; 647 644 struct kvm_mmu_page *sp = NULL; 648 645 struct kvm_shadow_walk_iterator it; 649 - unsigned direct_access, access = gw->pt_access; 646 + unsigned int direct_access, access; 650 647 int top_level, level, req_level, ret; 651 648 gfn_t base_gfn = gw->gfn; 652 649 ··· 678 675 sp = NULL; 679 676 if (!is_shadow_present_pte(*it.sptep)) { 680 677 table_gfn = gw->table_gfn[it.level - 2]; 678 + access = gw->pt_access[it.level - 2]; 681 679 sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, 682 680 false, access); 683 681 }
+2 -4
arch/x86/kvm/svm/sev.c
··· 1103 1103 struct sev_data_send_start data; 1104 1104 int ret; 1105 1105 1106 + memset(&data, 0, sizeof(data)); 1106 1107 data.handle = sev->handle; 1107 1108 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1108 - if (ret < 0) 1109 - return ret; 1110 1109 1111 1110 params->session_len = data.session_len; 1112 1111 if (copy_to_user((void __user *)(uintptr_t)argp->data, params, ··· 1214 1215 struct sev_data_send_update_data data; 1215 1216 int ret; 1216 1217 1218 + memset(&data, 0, sizeof(data)); 1217 1219 data.handle = sev->handle; 1218 1220 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1219 - if (ret < 0) 1220 - return ret; 1221 1221 1222 1222 params->hdr_len = data.hdr_len; 1223 1223 params->trans_len = data.trans_len;
+3 -3
arch/x86/kvm/trace.h
··· 1550 1550 TP_ARGS(msg, err), 1551 1551 1552 1552 TP_STRUCT__entry( 1553 - __field(const char *, msg) 1553 + __string(msg, msg) 1554 1554 __field(u32, err) 1555 1555 ), 1556 1556 1557 1557 TP_fast_assign( 1558 - __entry->msg = msg; 1558 + __assign_str(msg, msg); 1559 1559 __entry->err = err; 1560 1560 ), 1561 1561 1562 - TP_printk("%s%s", __entry->msg, !__entry->err ? "" : 1562 + TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : 1563 1563 __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) 1564 1564 ); 1565 1565
+17 -2
arch/x86/kvm/x86.c
··· 3072 3072 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) 3073 3073 { 3074 3074 ++vcpu->stat.tlb_flush; 3075 + 3076 + if (!tdp_enabled) { 3077 + /* 3078 + * A TLB flush on behalf of the guest is equivalent to 3079 + * INVPCID(all), toggling CR4.PGE, etc., which requires 3080 + * a forced sync of the shadow page tables. Unload the 3081 + * entire MMU here and the subsequent load will sync the 3082 + * shadow page tables, and also flush the TLB. 3083 + */ 3084 + kvm_mmu_unload(vcpu); 3085 + return; 3086 + } 3087 + 3075 3088 static_call(kvm_x86_tlb_flush_guest)(vcpu); 3076 3089 } 3077 3090 ··· 3114 3101 * expensive IPIs. 3115 3102 */ 3116 3103 if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { 3104 + u8 st_preempted = xchg(&st->preempted, 0); 3105 + 3117 3106 trace_kvm_pv_tlb_flush(vcpu->vcpu_id, 3118 - st->preempted & KVM_VCPU_FLUSH_TLB); 3119 - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) 3107 + st_preempted & KVM_VCPU_FLUSH_TLB); 3108 + if (st_preempted & KVM_VCPU_FLUSH_TLB) 3120 3109 kvm_vcpu_flush_tlb_guest(vcpu); 3121 3110 } else { 3122 3111 st->preempted = 0;
+9 -1
include/linux/kvm_host.h
··· 1185 1185 static inline unsigned long 1186 1186 __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) 1187 1187 { 1188 - return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; 1188 + /* 1189 + * The index was checked originally in search_memslots. To avoid 1190 + * that a malicious guest builds a Spectre gadget out of e.g. page 1191 + * table walks, do not let the processor speculate loads outside 1192 + * the guest's registered memslots. 1193 + */ 1194 + unsigned long offset = gfn - slot->base_gfn; 1195 + offset = array_index_nospec(offset, slot->npages); 1196 + return slot->userspace_addr + offset * PAGE_SIZE; 1189 1197 } 1190 1198 1191 1199 static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
+6 -4
tools/testing/selftests/kvm/include/kvm_util.h
··· 43 43 VM_MODE_P40V48_4K, 44 44 VM_MODE_P40V48_64K, 45 45 VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ 46 + VM_MODE_P47V64_4K, 46 47 NUM_VM_MODES, 47 48 }; 48 49 ··· 61 60 62 61 #elif defined(__s390x__) 63 62 64 - #define VM_MODE_DEFAULT VM_MODE_P52V48_4K 63 + #define VM_MODE_DEFAULT VM_MODE_P47V64_4K 65 64 #define MIN_PAGE_SHIFT 12U 66 65 #define ptes_per_page(page_size) ((page_size) / 16) 67 66 ··· 286 285 uint32_t num_percpu_pages, void *guest_code, 287 286 uint32_t vcpuids[]); 288 287 289 - /* Like vm_create_default_with_vcpus, but accepts mode as a parameter */ 288 + /* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ 290 289 struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, 291 - uint64_t extra_mem_pages, uint32_t num_percpu_pages, 292 - void *guest_code, uint32_t vcpuids[]); 290 + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, 291 + uint32_t num_percpu_pages, void *guest_code, 292 + uint32_t vcpuids[]); 293 293 294 294 /* 295 295 * Adds a vCPU with reasonable defaults (e.g. a stack)
+1 -1
tools/testing/selftests/kvm/kvm_page_table_test.c
··· 268 268 269 269 /* Create a VM with enough guest pages */ 270 270 guest_num_pages = test_mem_size / guest_page_size; 271 - vm = vm_create_with_vcpus(mode, nr_vcpus, 271 + vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, 272 272 guest_num_pages, 0, guest_code, NULL); 273 273 274 274 /* Align down GPA of the testing memslot */
+43 -9
tools/testing/selftests/kvm/lib/kvm_util.c
··· 175 175 [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", 176 176 [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", 177 177 [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", 178 + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", 178 179 }; 179 180 _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, 180 181 "Missing new mode strings?"); ··· 193 192 { 40, 48, 0x1000, 12 }, 194 193 { 40, 48, 0x10000, 16 }, 195 194 { 0, 0, 0x1000, 12 }, 195 + { 47, 64, 0x1000, 12 }, 196 196 }; 197 197 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, 198 198 "Missing new mode params?"); ··· 279 277 TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); 280 278 #endif 281 279 break; 280 + case VM_MODE_P47V64_4K: 281 + vm->pgtable_levels = 5; 282 + break; 282 283 default: 283 284 TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); 284 285 } ··· 313 308 return vm; 314 309 } 315 310 311 + /* 312 + * VM Create with customized parameters 313 + * 314 + * Input Args: 315 + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) 316 + * nr_vcpus - VCPU count 317 + * slot0_mem_pages - Slot0 physical memory size 318 + * extra_mem_pages - Non-slot0 physical memory total size 319 + * num_percpu_pages - Per-cpu physical memory pages 320 + * guest_code - Guest entry point 321 + * vcpuids - VCPU IDs 322 + * 323 + * Output Args: None 324 + * 325 + * Return: 326 + * Pointer to opaque structure that describes the created VM. 327 + * 328 + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K), 329 + * with customized slot0 memory size, at least 512 pages currently. 330 + * extra_mem_pages is only used to calculate the maximum page table size, 331 + * no real memory allocation for non-slot0 memory in this function. 
332 + */ 316 333 struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, 317 - uint64_t extra_mem_pages, uint32_t num_percpu_pages, 318 - void *guest_code, uint32_t vcpuids[]) 334 + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, 335 + uint32_t num_percpu_pages, void *guest_code, 336 + uint32_t vcpuids[]) 319 337 { 338 + uint64_t vcpu_pages, extra_pg_pages, pages; 339 + struct kvm_vm *vm; 340 + int i; 341 + 342 + /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ 343 + if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) 344 + slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; 345 + 320 346 /* The maximum page table size for a memory region will be when the 321 347 * smallest pages are used. Considering each page contains x page 322 348 * table descriptors, the total extra size for page tables (for extra 323 349 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller 324 350 * than N/x*2. 325 351 */ 326 - uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; 327 - uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; 328 - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; 329 - struct kvm_vm *vm; 330 - int i; 352 + vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; 353 + extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; 354 + pages = slot0_mem_pages + vcpu_pages + extra_pg_pages; 331 355 332 356 TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), 333 357 "nr_vcpus = %d too large for host, max-vcpus = %d", ··· 388 354 uint32_t num_percpu_pages, void *guest_code, 389 355 uint32_t vcpuids[]) 390 356 { 391 - return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages, 392 - num_percpu_pages, guest_code, vcpuids); 357 + return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, 358 + extra_mem_pages, num_percpu_pages, guest_code, vcpuids); 393 359 } 
394 360 395 361 struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+1 -1
tools/testing/selftests/kvm/lib/perf_test_util.c
··· 69 69 TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, 70 70 "Guest memory size is not guest page size aligned."); 71 71 72 - vm = vm_create_with_vcpus(mode, vcpus, 72 + vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, 73 73 (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, 74 74 0, guest_code, NULL); 75 75
+1 -1
tools/testing/selftests/kvm/memslot_perf_test.c
··· 267 267 data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); 268 268 TEST_ASSERT(data->hva_slots, "malloc() fail"); 269 269 270 - data->vm = vm_create_default(VCPU_ID, 1024, guest_code); 270 + data->vm = vm_create_default(VCPU_ID, mempages, guest_code); 271 271 272 272 pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", 273 273 max_mem_slots - 1, data->pages_per_slot, rempages);