Linux kernel mirror (for testing) — git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-mmu-6.8' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.8:

- Fix a relatively benign off-by-one error when splitting huge pages during
  CLEAR_DIRTY_LOG.

- Fix a bug where KVM could incorrectly test-and-clear dirty bits in non-leaf
  TDP MMU SPTEs if a racing thread replaces a huge SPTE with a non-huge SPTE.

- Relax the TDP MMU's lockdep assertions related to holding mmu_lock for read
  versus write, so that KVM doesn't pass "bool shared" all over the place just
  to have precise assertions in paths that don't actually care about whether
  the caller is a reader or a writer.

+57 -67
+3 -4
Documentation/virt/kvm/locking.rst
··· 43 43 44 44 - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock and kvm->arch.xen.xen_lock 45 45 46 - - kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and 47 - kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and 48 - cannot be taken without already holding kvm->arch.mmu_lock (typically with 49 - ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). 46 + - kvm->arch.mmu_lock is an rwlock; critical sections for 47 + kvm->arch.tdp_mmu_pages_lock and kvm->arch.mmu_unsync_pages_lock must 48 + also take kvm->arch.mmu_lock 50 49 51 50 Everything else is a leaf: no other lock is taken inside the critical 52 51 sections.
+6 -5
arch/x86/include/asm/kvm_host.h
··· 1433 1433 * the MMU lock in read mode + RCU or 1434 1434 * the MMU lock in write mode 1435 1435 * 1436 - * For writes, this list is protected by: 1437 - * the MMU lock in read mode + the tdp_mmu_pages_lock or 1438 - * the MMU lock in write mode 1436 + * For writes, this list is protected by tdp_mmu_pages_lock; see 1437 + * below for the details. 1439 1438 * 1440 1439 * Roots will remain in the list until their tdp_mmu_root_count 1441 1440 * drops to zero, at which point the thread that decremented the ··· 1451 1452 * - possible_nx_huge_pages; 1452 1453 * - the possible_nx_huge_page_link field of kvm_mmu_page structs used 1453 1454 * by the TDP MMU 1454 - * It is acceptable, but not necessary, to acquire this lock when 1455 - * the thread holds the MMU lock in write mode. 1455 + * Because the lock is only taken within the MMU lock, strictly 1456 + * speaking it is redundant to acquire this lock when the thread 1457 + * holds the MMU lock in write mode. However it often simplifies 1458 + * the code to do so. 1456 1459 */ 1457 1460 spinlock_t tdp_mmu_pages_lock; 1458 1461 #endif /* CONFIG_X86_64 */
+4 -4
arch/x86/kvm/mmu/mmu.c
··· 1388 1388 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); 1389 1389 1390 1390 if (READ_ONCE(eager_page_split)) 1391 - kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); 1391 + kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K); 1392 1392 1393 1393 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); 1394 1394 ··· 2846 2846 /* 2847 2847 * Recheck after taking the spinlock, a different vCPU 2848 2848 * may have since marked the page unsync. A false 2849 - * positive on the unprotected check above is not 2849 + * negative on the unprotected check above is not 2850 2850 * possible as clearing sp->unsync _must_ hold mmu_lock 2851 - * for write, i.e. unsync cannot transition from 0->1 2851 + * for write, i.e. unsync cannot transition from 1->0 2852 2852 * while this CPU holds mmu_lock for read (or write). 2853 2853 */ 2854 2854 if (READ_ONCE(sp->unsync)) ··· 3576 3576 return; 3577 3577 3578 3578 if (is_tdp_mmu_page(sp)) 3579 - kvm_tdp_mmu_put_root(kvm, sp, false); 3579 + kvm_tdp_mmu_put_root(kvm, sp); 3580 3580 else if (!--sp->root_count && sp->role.invalid) 3581 3581 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 3582 3582
+43 -52
arch/x86/kvm/mmu/tdp_mmu.c
··· 73 73 tdp_mmu_free_sp(sp); 74 74 } 75 75 76 - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 77 - bool shared) 76 + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) 78 77 { 79 - kvm_lockdep_assert_mmu_lock_held(kvm, shared); 80 - 81 78 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 82 79 return; 83 80 ··· 103 106 */ 104 107 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 105 108 struct kvm_mmu_page *prev_root, 106 - bool shared, bool only_valid) 109 + bool only_valid) 107 110 { 108 111 struct kvm_mmu_page *next_root; 112 + 113 + /* 114 + * While the roots themselves are RCU-protected, fields such as 115 + * role.invalid are protected by mmu_lock. 116 + */ 117 + lockdep_assert_held(&kvm->mmu_lock); 109 118 110 119 rcu_read_lock(); 111 120 ··· 135 132 rcu_read_unlock(); 136 133 137 134 if (prev_root) 138 - kvm_tdp_mmu_put_root(kvm, prev_root, shared); 135 + kvm_tdp_mmu_put_root(kvm, prev_root); 139 136 140 137 return next_root; 141 138 } ··· 147 144 * recent root. (Unless keeping a live reference is desirable.) 148 145 * 149 146 * If shared is set, this function is operating under the MMU lock in read 150 - * mode. In the unlikely event that this thread must free a root, the lock 151 - * will be temporarily dropped and reacquired in write mode. 147 + * mode. 
152 148 */ 153 - #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ 154 - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ 155 - _root; \ 156 - _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ 157 - if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ 158 - kvm_mmu_page_as_id(_root) != _as_id) { \ 149 + #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\ 150 + for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ 151 + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ 152 + _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ 153 + if (kvm_mmu_page_as_id(_root) != _as_id) { \ 159 154 } else 160 155 161 - #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ 162 - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) 156 + #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 157 + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true) 163 158 164 - #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ 165 - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \ 166 - _root; \ 167 - _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \ 168 - if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ 169 - } else 159 + #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ 160 + for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ 161 + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ 162 + _root = tdp_mmu_next_root(_kvm, _root, false)) 170 163 171 164 /* 172 165 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, ··· 275 276 * 276 277 * @kvm: kvm instance 277 278 * @sp: the page to be removed 278 - * @shared: This operation may not be running under the exclusive use of 279 - * the MMU lock and the operation must synchronize with other 280 - * threads that might be adding or removing pages. 
281 279 */ 282 - static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, 283 - bool shared) 280 + static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 284 281 { 285 282 tdp_unaccount_mmu_page(kvm, sp); 286 283 287 284 if (!sp->nx_huge_page_disallowed) 288 285 return; 289 286 290 - if (shared) 291 - spin_lock(&kvm->arch.tdp_mmu_pages_lock); 292 - else 293 - lockdep_assert_held_write(&kvm->mmu_lock); 294 - 287 + spin_lock(&kvm->arch.tdp_mmu_pages_lock); 295 288 sp->nx_huge_page_disallowed = false; 296 289 untrack_possible_nx_huge_page(kvm, sp); 297 - 298 - if (shared) 299 - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 290 + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 300 291 } 301 292 302 293 /** ··· 315 326 316 327 trace_kvm_mmu_prepare_zap_page(sp); 317 328 318 - tdp_mmu_unlink_sp(kvm, sp, shared); 329 + tdp_mmu_unlink_sp(kvm, sp); 319 330 320 331 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { 321 332 tdp_ptep_t sptep = pt + i; ··· 821 832 { 822 833 struct kvm_mmu_page *root; 823 834 824 - for_each_tdp_mmu_root_yield_safe(kvm, root, false) 835 + lockdep_assert_held_write(&kvm->mmu_lock); 836 + for_each_tdp_mmu_root_yield_safe(kvm, root) 825 837 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); 826 838 827 839 return flush; ··· 844 854 * is being destroyed or the userspace VMM has exited. In both cases, 845 855 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 
846 856 */ 847 - for_each_tdp_mmu_root_yield_safe(kvm, root, false) 857 + lockdep_assert_held_write(&kvm->mmu_lock); 858 + for_each_tdp_mmu_root_yield_safe(kvm, root) 848 859 tdp_mmu_zap_root(kvm, root, false); 849 860 } 850 861 ··· 859 868 860 869 read_lock(&kvm->mmu_lock); 861 870 862 - for_each_tdp_mmu_root_yield_safe(kvm, root, true) { 871 + for_each_tdp_mmu_root_yield_safe(kvm, root) { 863 872 if (!root->tdp_mmu_scheduled_root_to_zap) 864 873 continue; 865 874 ··· 882 891 * the root must be reachable by mmu_notifiers while it's being 883 892 * zapped 884 893 */ 885 - kvm_tdp_mmu_put_root(kvm, root, true); 894 + kvm_tdp_mmu_put_root(kvm, root); 886 895 } 887 896 888 897 read_unlock(&kvm->mmu_lock); ··· 1116 1125 { 1117 1126 struct kvm_mmu_page *root; 1118 1127 1119 - __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false) 1128 + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) 1120 1129 flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, 1121 1130 range->may_block, flush); 1122 1131 ··· 1305 1314 1306 1315 lockdep_assert_held_read(&kvm->mmu_lock); 1307 1316 1308 - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1317 + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) 1309 1318 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1310 1319 slot->base_gfn + slot->npages, min_level); 1311 1320 ··· 1336 1345 bool shared) 1337 1346 { 1338 1347 struct kvm_mmu_page *sp; 1348 + 1349 + kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1339 1350 1340 1351 /* 1341 1352 * Since we are allocating while under the MMU lock we have to be ··· 1489 1496 int r = 0; 1490 1497 1491 1498 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1492 - 1493 - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1499 + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) { 1494 1500 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1495 1501 if 
(r) { 1496 - kvm_tdp_mmu_put_root(kvm, root, shared); 1502 + kvm_tdp_mmu_put_root(kvm, root); 1497 1503 break; 1498 1504 } 1499 1505 } ··· 1514 1522 1515 1523 rcu_read_lock(); 1516 1524 1517 - tdp_root_for_each_leaf_pte(iter, root, start, end) { 1525 + tdp_root_for_each_pte(iter, root, start, end) { 1518 1526 retry: 1519 - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1527 + if (!is_shadow_present_pte(iter.old_spte) || 1528 + !is_last_spte(iter.old_spte, iter.level)) 1520 1529 continue; 1521 1530 1522 - if (!is_shadow_present_pte(iter.old_spte)) 1531 + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1523 1532 continue; 1524 1533 1525 1534 KVM_MMU_WARN_ON(kvm_ad_enabled() && ··· 1553 1560 bool spte_set = false; 1554 1561 1555 1562 lockdep_assert_held_read(&kvm->mmu_lock); 1556 - 1557 - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1563 + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) 1558 1564 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1559 1565 slot->base_gfn + slot->npages); 1560 1566 ··· 1687 1695 struct kvm_mmu_page *root; 1688 1696 1689 1697 lockdep_assert_held_read(&kvm->mmu_lock); 1690 - 1691 - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1698 + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) 1692 1699 zap_collapsible_spte_range(kvm, root, slot); 1693 1700 } 1694 1701
+1 -2
arch/x86/kvm/mmu/tdp_mmu.h
··· 17 17 return refcount_inc_not_zero(&root->tdp_mmu_root_count); 18 18 } 19 19 20 - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 21 - bool shared); 20 + void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); 22 21 23 22 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); 24 23 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);