Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

arch/x86/kvm/mmu.c at v4.6 · 5108 lines · 132 kB

/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>

/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
bool tdp_enabled = false;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM		8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask)

#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))
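/*
 * A shadow walk visits one spte per paging level, root first: for a
 * 4-level shadow table, for_each_shadow_entry() yields the sptes at
 * levels 4, 3, 2 and 1 that translate _addr.  The walk stops descending
 * once is_last_spte() holds, because __shadow_walk_next() then zeroes
 * iterator->level; callers break out earlier on non-present sptes.
 */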
static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;

static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);

void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
	shadow_mmio_mask = mmio_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

/*
 * the low bit of the generation number is always presumed to be zero.
 * This disables mmio caching during memslot updates.  The concept is
 * similar to a seqcount but instead of retrying the access we just punt
 * and ignore the cache.
 *
 * spte bits 3-11 are used as bits 1-9 of the generation number,
 * the bits 52-61 are used as bits 10-19 of the generation number.
 */
#define MMIO_SPTE_GEN_LOW_SHIFT		2
#define MMIO_SPTE_GEN_HIGH_SHIFT	52

#define MMIO_GEN_SHIFT			20
#define MMIO_GEN_LOW_SHIFT		10
#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)

static u64 generation_mmio_spte_mask(unsigned int gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_GEN_MASK);

	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
	return mask;
}

static unsigned int get_mmio_spte_generation(u64 spte)
{
	unsigned int gen;

	spte &= ~shadow_mmio_mask;

	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
	return gen;
}
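/*
 * Worked example of the packing above: for gen = 0x402 (bits 1 and 10
 * set), generation_mmio_spte_mask() puts 0x2 << 2 = bit 3 in the low
 * part and 0x1 << 52 = bit 52 in the high part of the spte, and
 * get_mmio_spte_generation() reverses the split.  Bit 0 of the
 * generation never reaches the spte, so an in-progress memslot update
 * (odd generation) can never match a cached MMIO spte.
 */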
static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned access)
{
	unsigned int gen = kvm_current_mmio_generation(vcpu);
	u64 mask = generation_mmio_spte_mask(gen);

	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;

	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}

static bool is_mmio_spte(u64 spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
	return (spte & ~mask) >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
	return (spte & ~mask) & ~PAGE_MASK;
}

static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			  kvm_pfn_t pfn, unsigned access)
{
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	unsigned int kvm_gen, spte_gen;

	kvm_gen = kvm_current_mmio_generation(vcpu);
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	*sptep = spte;
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	*sptep = spte;
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return ACCESS_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, We should store
	 * the high bits firstly, then set present bit, so cpu can not
	 * fetch this spte while we are setting the spte.
	 */
	smp_wmb();

	ssptep->spte_low = sspte.spte_low;
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_low = sspte.spte_low;

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * present bit firstly to avoid vcpu fetch the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}
/*
 * The idea using the light way get the spte on x86_32 guest is from
 * gup_get_pte(arch/x86/mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running out of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_is_locklessly_modifiable(u64 spte)
{
	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

static bool spte_has_volatile_bits(u64 spte)
{
	/*
	 * Always atomically update spte if it can be updated
	 * out of mmu-lock, it can ensure dirty bit is not lost,
	 * also, it can help us to get a stable is_writable_pte()
	 * to ensure tlb flush is not missed.
	 */
	if (spte_is_locklessly_modifiable(spte))
		return true;

	if (!shadow_accessed_mask)
		return false;

	if (!is_shadow_present_pte(spte))
		return false;

	if ((spte & shadow_accessed_mask) &&
	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
		return false;

	return true;
}

static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
{
	return (old_spte & bit_mask) && !(new_spte & bit_mask);
}

static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
{
	return (old_spte & bit_mask) != (new_spte & bit_mask);
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/* Rules for using mmu_spte_update:
 * Update the state bits, it means the mapped pfn is not changed.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs. Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might be cached on a CPU's TLB, the return value indicates this
 * case.
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;
	bool ret = false;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return ret;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	/*
	 * For the spte updated out of mmu-lock is safe, since
	 * we always atomically update it, see the comments in
	 * spte_has_volatile_bits().
	 */
	if (spte_is_locklessly_modifiable(old_spte) &&
	      !is_writable_pte(new_spte))
		ret = true;

	if (!shadow_accessed_mask) {
		/*
		 * We don't set page dirty when dropping non-writable spte.
		 * So do it now if the new spte is becoming non-writable.
		 */
		if (ret)
			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
		return ret;
	}

	/*
	 * Flush TLB when accessed/dirty bits are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */
	if (spte_is_bit_changed(old_spte, new_spte,
				shadow_accessed_mask | shadow_dirty_mask))
		ret = true;

	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	return ret;
}
/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent, and track the
 * state bits, it is used to clear the last level sptep.
 */
static int mmu_spte_clear_track_bits(u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold the refcount of the page used by
	 * kvm mmu, before reclaiming the page, we should
	 * unmap it from mmu first.
	 */
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
		kvm_set_pfn_accessed(pfn);
	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
					    PT_WRITABLE_MASK))
		kvm_set_pfn_dirty(pfn);
	return 1;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear spte without caring the state bits of sptep,
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	/*
	 * Prevent page table teardown by making any free-er wait during
	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
	 */
	local_irq_disable();

	/*
	 * Make sure a following spte read is not reordered ahead of the write
	 * to vcpu->mode.
	 */
	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	/*
	 * Make sure the write to vcpu->mode is not reordered in front of
	 * reads to sptes.  If it does, kvm_commit_zap_page() can see us
	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
	 */
	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
	local_irq_enable();
}
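/*
 * The pairing above mirrors kvm_flush_remote_tlbs(): a zapper only
 * frees a shadow page after its flush has been acknowledged by every
 * vcpu that might be reading it, and a vcpu sitting in
 * READING_SHADOW_PAGE_TABLES mode with interrupts disabled cannot
 * acknowledge the IPI, so the sptes stay valid for the whole
 * lockless walk.
 */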
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
{
	return cache->nobjs;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	void *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = (void *)__get_free_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				pte_list_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
}
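/*
 * The caches exist because GFP_KERNEL allocations may sleep while most
 * MMU work runs under the mmu_lock spinlock: callers top up the caches
 * before taking the lock, and mmu_memory_cache_alloc() below then only
 * pops a preallocated object instead of allocating.
 */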
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (sp->role.direct)
		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
	else
		sp->gfns[index] = gfn;
}

/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
					      struct kvm_memory_slot *slot,
					      int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON(linfo->disallow_lpage < 0);
	}
}

void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	/* the non-leaf shadow pages are keeping readonly. */
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
						       KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
					  struct kvm_memory_slot *slot)
{
	struct kvm_lpage_info *linfo;

	if (slot) {
		linfo = lpage_info_slot(gfn, slot, level);
		return !!linfo->disallow_lpage;
	}

	return true;
}

static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
					int level)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}

static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size;
	int i, ret = 0;

	page_size = kvm_host_page_size(kvm, gfn);

	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

	return ret;
}

static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
					  bool no_dirty_log)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return false;
	if (no_dirty_log && slot->dirty_bitmap)
		return false;

	return true;
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!memslot_valid_for_gpte(slot, no_dirty_log))
		slot = NULL;

	return slot;
}
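/*
 * mapping_level() below picks the largest page-table level usable to
 * map @large_gfn: it starts from the host-side page size, caps it with
 * kvm_x86_ops->get_lpage_level(), and then backs off to the last level
 * whose large-page mapping is not disallowed for this gfn (e.g. because
 * the gfn is write-tracked or sits in a misaligned slot).
 */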
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
			 bool *force_pt_level)
{
	int host_level, level, max_level;
	struct kvm_memory_slot *slot;

	if (unlikely(*force_pt_level))
		return PT_PAGE_TABLE_LEVEL;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
	*force_pt_level = !memslot_valid_for_gpte(slot, true);
	if (unlikely(*force_pt_level))
		return PT_PAGE_TABLE_LEVEL;

	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
		if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
			break;

	return level - 1;
}

/*
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!rmap_head->val) {
		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
			desc = desc->more;
			count += PTE_LIST_EXT;
		}
		if (desc->sptes[PTE_LIST_EXT-1]) {
			desc->more = mmu_alloc_pte_list_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
			++count;
		desc->sptes[i] = spte;
	}
	return count;
}
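/*
 * Example of the encoding: after the first pte_list_add() for a gfn,
 * rmap_head->val holds the spte pointer itself (bit zero clear).  The
 * second add allocates a pte_list_desc, moves the old pointer into
 * desc->sptes[0], stores the new one in desc->sptes[1] and sets
 * rmap_head->val = (unsigned long)desc | 1; later adds fill the
 * remaining PTE_LIST_EXT slots and chain descriptors via desc->more.
 */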
static void
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
			   struct pte_list_desc *desc, int i,
			   struct pte_list_desc *prev_desc)
{
	int j;

	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		rmap_head->val = (unsigned long)desc->sptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			rmap_head->val = (unsigned long)desc->more | 1;
	mmu_free_pte_list_desc(desc);
}

static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!rmap_head->val) {
		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
		BUG();
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("pte_list_remove: %p 1->0\n", spte);
		if ((u64 *)rmap_head->val != spte) {
			printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
			BUG();
		}
		rmap_head->val = 0;
	} else {
		rmap_printk("pte_list_remove: %p many->many\n", spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(rmap_head,
							desc, i, prev_desc);
					return;
				}
			}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("pte_list_remove: %p many->many\n", spte);
		BUG();
	}
}

static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
					   struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}

static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
					 struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;

	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	return __gfn_to_rmap(gfn, sp->role.level, slot);
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *cache;

	cache = &vcpu->arch.mmu_pte_list_desc_cache;
	return mmu_memory_cache_free_objects(cache);
}

static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_head *rmap_head;

	sp = page_header(__pa(spte));
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
	return pte_list_add(vcpu, spte, rmap_head);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = page_header(__pa(spte));
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmap_head = gfn_to_rmap(kvm, gfn, sp);
	pte_list_remove(spte, rmap_head);
}

/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};

/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;

	if (!rmap_head->val)
		return NULL;

	if (!(rmap_head->val & 1)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}
/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	if (mmu_spte_clear_track_bits(sptep))
		rmap_remove(kvm, sptep);
}


static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		WARN_ON(page_header(__pa(sptep))->role.level ==
			PT_PAGE_TABLE_LEVEL);
		drop_spte(kvm, sptep);
		--kvm->stat.lpages;
		return true;
	}

	return false;
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (__drop_large_spte(vcpu->kvm, sptep))
		kvm_flush_remote_tlbs(vcpu->kvm);
}
/*
 * Write-protect on the specified @sptep, @pt_protect indicates whether
 * spte write-protection is caused by protecting shadow page table.
 *
 * Note: write protection differs between dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be set to writable at anytime if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   shadow page.
 *
 * Return true if tlb need be flushed.
 */
static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	      !(pt_protect && spte_is_locklessly_modifiable(spte)))
		return false;

	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);

	if (pt_protect)
		spte &= ~SPTE_MMU_WRITEABLE;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(kvm, sptep, pt_protect);

	return flush;
}

static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);

	spte &= ~shadow_dirty_mask;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_clear_dirty(kvm, sptep);

	return flush;
}

static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);

	spte |= shadow_dirty_mask;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_set_dirty(kvm, sptep);

	return flush;
}

/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings: e.g. during dirty
 * logging we do not have any such mappings.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PT_PAGE_TABLE_LEVEL, slot);
		__rmap_write_protect(kvm, rmap_head, false);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}
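/*
 * In other words: each bit i set in @mask selects gfn
 * slot->base_gfn + gfn_offset + i for write protection, and the
 * mask &= mask - 1 step clears the lowest set bit per iteration, so
 * the loop runs exactly once per selected page.
 */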
/**
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
 */
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				   struct kvm_memory_slot *slot,
				   gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PT_PAGE_TABLE_LEVEL, slot);
		__rmap_clear_dirty(kvm, rmap_head);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 *
 * Used when we do not need to care about huge page mappings: e.g. during dirty
 * logging we do not have any such mappings.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				struct kvm_memory_slot *slot,
				gfn_t gfn_offset, unsigned long mask)
{
	if (kvm_x86_ops->enable_log_dirty_pt_masked)
		kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
				mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn)
{
	struct kvm_rmap_head *rmap_head;
	int i;
	bool write_protected = false;

	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		rmap_head = __gfn_to_rmap(gfn, i, slot);
		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
	}

	return write_protected;
}

static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}

static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	while ((sptep = rmap_get_first(rmap_head, &iter))) {
		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);

		drop_spte(kvm, sptep);
		flush = true;
	}

	return flush;
}

static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
			   unsigned long data)
{
	return kvm_zap_rmapp(kvm, rmap_head);
}

static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
			     unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_flush = 0;
	u64 new_spte;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
			    sptep, *sptep, gfn, level);

		need_flush = 1;

		if (pte_write(*ptep)) {
			drop_spte(kvm, sptep);
			goto restart;
		} else {
			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;
			new_spte &= ~shadow_accessed_mask;

			mmu_spte_clear_track_bits(sptep);
			mmu_spte_set(sptep, new_spte);
		}
	}

	if (need_flush)
		kvm_flush_remote_tlbs(kvm);

	return 0;
}
struct slot_rmap_walk_iterator {
	/* input fields. */
	struct kvm_memory_slot *slot;
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	/* output fields. */
	gfn_t gfn;
	struct kvm_rmap_head *rmap;
	int level;

	/* private field. */
	struct kvm_rmap_head *end_rmap;
};

static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
					   iterator->slot);
}

static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
		    struct kvm_memory_slot *slot, int start_level,
		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_level = start_level;
	iterator->end_level = end_level;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	if (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
		return;
	}

	if (++iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,	\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))
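/*
 * The walk above visits every rmap head covering [start_gfn, end_gfn]
 * once per level: within a level the rmap pointer is simply advanced
 * (the heads are stored contiguously per level in slot->arch.rmap),
 * and when it passes end_rmap the walk restarts at the next larger
 * level until end_level is exhausted.
 */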
static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				unsigned long data,
				int (*handler)(struct kvm *kvm,
					       struct kvm_rmap_head *rmap_head,
					       struct kvm_memory_slot *slot,
					       gfn_t gfn,
					       int level,
					       unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct slot_rmap_walk_iterator iterator;
	int ret = 0;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;
			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
						 PT_MAX_HUGEPAGE_LEVEL,
						 gfn_start, gfn_end - 1,
						 &iterator)
				ret |= handler(kvm, iterator.rmap, memslot,
					       iterator.gfn, iterator.level, data);
		}
	}

	return ret;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head,
					 struct kvm_memory_slot *slot,
					 gfn_t gfn, int level,
					 unsigned long data))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
			 unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator uninitialized_var(iter);
	int young = 0;

	BUG_ON(!shadow_accessed_mask);

	for_each_rmap_spte(rmap_head, &iter, sptep) {
		if (*sptep & shadow_accessed_mask) {
			young = 1;
			clear_bit((ffs(shadow_accessed_mask) - 1),
				 (unsigned long *)sptep);
		}
	}

	trace_kvm_age_page(gfn, level, slot, young);
	return young;
}

static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn,
			      int level, unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int young = 0;

	/*
	 * If there's no access bit in the secondary pte set by the
	 * hardware it's up to gup-fast/gup to set the access bit in
	 * the primary pte or in the page structure.
	 */
	if (!shadow_accessed_mask)
		goto out;

	for_each_rmap_spte(rmap_head, &iter, sptep) {
		if (*sptep & shadow_accessed_mask) {
			young = 1;
			break;
		}
	}
out:
	return young;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_rmap_head *rmap_head;
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));

	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);

	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
	kvm_flush_remote_tlbs(vcpu->kvm);
}
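/*
 * rmap_recycle() is the pressure valve for pathological mapping
 * fan-out: a caller that sees rmap_add() report more than
 * RMAP_RECYCLE_THRESHOLD sptes for one gfn (the check lives later in
 * this file, outside this excerpt) zaps them all and flushes TLBs, so
 * subsequent faults rebuild only the mappings actually in use.
 */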
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	/*
	 * In case of absence of EPT Access and Dirty Bits supports,
	 * emulate the accessed bit for EPT, by checking if this page has
	 * an EPT mapping, and clearing it if it does. On the next access,
	 * a new EPT mapping will be established.
	 * This has some overhead, but not as much as the cost of swapping
	 * out actively used pages or breaking up actively used hugepages.
	 */
	if (!shadow_accessed_mask) {
		/*
		 * We are holding the kvm->mmu_lock, and we are blowing up
		 * shadow PTEs. MMU notifier consumers need to be kept at bay.
		 * This is correct as long as we don't decouple the mmu_lock
		 * protected regions (like invalidate_range_start|end does).
		 */
		kvm->mmu_notifier_seq++;
		return kvm_handle_hva_range(kvm, start, end, 0,
					    kvm_unmap_rmapp);
	}

	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if (is_shadow_present_pte(*pos)) {
			printk(KERN_ERR "%s: %p %llx\n", __func__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif

/*
 * This value is the sum of all of the kvm instances's
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * faster
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
	hlist_del(&sp->hash_link);
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
	kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}

static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	pte_list_remove(parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
	struct kvm_mmu_page *sp;

	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	/*
	 * The active_mmu_pages list is the FIFO list, do not move the
	 * page until it is zapped. kvm_zap_obsolete_pages depends on
	 * this feature. See the comments in kvm_zap_obsolete_pages().
	 */
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
}
static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
}

static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;
	unsigned int index;

	sp = page_header(__pa(spte));
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}

static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}

static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp, u64 *spte,
				 const void *pte)
{
	WARN_ON(1);
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
{
	int i;

	if (sp->unsync)
		for (i=0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			     struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
				nr_unsync_leaf += ret;
			} else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			clear_unsync_child_bit(sp, i);
	}

	return nr_unsync_leaf;
}

#define INVALID_INDEX (-1)

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	pvec->nr = 0;
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, INVALID_INDEX);
	return __mmu_unsync_walk(sp, pvec);
}
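/*
 * In short, mmu_unsync_walk() depth-first searches the subtree below
 * @sp using unsync_child_bitmap as a guide and collects up to
 * KVM_PAGE_ARRAY_NR (page, parent-index) pairs of unsync pages in
 * @pvec; -ENOSPC tells the caller to process the vector and walk
 * again.  Bits for children that turned out clean are dropped along
 * the way via clear_unsync_child_bit().
 */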
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

/*
 * NOTE: we should pay more attention on the zapped-obsolete page
 * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
 * since it has been deleted from active_mmu_pages but still can be found
 * in the hash list.
 *
 * for_each_gfn_indirect_valid_sp has skipped that kind of page and
 * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
 * all the obsolete pages.
 */
#define for_each_gfn_sp(_kvm, _sp, _gfn)				\
	hlist_for_each_entry(_sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
		if ((_sp)->gfn != (_gfn)) {} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_gfn_sp(_kvm, _sp, _gfn)				\
		if ((_sp)->role.direct || (_sp)->role.invalid) {} else

/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			    struct list_head *invalid_list)
{
	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	return true;
}

static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				 struct list_head *invalid_list,
				 bool remote_flush, bool local_flush)
{
	if (!list_empty(invalid_list)) {
		kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
		return;
	}

	if (remote_flush)
		kvm_flush_remote_tlbs(vcpu->kvm);
	else if (local_flush)
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			  struct list_head *invalid_list)
{
	kvm_unlink_unsync_page(vcpu->kvm, sp);
	return __kvm_sync_page(vcpu, sp, invalid_list);
}

/* @gfn should be write-protected at the call site */
static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
			   struct list_head *invalid_list)
{
	struct kvm_mmu_page *s;
	bool ret = false;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
		if (!s->unsync)
			continue;

		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		ret |= kvm_sync_page(vcpu, s, invalid_list);
	}

	return ret;
}

struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
	unsigned int idx[PT64_ROOT_LEVEL];
};

#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;

		parents->idx[level-1] = idx;
		if (level == PT_PAGE_TABLE_LEVEL)
			break;

		parents->parent[level-2] = sp;
	}

	return n;
}
kvm_mmu_page *sp; 2024 int level; 2025 2026 if (pvec->nr == 0) 2027 return 0; 2028 2029 WARN_ON(pvec->page[0].idx != INVALID_INDEX); 2030 2031 sp = pvec->page[0].sp; 2032 level = sp->role.level; 2033 WARN_ON(level == PT_PAGE_TABLE_LEVEL); 2034 2035 parents->parent[level-2] = sp; 2036 2037 /* Also set up a sentinel. Further entries in pvec are all 2038 * children of sp, so this element is never overwritten. 2039 */ 2040 parents->parent[level-1] = NULL; 2041 return mmu_pages_next(pvec, parents, 0); 2042} 2043 2044static void mmu_pages_clear_parents(struct mmu_page_path *parents) 2045{ 2046 struct kvm_mmu_page *sp; 2047 unsigned int level = 0; 2048 2049 do { 2050 unsigned int idx = parents->idx[level]; 2051 sp = parents->parent[level]; 2052 if (!sp) 2053 return; 2054 2055 WARN_ON(idx == INVALID_INDEX); 2056 clear_unsync_child_bit(sp, idx); 2057 level++; 2058 } while (!sp->unsync_children); 2059} 2060 2061static void mmu_sync_children(struct kvm_vcpu *vcpu, 2062 struct kvm_mmu_page *parent) 2063{ 2064 int i; 2065 struct kvm_mmu_page *sp; 2066 struct mmu_page_path parents; 2067 struct kvm_mmu_pages pages; 2068 LIST_HEAD(invalid_list); 2069 bool flush = false; 2070 2071 while (mmu_unsync_walk(parent, &pages)) { 2072 bool protected = false; 2073 2074 for_each_sp(pages, sp, parents, i) 2075 protected |= rmap_write_protect(vcpu, sp->gfn); 2076 2077 if (protected) { 2078 kvm_flush_remote_tlbs(vcpu->kvm); 2079 flush = false; 2080 } 2081 2082 for_each_sp(pages, sp, parents, i) { 2083 flush |= kvm_sync_page(vcpu, sp, &invalid_list); 2084 mmu_pages_clear_parents(&parents); 2085 } 2086 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { 2087 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); 2088 cond_resched_lock(&vcpu->kvm->mmu_lock); 2089 flush = false; 2090 } 2091 } 2092 2093 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); 2094} 2095 2096static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) 2097{ 2098 atomic_set(&sp->write_flooding_count, 0); 2099} 2100 2101static void clear_sp_write_flooding_count(u64 *spte) 2102{ 2103 struct kvm_mmu_page *sp = page_header(__pa(spte)); 2104 2105 __clear_sp_write_flooding_count(sp); 2106} 2107 2108static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 2109{ 2110 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); 2111} 2112 2113static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 2114 gfn_t gfn, 2115 gva_t gaddr, 2116 unsigned level, 2117 int direct, 2118 unsigned access) 2119{ 2120 union kvm_mmu_page_role role; 2121 unsigned quadrant; 2122 struct kvm_mmu_page *sp; 2123 bool need_sync = false; 2124 bool flush = false; 2125 LIST_HEAD(invalid_list); 2126 2127 role = vcpu->arch.mmu.base_role; 2128 role.level = level; 2129 role.direct = direct; 2130 if (role.direct) 2131 role.cr4_pae = 0; 2132 role.access = access; 2133 if (!vcpu->arch.mmu.direct_map 2134 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 2135 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 2136 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 2137 role.quadrant = quadrant; 2138 } 2139 for_each_gfn_sp(vcpu->kvm, sp, gfn) { 2140 if (is_obsolete_sp(vcpu->kvm, sp)) 2141 continue; 2142 2143 if (!need_sync && sp->unsync) 2144 need_sync = true; 2145 2146 if (sp->role.word != role.word) 2147 continue; 2148 2149 if (sp->unsync) { 2150 /* The page is good, but __kvm_sync_page might still end 2151 * up zapping it. If so, break in order to rebuild it. 
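 * (Breaking out of the walk drops through to the allocation path below, so the zapped page is simply rebuilt from scratch for the same role.)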
2152 */ 2153 if (!__kvm_sync_page(vcpu, sp, &invalid_list)) 2154 break; 2155 2156 WARN_ON(!list_empty(&invalid_list)); 2157 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2158 } 2159 2160 if (sp->unsync_children) 2161 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2162 2163 __clear_sp_write_flooding_count(sp); 2164 trace_kvm_mmu_get_page(sp, false); 2165 return sp; 2166 } 2167 2168 ++vcpu->kvm->stat.mmu_cache_miss; 2169 2170 sp = kvm_mmu_alloc_page(vcpu, direct); 2171 2172 sp->gfn = gfn; 2173 sp->role = role; 2174 hlist_add_head(&sp->hash_link, 2175 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); 2176 if (!direct) { 2177 /* 2178 * we should do write protection before syncing pages 2179 * otherwise the content of the synced shadow page may 2180 * be inconsistent with guest page table. 2181 */ 2182 account_shadowed(vcpu->kvm, sp); 2183 if (level == PT_PAGE_TABLE_LEVEL && 2184 rmap_write_protect(vcpu, gfn)) 2185 kvm_flush_remote_tlbs(vcpu->kvm); 2186 2187 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 2188 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); 2189 } 2190 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; 2191 clear_page(sp->spt); 2192 trace_kvm_mmu_get_page(sp, true); 2193 2194 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); 2195 return sp; 2196} 2197 2198static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 2199 struct kvm_vcpu *vcpu, u64 addr) 2200{ 2201 iterator->addr = addr; 2202 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 2203 iterator->level = vcpu->arch.mmu.shadow_root_level; 2204 2205 if (iterator->level == PT64_ROOT_LEVEL && 2206 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && 2207 !vcpu->arch.mmu.direct_map) 2208 --iterator->level; 2209 2210 if (iterator->level == PT32E_ROOT_LEVEL) { 2211 iterator->shadow_addr 2212 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 2213 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; 2214 --iterator->level; 2215 if (!iterator->shadow_addr) 2216 iterator->level = 0; 2217 } 2218} 2219 2220static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) 2221{ 2222 if (iterator->level < PT_PAGE_TABLE_LEVEL) 2223 return false; 2224 2225 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 2226 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 2227 return true; 2228} 2229 2230static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, 2231 u64 spte) 2232{ 2233 if (is_last_spte(spte, iterator->level)) { 2234 iterator->level = 0; 2235 return; 2236 } 2237 2238 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; 2239 --iterator->level; 2240} 2241 2242static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 2243{ 2244 return __shadow_walk_next(iterator, *iterator->sptep); 2245} 2246 2247static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, 2248 struct kvm_mmu_page *sp) 2249{ 2250 u64 spte; 2251 2252 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || 2253 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2254 2255 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2256 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2257 2258 mmu_spte_set(sptep, spte); 2259 2260 mmu_page_add_parent_pte(vcpu, sp, sptep); 2261 2262 if (sp->unsync_children || sp->unsync) 2263 mark_unsync(sptep); 2264} 2265 2266static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2267 unsigned direct_access) 2268{ 2269 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { 2270 struct kvm_mmu_page *child; 2271 2272 /* 2273 * For the 
direct sp, if the guest pte's dirty bit 2274 * changed from clean to dirty, it will corrupt the 2275 * sp's access: allow writable in the read-only sp, 2276 * so we should update the spte at this point to get 2277 * a new sp with the correct access. 2278 */ 2279 child = page_header(*sptep & PT64_BASE_ADDR_MASK); 2280 if (child->role.access == direct_access) 2281 return; 2282 2283 drop_parent_pte(child, sptep); 2284 kvm_flush_remote_tlbs(vcpu->kvm); 2285 } 2286} 2287 2288static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 2289 u64 *spte) 2290{ 2291 u64 pte; 2292 struct kvm_mmu_page *child; 2293 2294 pte = *spte; 2295 if (is_shadow_present_pte(pte)) { 2296 if (is_last_spte(pte, sp->role.level)) { 2297 drop_spte(kvm, spte); 2298 if (is_large_pte(pte)) 2299 --kvm->stat.lpages; 2300 } else { 2301 child = page_header(pte & PT64_BASE_ADDR_MASK); 2302 drop_parent_pte(child, spte); 2303 } 2304 return true; 2305 } 2306 2307 if (is_mmio_spte(pte)) 2308 mmu_spte_clear_no_track(spte); 2309 2310 return false; 2311} 2312 2313static void kvm_mmu_page_unlink_children(struct kvm *kvm, 2314 struct kvm_mmu_page *sp) 2315{ 2316 unsigned i; 2317 2318 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 2319 mmu_page_zap_pte(kvm, sp, sp->spt + i); 2320} 2321 2322static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 2323{ 2324 u64 *sptep; 2325 struct rmap_iterator iter; 2326 2327 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) 2328 drop_parent_pte(sp, sptep); 2329} 2330 2331static int mmu_zap_unsync_children(struct kvm *kvm, 2332 struct kvm_mmu_page *parent, 2333 struct list_head *invalid_list) 2334{ 2335 int i, zapped = 0; 2336 struct mmu_page_path parents; 2337 struct kvm_mmu_pages pages; 2338 2339 if (parent->role.level == PT_PAGE_TABLE_LEVEL) 2340 return 0; 2341 2342 while (mmu_unsync_walk(parent, &pages)) { 2343 struct kvm_mmu_page *sp; 2344 2345 for_each_sp(pages, sp, parents, i) { 2346 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2347 mmu_pages_clear_parents(&parents); 2348 zapped++; 2349 } 2350 } 2351 2352 return zapped; 2353} 2354 2355static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2356 struct list_head *invalid_list) 2357{ 2358 int ret; 2359 2360 trace_kvm_mmu_prepare_zap_page(sp); 2361 ++kvm->stat.mmu_shadow_zapped; 2362 ret = mmu_zap_unsync_children(kvm, sp, invalid_list); 2363 kvm_mmu_page_unlink_children(kvm, sp); 2364 kvm_mmu_unlink_parents(kvm, sp); 2365 2366 if (!sp->role.invalid && !sp->role.direct) 2367 unaccount_shadowed(kvm, sp); 2368 2369 if (sp->unsync) 2370 kvm_unlink_unsync_page(kvm, sp); 2371 if (!sp->root_count) { 2372 /* Count self */ 2373 ret++; 2374 list_move(&sp->link, invalid_list); 2375 kvm_mod_used_mmu_pages(kvm, -1); 2376 } else { 2377 list_move(&sp->link, &kvm->arch.active_mmu_pages); 2378 2379 /* 2380 * The obsolete pages cannot be used by any vcpu. 2381 * See the comments in kvm_mmu_invalidate_zap_all_pages(). 2382 */ 2383 if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) 2384 kvm_reload_remote_mmus(kvm); 2385 } 2386 2387 sp->role.invalid = 1; 2388 return ret; 2389} 2390 2391static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2392 struct list_head *invalid_list) 2393{ 2394 struct kvm_mmu_page *sp, *nsp; 2395 2396 if (list_empty(invalid_list)) 2397 return; 2398 2399 /* 2400 * We need to make sure everyone sees our modifications to 2401 * the page tables and sees changes to vcpu->mode here. The barrier 2402 * in the kvm_flush_remote_tlbs() achieves this.
This pairs 2403 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. 2404 * 2405 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit 2406 * guest mode and/or lockless shadow page table walks. 2407 */ 2408 kvm_flush_remote_tlbs(kvm); 2409 2410 list_for_each_entry_safe(sp, nsp, invalid_list, link) { 2411 WARN_ON(!sp->role.invalid || sp->root_count); 2412 kvm_mmu_free_page(sp); 2413 } 2414} 2415 2416static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, 2417 struct list_head *invalid_list) 2418{ 2419 struct kvm_mmu_page *sp; 2420 2421 if (list_empty(&kvm->arch.active_mmu_pages)) 2422 return false; 2423 2424 sp = list_last_entry(&kvm->arch.active_mmu_pages, 2425 struct kvm_mmu_page, link); 2426 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2427 2428 return true; 2429} 2430 2431/* 2432 * Changing the number of mmu pages allocated to the vm 2433 * Note: if goal_nr_mmu_pages is too small, you will get dead lock 2434 */ 2435void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) 2436{ 2437 LIST_HEAD(invalid_list); 2438 2439 spin_lock(&kvm->mmu_lock); 2440 2441 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2442 /* Need to free some mmu pages to achieve the goal. */ 2443 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) 2444 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) 2445 break; 2446 2447 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2448 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2449 } 2450 2451 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 2452 2453 spin_unlock(&kvm->mmu_lock); 2454} 2455 2456int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2457{ 2458 struct kvm_mmu_page *sp; 2459 LIST_HEAD(invalid_list); 2460 int r; 2461 2462 pgprintk("%s: looking for gfn %llx\n", __func__, gfn); 2463 r = 0; 2464 spin_lock(&kvm->mmu_lock); 2465 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { 2466 pgprintk("%s: gfn %llx role %x\n", __func__, gfn, 2467 sp->role.word); 2468 r = 1; 2469 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 2470 } 2471 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2472 spin_unlock(&kvm->mmu_lock); 2473 2474 return r; 2475} 2476EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); 2477 2478static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 2479{ 2480 trace_kvm_mmu_unsync_page(sp); 2481 ++vcpu->kvm->stat.mmu_unsync; 2482 sp->unsync = 1; 2483 2484 kvm_mmu_mark_parents_unsync(sp); 2485} 2486 2487static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 2488 bool can_unsync) 2489{ 2490 struct kvm_mmu_page *sp; 2491 2492 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) 2493 return true; 2494 2495 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 2496 if (!can_unsync) 2497 return true; 2498 2499 if (sp->unsync) 2500 continue; 2501 2502 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); 2503 kvm_unsync_page(vcpu, sp); 2504 } 2505 2506 return false; 2507} 2508 2509static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) 2510{ 2511 if (pfn_valid(pfn)) 2512 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); 2513 2514 return true; 2515} 2516 2517static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2518 unsigned pte_access, int level, 2519 gfn_t gfn, kvm_pfn_t pfn, bool speculative, 2520 bool can_unsync, bool host_writable) 2521{ 2522 u64 spte; 2523 int ret = 0; 2524 2525 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) 2526 return 0; 2527 2528 spte = PT_PRESENT_MASK; 2529 if (!speculative) 2530 spte |= shadow_accessed_mask; 2531 2532 if (pte_access & ACC_EXEC_MASK) 2533 spte 
|= shadow_x_mask; 2534 else 2535 spte |= shadow_nx_mask; 2536 2537 if (pte_access & ACC_USER_MASK) 2538 spte |= shadow_user_mask; 2539 2540 if (level > PT_PAGE_TABLE_LEVEL) 2541 spte |= PT_PAGE_SIZE_MASK; 2542 if (tdp_enabled) 2543 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2544 kvm_is_mmio_pfn(pfn)); 2545 2546 if (host_writable) 2547 spte |= SPTE_HOST_WRITEABLE; 2548 else 2549 pte_access &= ~ACC_WRITE_MASK; 2550 2551 spte |= (u64)pfn << PAGE_SHIFT; 2552 2553 if (pte_access & ACC_WRITE_MASK) { 2554 2555 /* 2556 * Other vcpu creates new sp in the window between 2557 * mapping_level() and acquiring mmu-lock. We can 2558 * allow guest to retry the access, the mapping can 2559 * be fixed if guest refault. 2560 */ 2561 if (level > PT_PAGE_TABLE_LEVEL && 2562 mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) 2563 goto done; 2564 2565 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2566 2567 /* 2568 * Optimization: for pte sync, if spte was writable the hash 2569 * lookup is unnecessary (and expensive). Write protection 2570 * is responsibility of mmu_get_page / kvm_sync_page. 2571 * Same reasoning can be applied to dirty page accounting. 2572 */ 2573 if (!can_unsync && is_writable_pte(*sptep)) 2574 goto set_pte; 2575 2576 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2577 pgprintk("%s: found shadow page for %llx, marking ro\n", 2578 __func__, gfn); 2579 ret = 1; 2580 pte_access &= ~ACC_WRITE_MASK; 2581 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); 2582 } 2583 } 2584 2585 if (pte_access & ACC_WRITE_MASK) { 2586 kvm_vcpu_mark_page_dirty(vcpu, gfn); 2587 spte |= shadow_dirty_mask; 2588 } 2589 2590set_pte: 2591 if (mmu_spte_update(sptep, spte)) 2592 kvm_flush_remote_tlbs(vcpu->kvm); 2593done: 2594 return ret; 2595} 2596 2597static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, 2598 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, 2599 bool speculative, bool host_writable) 2600{ 2601 int was_rmapped = 0; 2602 int rmap_count; 2603 bool emulate = false; 2604 2605 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, 2606 *sptep, write_fault, gfn); 2607 2608 if (is_shadow_present_pte(*sptep)) { 2609 /* 2610 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 2611 * the parent of the now unreachable PTE. 2612 */ 2613 if (level > PT_PAGE_TABLE_LEVEL && 2614 !is_large_pte(*sptep)) { 2615 struct kvm_mmu_page *child; 2616 u64 pte = *sptep; 2617 2618 child = page_header(pte & PT64_BASE_ADDR_MASK); 2619 drop_parent_pte(child, sptep); 2620 kvm_flush_remote_tlbs(vcpu->kvm); 2621 } else if (pfn != spte_to_pfn(*sptep)) { 2622 pgprintk("hfn old %llx new %llx\n", 2623 spte_to_pfn(*sptep), pfn); 2624 drop_spte(vcpu->kvm, sptep); 2625 kvm_flush_remote_tlbs(vcpu->kvm); 2626 } else 2627 was_rmapped = 1; 2628 } 2629 2630 if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, 2631 true, host_writable)) { 2632 if (write_fault) 2633 emulate = true; 2634 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2635 } 2636 2637 if (unlikely(is_mmio_spte(*sptep))) 2638 emulate = true; 2639 2640 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2641 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2642 is_large_pte(*sptep)? 
"2MB" : "4kB", 2643 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2644 *sptep, sptep); 2645 if (!was_rmapped && is_large_pte(*sptep)) 2646 ++vcpu->kvm->stat.lpages; 2647 2648 if (is_shadow_present_pte(*sptep)) { 2649 if (!was_rmapped) { 2650 rmap_count = rmap_add(vcpu, sptep, gfn); 2651 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2652 rmap_recycle(vcpu, sptep, gfn); 2653 } 2654 } 2655 2656 kvm_release_pfn_clean(pfn); 2657 2658 return emulate; 2659} 2660 2661static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2662 bool no_dirty_log) 2663{ 2664 struct kvm_memory_slot *slot; 2665 2666 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2667 if (!slot) 2668 return KVM_PFN_ERR_FAULT; 2669 2670 return gfn_to_pfn_memslot_atomic(slot, gfn); 2671} 2672 2673static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2674 struct kvm_mmu_page *sp, 2675 u64 *start, u64 *end) 2676{ 2677 struct page *pages[PTE_PREFETCH_NUM]; 2678 struct kvm_memory_slot *slot; 2679 unsigned access = sp->role.access; 2680 int i, ret; 2681 gfn_t gfn; 2682 2683 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2684 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); 2685 if (!slot) 2686 return -1; 2687 2688 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); 2689 if (ret <= 0) 2690 return -1; 2691 2692 for (i = 0; i < ret; i++, gfn++, start++) 2693 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, 2694 page_to_pfn(pages[i]), true, true); 2695 2696 return 0; 2697} 2698 2699static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, 2700 struct kvm_mmu_page *sp, u64 *sptep) 2701{ 2702 u64 *spte, *start = NULL; 2703 int i; 2704 2705 WARN_ON(!sp->role.direct); 2706 2707 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); 2708 spte = sp->spt + i; 2709 2710 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2711 if (is_shadow_present_pte(*spte) || spte == sptep) { 2712 if (!start) 2713 continue; 2714 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2715 break; 2716 start = NULL; 2717 } else if (!start) 2718 start = spte; 2719 } 2720} 2721 2722static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) 2723{ 2724 struct kvm_mmu_page *sp; 2725 2726 /* 2727 * Since it's no accessed bit on EPT, it's no way to 2728 * distinguish between actually accessed translations 2729 * and prefetched, so disable pte prefetch if EPT is 2730 * enabled. 
2731 */ 2732 if (!shadow_accessed_mask) 2733 return; 2734 2735 sp = page_header(__pa(sptep)); 2736 if (sp->role.level > PT_PAGE_TABLE_LEVEL) 2737 return; 2738 2739 __direct_pte_prefetch(vcpu, sp, sptep); 2740} 2741 2742static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, 2743 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) 2744{ 2745 struct kvm_shadow_walk_iterator iterator; 2746 struct kvm_mmu_page *sp; 2747 int emulate = 0; 2748 gfn_t pseudo_gfn; 2749 2750 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2751 return 0; 2752 2753 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2754 if (iterator.level == level) { 2755 emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, 2756 write, level, gfn, pfn, prefault, 2757 map_writable); 2758 direct_pte_prefetch(vcpu, iterator.sptep); 2759 ++vcpu->stat.pf_fixed; 2760 break; 2761 } 2762 2763 drop_large_spte(vcpu, iterator.sptep); 2764 if (!is_shadow_present_pte(*iterator.sptep)) { 2765 u64 base_addr = iterator.addr; 2766 2767 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2768 pseudo_gfn = base_addr >> PAGE_SHIFT; 2769 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2770 iterator.level - 1, 1, ACC_ALL); 2771 2772 link_shadow_page(vcpu, iterator.sptep, sp); 2773 } 2774 } 2775 return emulate; 2776} 2777 2778static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2779{ 2780 siginfo_t info; 2781 2782 info.si_signo = SIGBUS; 2783 info.si_errno = 0; 2784 info.si_code = BUS_MCEERR_AR; 2785 info.si_addr = (void __user *)address; 2786 info.si_addr_lsb = PAGE_SHIFT; 2787 2788 send_sig_info(SIGBUS, &info, tsk); 2789} 2790 2791static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) 2792{ 2793 /* 2794 * Do not cache the mmio info caused by writing the readonly gfn 2795 * into the spte; otherwise a read access on the readonly gfn can 2796 * also cause an mmio page fault and be treated as mmio access. 2797 * Return 1 to tell kvm to emulate it. 2798 */ 2799 if (pfn == KVM_PFN_ERR_RO_FAULT) 2800 return 1; 2801 2802 if (pfn == KVM_PFN_ERR_HWPOISON) { 2803 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); 2804 return 0; 2805 } 2806 2807 return -EFAULT; 2808} 2809 2810static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2811 gfn_t *gfnp, kvm_pfn_t *pfnp, 2812 int *levelp) 2813{ 2814 kvm_pfn_t pfn = *pfnp; 2815 gfn_t gfn = *gfnp; 2816 int level = *levelp; 2817 2818 /* 2819 * Check if it's a transparent hugepage. If this would be a 2820 * hugetlbfs page, level wouldn't be set to 2821 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done 2822 * here. 2823 */ 2824 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && 2825 level == PT_PAGE_TABLE_LEVEL && 2826 PageTransCompoundMap(pfn_to_page(pfn)) && 2827 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { 2828 unsigned long mask; 2829 /* 2830 * mmu_notifier_retry was successful and we hold the 2831 * mmu_lock here, so the pmd can't be split under us, 2832 * and in turn 2833 * __split_huge_page_refcount() can't run under 2834 * us and we can safely transfer the refcount from 2835 * PG_tail to PG_head as we switch the pfn from tail to 2836 * head.
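 * For example, with a 2MB mapping, mask is 511: gfn 0x1a34 and pfn 0x5634 share the low bits 0x034, and both are rounded down to 0x1a00 and 0x5600.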
2837 */ 2838 *levelp = level = PT_DIRECTORY_LEVEL; 2839 mask = KVM_PAGES_PER_HPAGE(level) - 1; 2840 VM_BUG_ON((gfn & mask) != (pfn & mask)); 2841 if (pfn & mask) { 2842 gfn &= ~mask; 2843 *gfnp = gfn; 2844 kvm_release_pfn_clean(pfn); 2845 pfn &= ~mask; 2846 kvm_get_pfn(pfn); 2847 *pfnp = pfn; 2848 } 2849 } 2850} 2851 2852static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2853 kvm_pfn_t pfn, unsigned access, int *ret_val) 2854{ 2855 /* The pfn is invalid, report the error! */ 2856 if (unlikely(is_error_pfn(pfn))) { 2857 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); 2858 return true; 2859 } 2860 2861 if (unlikely(is_noslot_pfn(pfn))) 2862 vcpu_cache_mmio_info(vcpu, gva, gfn, access); 2863 2864 return false; 2865} 2866 2867static bool page_fault_can_be_fast(u32 error_code) 2868{ 2869 /* 2870 * Do not fix the mmio spte with invalid generation number which 2871 * need to be updated by slow page fault path. 2872 */ 2873 if (unlikely(error_code & PFERR_RSVD_MASK)) 2874 return false; 2875 2876 /* 2877 * #PF can be fast only if the shadow page table is present and it 2878 * is caused by write-protect, that means we just need change the 2879 * W bit of the spte which can be done out of mmu-lock. 2880 */ 2881 if (!(error_code & PFERR_PRESENT_MASK) || 2882 !(error_code & PFERR_WRITE_MASK)) 2883 return false; 2884 2885 return true; 2886} 2887 2888static bool 2889fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2890 u64 *sptep, u64 spte) 2891{ 2892 gfn_t gfn; 2893 2894 WARN_ON(!sp->role.direct); 2895 2896 /* 2897 * The gfn of direct spte is stable since it is calculated 2898 * by sp->gfn. 2899 */ 2900 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 2901 2902 /* 2903 * Theoretically we could also set dirty bit (and flush TLB) here in 2904 * order to eliminate unnecessary PML logging. See comments in 2905 * set_spte. But fast_page_fault is very unlikely to happen with PML 2906 * enabled, so we do not do this. This might result in the same GPA 2907 * to be logged in PML buffer again when the write really happens, and 2908 * eventually to be called by mark_page_dirty twice. But it's also no 2909 * harm. This also avoids the TLB flush needed after setting dirty bit 2910 * so non-PML cases won't be impacted. 2911 * 2912 * Compare with set_spte where instead shadow_dirty_mask is set. 2913 */ 2914 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2915 kvm_vcpu_mark_page_dirty(vcpu, gfn); 2916 2917 return true; 2918} 2919 2920/* 2921 * Return value: 2922 * - true: let the vcpu to access on the same address again. 2923 * - false: let the real page fault path to fix it. 2924 */ 2925static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, 2926 u32 error_code) 2927{ 2928 struct kvm_shadow_walk_iterator iterator; 2929 struct kvm_mmu_page *sp; 2930 bool ret = false; 2931 u64 spte = 0ull; 2932 2933 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2934 return false; 2935 2936 if (!page_fault_can_be_fast(error_code)) 2937 return false; 2938 2939 walk_shadow_page_lockless_begin(vcpu); 2940 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) 2941 if (!is_shadow_present_pte(spte) || iterator.level < level) 2942 break; 2943 2944 /* 2945 * If the mapping has been changed, let the vcpu fault on the 2946 * same address again. 
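 * (Retrying is safe: at worst the vcpu faults once more and takes the regular slow path under mmu_lock.)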
2947 */ 2948 if (!is_shadow_present_pte(spte)) { 2949 ret = true; 2950 goto exit; 2951 } 2952 2953 sp = page_header(__pa(iterator.sptep)); 2954 if (!is_last_spte(spte, sp->role.level)) 2955 goto exit; 2956 2957 /* 2958 * Check if it is a spurious fault caused by TLB lazily flushed. 2959 * 2960 * Need not check the access of upper level table entries since 2961 * they are always ACC_ALL. 2962 */ 2963 if (is_writable_pte(spte)) { 2964 ret = true; 2965 goto exit; 2966 } 2967 2968 /* 2969 * Currently, to simplify the code, only the spte write-protected 2970 * by dirty-log can be fast fixed. 2971 */ 2972 if (!spte_is_locklessly_modifiable(spte)) 2973 goto exit; 2974 2975 /* 2976 * Do not fix write-permission on the large spte since we only dirty 2977 * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() 2978 * that means other pages are missed if its slot is dirty-logged. 2979 * 2980 * Instead, we let the slow page fault path create a normal spte to 2981 * fix the access. 2982 * 2983 * See the comments in kvm_arch_commit_memory_region(). 2984 */ 2985 if (sp->role.level > PT_PAGE_TABLE_LEVEL) 2986 goto exit; 2987 2988 /* 2989 * Currently, fast page fault only works for direct mapping since 2990 * the gfn is not stable for indirect shadow page. 2991 * See Documentation/virtual/kvm/locking.txt to get more detail. 2992 */ 2993 ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); 2994exit: 2995 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 2996 spte, ret); 2997 walk_shadow_page_lockless_end(vcpu); 2998 2999 return ret; 3000} 3001 3002static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3003 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); 3004static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 3005 3006static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 3007 gfn_t gfn, bool prefault) 3008{ 3009 int r; 3010 int level; 3011 bool force_pt_level = false; 3012 kvm_pfn_t pfn; 3013 unsigned long mmu_seq; 3014 bool map_writable, write = error_code & PFERR_WRITE_MASK; 3015 3016 level = mapping_level(vcpu, gfn, &force_pt_level); 3017 if (likely(!force_pt_level)) { 3018 /* 3019 * This path builds a PAE pagetable - so we can map 3020 * 2mb pages at maximum. Therefore check if the level 3021 * is larger than that. 
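 * (One PT_DIRECTORY_LEVEL entry covers 2MB: 512 PTEs times 4kB.)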
3022 */ 3023 if (level > PT_DIRECTORY_LEVEL) 3024 level = PT_DIRECTORY_LEVEL; 3025 3026 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3027 } 3028 3029 if (fast_page_fault(vcpu, v, level, error_code)) 3030 return 0; 3031 3032 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3033 smp_rmb(); 3034 3035 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 3036 return 0; 3037 3038 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) 3039 return r; 3040 3041 spin_lock(&vcpu->kvm->mmu_lock); 3042 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3043 goto out_unlock; 3044 make_mmu_pages_available(vcpu); 3045 if (likely(!force_pt_level)) 3046 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3047 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); 3048 spin_unlock(&vcpu->kvm->mmu_lock); 3049 3050 return r; 3051 3052out_unlock: 3053 spin_unlock(&vcpu->kvm->mmu_lock); 3054 kvm_release_pfn_clean(pfn); 3055 return 0; 3056} 3057 3058 3059static void mmu_free_roots(struct kvm_vcpu *vcpu) 3060{ 3061 int i; 3062 struct kvm_mmu_page *sp; 3063 LIST_HEAD(invalid_list); 3064 3065 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3066 return; 3067 3068 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && 3069 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || 3070 vcpu->arch.mmu.direct_map)) { 3071 hpa_t root = vcpu->arch.mmu.root_hpa; 3072 3073 spin_lock(&vcpu->kvm->mmu_lock); 3074 sp = page_header(root); 3075 --sp->root_count; 3076 if (!sp->root_count && sp->role.invalid) { 3077 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 3078 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3079 } 3080 spin_unlock(&vcpu->kvm->mmu_lock); 3081 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3082 return; 3083 } 3084 3085 spin_lock(&vcpu->kvm->mmu_lock); 3086 for (i = 0; i < 4; ++i) { 3087 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3088 3089 if (root) { 3090 root &= PT64_BASE_ADDR_MASK; 3091 sp = page_header(root); 3092 --sp->root_count; 3093 if (!sp->root_count && sp->role.invalid) 3094 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3095 &invalid_list); 3096 } 3097 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 3098 } 3099 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3100 spin_unlock(&vcpu->kvm->mmu_lock); 3101 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3102} 3103 3104static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 3105{ 3106 int ret = 0; 3107 3108 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 3109 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3110 ret = 1; 3111 } 3112 3113 return ret; 3114} 3115 3116static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) 3117{ 3118 struct kvm_mmu_page *sp; 3119 unsigned i; 3120 3121 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3122 spin_lock(&vcpu->kvm->mmu_lock); 3123 make_mmu_pages_available(vcpu); 3124 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); 3125 ++sp->root_count; 3126 spin_unlock(&vcpu->kvm->mmu_lock); 3127 vcpu->arch.mmu.root_hpa = __pa(sp->spt); 3128 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { 3129 for (i = 0; i < 4; ++i) { 3130 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3131 3132 MMU_WARN_ON(VALID_PAGE(root)); 3133 spin_lock(&vcpu->kvm->mmu_lock); 3134 make_mmu_pages_available(vcpu); 3135 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3136 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); 3137 root = __pa(sp->spt); 3138 ++sp->root_count; 3139 spin_unlock(&vcpu->kvm->mmu_lock); 3140 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 3141 } 3142 vcpu->arch.mmu.root_hpa = 
__pa(vcpu->arch.mmu.pae_root); 3143 } else 3144 BUG(); 3145 3146 return 0; 3147} 3148 3149static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) 3150{ 3151 struct kvm_mmu_page *sp; 3152 u64 pdptr, pm_mask; 3153 gfn_t root_gfn; 3154 int i; 3155 3156 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; 3157 3158 if (mmu_check_root(vcpu, root_gfn)) 3159 return 1; 3160 3161 /* 3162 * Do we shadow a long mode page table? If so we need to 3163 * write-protect the guests page table root. 3164 */ 3165 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3166 hpa_t root = vcpu->arch.mmu.root_hpa; 3167 3168 MMU_WARN_ON(VALID_PAGE(root)); 3169 3170 spin_lock(&vcpu->kvm->mmu_lock); 3171 make_mmu_pages_available(vcpu); 3172 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 3173 0, ACC_ALL); 3174 root = __pa(sp->spt); 3175 ++sp->root_count; 3176 spin_unlock(&vcpu->kvm->mmu_lock); 3177 vcpu->arch.mmu.root_hpa = root; 3178 return 0; 3179 } 3180 3181 /* 3182 * We shadow a 32 bit page table. This may be a legacy 2-level 3183 * or a PAE 3-level page table. In either case we need to be aware that 3184 * the shadow page table may be a PAE or a long mode page table. 3185 */ 3186 pm_mask = PT_PRESENT_MASK; 3187 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) 3188 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3189 3190 for (i = 0; i < 4; ++i) { 3191 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3192 3193 MMU_WARN_ON(VALID_PAGE(root)); 3194 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 3195 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); 3196 if (!is_present_gpte(pdptr)) { 3197 vcpu->arch.mmu.pae_root[i] = 0; 3198 continue; 3199 } 3200 root_gfn = pdptr >> PAGE_SHIFT; 3201 if (mmu_check_root(vcpu, root_gfn)) 3202 return 1; 3203 } 3204 spin_lock(&vcpu->kvm->mmu_lock); 3205 make_mmu_pages_available(vcpu); 3206 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 3207 0, ACC_ALL); 3208 root = __pa(sp->spt); 3209 ++sp->root_count; 3210 spin_unlock(&vcpu->kvm->mmu_lock); 3211 3212 vcpu->arch.mmu.pae_root[i] = root | pm_mask; 3213 } 3214 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 3215 3216 /* 3217 * If we shadow a 32 bit page table with a long mode page 3218 * table we enter this path. 3219 */ 3220 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3221 if (vcpu->arch.mmu.lm_root == NULL) { 3222 /* 3223 * The additional page necessary for this is only 3224 * allocated on demand. 
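 * Only lm_root[0] is ever populated: it points at pae_root, which in turn holds the four roots filled in above.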
3225 */ 3226 3227 u64 *lm_root; 3228 3229 lm_root = (void*)get_zeroed_page(GFP_KERNEL); 3230 if (lm_root == NULL) 3231 return 1; 3232 3233 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; 3234 3235 vcpu->arch.mmu.lm_root = lm_root; 3236 } 3237 3238 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); 3239 } 3240 3241 return 0; 3242} 3243 3244static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 3245{ 3246 if (vcpu->arch.mmu.direct_map) 3247 return mmu_alloc_direct_roots(vcpu); 3248 else 3249 return mmu_alloc_shadow_roots(vcpu); 3250} 3251 3252static void mmu_sync_roots(struct kvm_vcpu *vcpu) 3253{ 3254 int i; 3255 struct kvm_mmu_page *sp; 3256 3257 if (vcpu->arch.mmu.direct_map) 3258 return; 3259 3260 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3261 return; 3262 3263 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3264 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3265 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3266 hpa_t root = vcpu->arch.mmu.root_hpa; 3267 sp = page_header(root); 3268 mmu_sync_children(vcpu, sp); 3269 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3270 return; 3271 } 3272 for (i = 0; i < 4; ++i) { 3273 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3274 3275 if (root && VALID_PAGE(root)) { 3276 root &= PT64_BASE_ADDR_MASK; 3277 sp = page_header(root); 3278 mmu_sync_children(vcpu, sp); 3279 } 3280 } 3281 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3282} 3283 3284void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3285{ 3286 spin_lock(&vcpu->kvm->mmu_lock); 3287 mmu_sync_roots(vcpu); 3288 spin_unlock(&vcpu->kvm->mmu_lock); 3289} 3290EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); 3291 3292static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3293 u32 access, struct x86_exception *exception) 3294{ 3295 if (exception) 3296 exception->error_code = 0; 3297 return vaddr; 3298} 3299 3300static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 3301 u32 access, 3302 struct x86_exception *exception) 3303{ 3304 if (exception) 3305 exception->error_code = 0; 3306 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); 3307} 3308 3309static bool 3310__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) 3311{ 3312 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; 3313 3314 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | 3315 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); 3316} 3317 3318static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) 3319{ 3320 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); 3321} 3322 3323static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) 3324{ 3325 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); 3326} 3327 3328static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3329{ 3330 if (direct) 3331 return vcpu_match_mmio_gpa(vcpu, addr); 3332 3333 return vcpu_match_mmio_gva(vcpu, addr); 3334} 3335 3336/* return true if reserved bit is detected on spte. 
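 * The walk below also records each spte on the path in sptes[] so the whole hierarchy can be dumped when a reserved bit is found.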
*/ 3337static bool 3338walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3339{ 3340 struct kvm_shadow_walk_iterator iterator; 3341 u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; 3342 int root, leaf; 3343 bool reserved = false; 3344 3345 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3346 goto exit; 3347 3348 walk_shadow_page_lockless_begin(vcpu); 3349 3350 for (shadow_walk_init(&iterator, vcpu, addr), 3351 leaf = root = iterator.level; 3352 shadow_walk_okay(&iterator); 3353 __shadow_walk_next(&iterator, spte)) { 3354 spte = mmu_spte_get_lockless(iterator.sptep); 3355 3356 sptes[leaf - 1] = spte; 3357 leaf--; 3358 3359 if (!is_shadow_present_pte(spte)) 3360 break; 3361 3362 reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, 3363 iterator.level); 3364 } 3365 3366 walk_shadow_page_lockless_end(vcpu); 3367 3368 if (reserved) { 3369 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", 3370 __func__, addr); 3371 while (root > leaf) { 3372 pr_err("------ spte 0x%llx level %d.\n", 3373 sptes[root - 1], root); 3374 root--; 3375 } 3376 } 3377exit: 3378 *sptep = spte; 3379 return reserved; 3380} 3381 3382int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3383{ 3384 u64 spte; 3385 bool reserved; 3386 3387 if (mmio_info_in_cache(vcpu, addr, direct)) 3388 return RET_MMIO_PF_EMULATE; 3389 3390 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); 3391 if (WARN_ON(reserved)) 3392 return RET_MMIO_PF_BUG; 3393 3394 if (is_mmio_spte(spte)) { 3395 gfn_t gfn = get_mmio_spte_gfn(spte); 3396 unsigned access = get_mmio_spte_access(spte); 3397 3398 if (!check_mmio_spte(vcpu, spte)) 3399 return RET_MMIO_PF_INVALID; 3400 3401 if (direct) 3402 addr = 0; 3403 3404 trace_handle_mmio_page_fault(addr, gfn, access); 3405 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3406 return RET_MMIO_PF_EMULATE; 3407 } 3408 3409 /* 3410 * If the page table is zapped by other cpus, let CPU fault again on 3411 * the address. 3412 */ 3413 return RET_MMIO_PF_RETRY; 3414} 3415EXPORT_SYMBOL_GPL(handle_mmio_page_fault); 3416 3417static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, 3418 u32 error_code, gfn_t gfn) 3419{ 3420 if (unlikely(error_code & PFERR_RSVD_MASK)) 3421 return false; 3422 3423 if (!(error_code & PFERR_PRESENT_MASK) || 3424 !(error_code & PFERR_WRITE_MASK)) 3425 return false; 3426 3427 /* 3428 * guest is writing the page which is write tracked which can 3429 * not be fixed by page fault handler. 
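 * Returning true sends the fault down the emulation path, which is the only way to make progress on a write-tracked gfn.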
3430 */ 3431 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) 3432 return true; 3433 3434 return false; 3435} 3436 3437static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) 3438{ 3439 struct kvm_shadow_walk_iterator iterator; 3440 u64 spte; 3441 3442 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3443 return; 3444 3445 walk_shadow_page_lockless_begin(vcpu); 3446 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { 3447 clear_sp_write_flooding_count(iterator.sptep); 3448 if (!is_shadow_present_pte(spte)) 3449 break; 3450 } 3451 walk_shadow_page_lockless_end(vcpu); 3452} 3453 3454static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 3455 u32 error_code, bool prefault) 3456{ 3457 gfn_t gfn = gva >> PAGE_SHIFT; 3458 int r; 3459 3460 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 3461 3462 if (page_fault_handle_page_track(vcpu, error_code, gfn)) 3463 return 1; 3464 3465 r = mmu_topup_memory_caches(vcpu); 3466 if (r) 3467 return r; 3468 3469 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3470 3471 3472 return nonpaging_map(vcpu, gva & PAGE_MASK, 3473 error_code, gfn, prefault); 3474} 3475 3476static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 3477{ 3478 struct kvm_arch_async_pf arch; 3479 3480 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; 3481 arch.gfn = gfn; 3482 arch.direct_map = vcpu->arch.mmu.direct_map; 3483 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); 3484 3485 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 3486} 3487 3488static bool can_do_async_pf(struct kvm_vcpu *vcpu) 3489{ 3490 if (unlikely(!lapic_in_kernel(vcpu) || 3491 kvm_event_needs_reinjection(vcpu))) 3492 return false; 3493 3494 return kvm_x86_ops->interrupt_allowed(vcpu); 3495} 3496 3497static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3498 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) 3499{ 3500 struct kvm_memory_slot *slot; 3501 bool async; 3502 3503 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 3504 async = false; 3505 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); 3506 if (!async) 3507 return false; /* *pfn has correct page already */ 3508 3509 if (!prefault && can_do_async_pf(vcpu)) { 3510 trace_kvm_try_async_get_page(gva, gfn); 3511 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 3512 trace_kvm_async_pf_doublefault(gva, gfn); 3513 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 3514 return true; 3515 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) 3516 return true; 3517 } 3518 3519 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); 3520 return false; 3521} 3522 3523static bool 3524check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) 3525{ 3526 int page_num = KVM_PAGES_PER_HPAGE(level); 3527 3528 gfn &= ~(page_num - 1); 3529 3530 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); 3531} 3532 3533static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, 3534 bool prefault) 3535{ 3536 kvm_pfn_t pfn; 3537 int r; 3538 int level; 3539 bool force_pt_level; 3540 gfn_t gfn = gpa >> PAGE_SHIFT; 3541 unsigned long mmu_seq; 3542 int write = error_code & PFERR_WRITE_MASK; 3543 bool map_writable; 3544 3545 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3546 3547 if (page_fault_handle_page_track(vcpu, error_code, gfn)) 3548 return 1; 3549 3550 r = mmu_topup_memory_caches(vcpu); 3551 if (r) 3552 return r; 3553 3554 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, 
3555 PT_DIRECTORY_LEVEL); 3556 level = mapping_level(vcpu, gfn, &force_pt_level); 3557 if (likely(!force_pt_level)) { 3558 if (level > PT_DIRECTORY_LEVEL && 3559 !check_hugepage_cache_consistency(vcpu, gfn, level)) 3560 level = PT_DIRECTORY_LEVEL; 3561 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3562 } 3563 3564 if (fast_page_fault(vcpu, gpa, level, error_code)) 3565 return 0; 3566 3567 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3568 smp_rmb(); 3569 3570 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3571 return 0; 3572 3573 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) 3574 return r; 3575 3576 spin_lock(&vcpu->kvm->mmu_lock); 3577 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3578 goto out_unlock; 3579 make_mmu_pages_available(vcpu); 3580 if (likely(!force_pt_level)) 3581 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3582 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); 3583 spin_unlock(&vcpu->kvm->mmu_lock); 3584 3585 return r; 3586 3587out_unlock: 3588 spin_unlock(&vcpu->kvm->mmu_lock); 3589 kvm_release_pfn_clean(pfn); 3590 return 0; 3591} 3592 3593static void nonpaging_init_context(struct kvm_vcpu *vcpu, 3594 struct kvm_mmu *context) 3595{ 3596 context->page_fault = nonpaging_page_fault; 3597 context->gva_to_gpa = nonpaging_gva_to_gpa; 3598 context->sync_page = nonpaging_sync_page; 3599 context->invlpg = nonpaging_invlpg; 3600 context->update_pte = nonpaging_update_pte; 3601 context->root_level = 0; 3602 context->shadow_root_level = PT32E_ROOT_LEVEL; 3603 context->root_hpa = INVALID_PAGE; 3604 context->direct_map = true; 3605 context->nx = false; 3606} 3607 3608void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) 3609{ 3610 mmu_free_roots(vcpu); 3611} 3612 3613static unsigned long get_cr3(struct kvm_vcpu *vcpu) 3614{ 3615 return kvm_read_cr3(vcpu); 3616} 3617 3618static void inject_page_fault(struct kvm_vcpu *vcpu, 3619 struct x86_exception *fault) 3620{ 3621 vcpu->arch.mmu.inject_page_fault(vcpu, fault); 3622} 3623 3624static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 3625 unsigned access, int *nr_present) 3626{ 3627 if (unlikely(is_mmio_spte(*sptep))) { 3628 if (gfn != get_mmio_spte_gfn(*sptep)) { 3629 mmu_spte_clear_no_track(sptep); 3630 return true; 3631 } 3632 3633 (*nr_present)++; 3634 mark_mmio_spte(vcpu, sptep, gfn, access); 3635 return true; 3636 } 3637 3638 return false; 3639} 3640 3641static inline bool is_last_gpte(struct kvm_mmu *mmu, 3642 unsigned level, unsigned gpte) 3643{ 3644 /* 3645 * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set 3646 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means 3647 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. 3648 */ 3649 gpte |= level - PT_PAGE_TABLE_LEVEL - 1; 3650 3651 /* 3652 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. 3653 * If it is clear, there are no large pages at this level, so clear 3654 * PT_PAGE_SIZE_MASK in gpte if that is the case. 
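 * Concretely, with last_nonleaf_level == 4: a level-1 gpte gets bit 7 forced on by the OR above, a level-4 gpte gets it cleared by the AND below, and at levels 2 and 3 the gpte's own PS bit decides.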
3655 */ 3656 gpte &= level - mmu->last_nonleaf_level; 3657 3658 return gpte & PT_PAGE_SIZE_MASK; 3659} 3660 3661#define PTTYPE_EPT 18 /* arbitrary */ 3662#define PTTYPE PTTYPE_EPT 3663#include "paging_tmpl.h" 3664#undef PTTYPE 3665 3666#define PTTYPE 64 3667#include "paging_tmpl.h" 3668#undef PTTYPE 3669 3670#define PTTYPE 32 3671#include "paging_tmpl.h" 3672#undef PTTYPE 3673 3674static void 3675__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 3676 struct rsvd_bits_validate *rsvd_check, 3677 int maxphyaddr, int level, bool nx, bool gbpages, 3678 bool pse, bool amd) 3679{ 3680 u64 exb_bit_rsvd = 0; 3681 u64 gbpages_bit_rsvd = 0; 3682 u64 nonleaf_bit8_rsvd = 0; 3683 3684 rsvd_check->bad_mt_xwr = 0; 3685 3686 if (!nx) 3687 exb_bit_rsvd = rsvd_bits(63, 63); 3688 if (!gbpages) 3689 gbpages_bit_rsvd = rsvd_bits(7, 7); 3690 3691 /* 3692 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for 3693 * leaf entries) on AMD CPUs only. 3694 */ 3695 if (amd) 3696 nonleaf_bit8_rsvd = rsvd_bits(8, 8); 3697 3698 switch (level) { 3699 case PT32_ROOT_LEVEL: 3700 /* no rsvd bits for 2 level 4K page table entries */ 3701 rsvd_check->rsvd_bits_mask[0][1] = 0; 3702 rsvd_check->rsvd_bits_mask[0][0] = 0; 3703 rsvd_check->rsvd_bits_mask[1][0] = 3704 rsvd_check->rsvd_bits_mask[0][0]; 3705 3706 if (!pse) { 3707 rsvd_check->rsvd_bits_mask[1][1] = 0; 3708 break; 3709 } 3710 3711 if (is_cpuid_PSE36()) 3712 /* 36bits PSE 4MB page */ 3713 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 3714 else 3715 /* 32 bits PSE 4MB page */ 3716 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 3717 break; 3718 case PT32E_ROOT_LEVEL: 3719 rsvd_check->rsvd_bits_mask[0][2] = 3720 rsvd_bits(maxphyaddr, 63) | 3721 rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ 3722 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3723 rsvd_bits(maxphyaddr, 62); /* PDE */ 3724 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3725 rsvd_bits(maxphyaddr, 62); /* PTE */ 3726 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | 3727 rsvd_bits(maxphyaddr, 62) | 3728 rsvd_bits(13, 20); /* large page */ 3729 rsvd_check->rsvd_bits_mask[1][0] = 3730 rsvd_check->rsvd_bits_mask[0][0]; 3731 break; 3732 case PT64_ROOT_LEVEL: 3733 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | 3734 nonleaf_bit8_rsvd | rsvd_bits(7, 7) | 3735 rsvd_bits(maxphyaddr, 51); 3736 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | 3737 nonleaf_bit8_rsvd | gbpages_bit_rsvd | 3738 rsvd_bits(maxphyaddr, 51); 3739 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3740 rsvd_bits(maxphyaddr, 51); 3741 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3742 rsvd_bits(maxphyaddr, 51); 3743 rsvd_check->rsvd_bits_mask[1][3] = 3744 rsvd_check->rsvd_bits_mask[0][3]; 3745 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | 3746 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | 3747 rsvd_bits(13, 29); 3748 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | 3749 rsvd_bits(maxphyaddr, 51) | 3750 rsvd_bits(13, 20); /* large page */ 3751 rsvd_check->rsvd_bits_mask[1][0] = 3752 rsvd_check->rsvd_bits_mask[0][0]; 3753 break; 3754 } 3755} 3756 3757static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 3758 struct kvm_mmu *context) 3759{ 3760 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, 3761 cpuid_maxphyaddr(vcpu), context->root_level, 3762 context->nx, guest_cpuid_has_gbpages(vcpu), 3763 is_pse(vcpu), guest_cpuid_is_amd(vcpu)); 3764} 3765 3766static void 3767__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 3768 int maxphyaddr, bool execonly) 3769{ 3770 u64 bad_mt_xwr; 
3771 3772 rsvd_check->rsvd_bits_mask[0][3] = 3773 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 3774 rsvd_check->rsvd_bits_mask[0][2] = 3775 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); 3776 rsvd_check->rsvd_bits_mask[0][1] = 3777 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); 3778 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); 3779 3780 /* large page */ 3781 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 3782 rsvd_check->rsvd_bits_mask[1][2] = 3783 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); 3784 rsvd_check->rsvd_bits_mask[1][1] = 3785 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); 3786 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 3787 3788 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ 3789 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ 3790 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ 3791 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ 3792 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ 3793 if (!execonly) { 3794 /* bits 0..2 must not be 100 unless VMX capabilities allow it */ 3795 bad_mt_xwr |= REPEAT_BYTE(1ull << 4); 3796 } 3797 rsvd_check->bad_mt_xwr = bad_mt_xwr; 3798} 3799 3800static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 3801 struct kvm_mmu *context, bool execonly) 3802{ 3803 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, 3804 cpuid_maxphyaddr(vcpu), execonly); 3805} 3806 3807/* 3808 * the page table on the host is the shadow page table for the page 3809 * table in the guest or an amd nested guest; its mmu features 3810 * completely follow the features in the guest. 3811 */ 3812void 3813reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3814{ 3815 bool uses_nx = context->nx || context->base_role.smep_andnot_wp; 3816 3817 /* 3818 * Passing "true" to the last argument is okay; it adds a check 3819 * on bit 8 of the SPTEs which KVM doesn't use anyway. 3820 */ 3821 __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 3822 boot_cpu_data.x86_phys_bits, 3823 context->shadow_root_level, uses_nx, 3824 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 3825 true); 3826} 3827EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); 3828 3829static inline bool boot_cpu_is_amd(void) 3830{ 3831 WARN_ON_ONCE(!tdp_enabled); 3832 return shadow_x_mask == 0; 3833} 3834 3835/* 3836 * the direct page table on the host uses as many mmu features as 3837 * possible; however, kvm currently does not do execution-protection. 3838 */ 3839static void 3840reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 3841 struct kvm_mmu *context) 3842{ 3843 if (boot_cpu_is_amd()) 3844 __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 3845 boot_cpu_data.x86_phys_bits, 3846 context->shadow_root_level, false, 3847 cpu_has_gbpages, true, true); 3848 else 3849 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 3850 boot_cpu_data.x86_phys_bits, 3851 false); 3852 3853} 3854 3855/* 3856 * same as the comments in reset_shadow_zero_bits_mask(), except that 3857 * this is the shadow page table for an intel nested guest.
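 * (execonly only changes bad_mt_xwr: when it is set, the execute-only combination 100 is legal instead of reserved.)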
3858 */ 3859static void 3860reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 3861 struct kvm_mmu *context, bool execonly) 3862{ 3863 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 3864 boot_cpu_data.x86_phys_bits, execonly); 3865} 3866 3867static void update_permission_bitmask(struct kvm_vcpu *vcpu, 3868 struct kvm_mmu *mmu, bool ept) 3869{ 3870 unsigned bit, byte, pfec; 3871 u8 map; 3872 bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; 3873 3874 cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3875 cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); 3876 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 3877 pfec = byte << 1; 3878 map = 0; 3879 wf = pfec & PFERR_WRITE_MASK; 3880 uf = pfec & PFERR_USER_MASK; 3881 ff = pfec & PFERR_FETCH_MASK; 3882 /* 3883 * PFERR_RSVD_MASK bit is set in PFEC if the access is not 3884 * subject to SMAP restrictions, and cleared otherwise. The 3885 * bit is only meaningful if the SMAP bit is set in CR4. 3886 */ 3887 smapf = !(pfec & PFERR_RSVD_MASK); 3888 for (bit = 0; bit < 8; ++bit) { 3889 x = bit & ACC_EXEC_MASK; 3890 w = bit & ACC_WRITE_MASK; 3891 u = bit & ACC_USER_MASK; 3892 3893 if (!ept) { 3894 /* Not really needed: !nx will cause pte.nx to fault */ 3895 x |= !mmu->nx; 3896 /* Allow supervisor writes if !cr0.wp */ 3897 w |= !is_write_protection(vcpu) && !uf; 3898 /* Disallow supervisor fetches of user code if cr4.smep */ 3899 x &= !(cr4_smep && u && !uf); 3900 3901 /* 3902 * SMAP:kernel-mode data accesses from user-mode 3903 * mappings should fault. A fault is considered 3904 * as a SMAP violation if all of the following 3905 * conditions are true: 3906 * - X86_CR4_SMAP is set in CR4 3907 * - A user page is accessed 3908 * - Page fault in kernel mode 3909 * - if CPL = 3 or X86_EFLAGS_AC is clear 3910 * 3911 * Here, we cover the first three conditions. 3912 * The fourth is computed dynamically in 3913 * permission_fault() and is in smapf. 3914 * 3915 * Also, SMAP does not affect instruction 3916 * fetches, add the !ff check here to make it 3917 * clearer. 3918 */ 3919 smap = cr4_smap && u && !uf && !ff; 3920 } else 3921 /* Not really needed: no U/S accesses on ept */ 3922 u = 1; 3923 3924 fault = (ff && !x) || (uf && !u) || (wf && !w) || 3925 (smapf && smap); 3926 map |= fault << bit; 3927 } 3928 mmu->permissions[byte] = map; 3929 } 3930} 3931 3932/* 3933* PKU is an additional mechanism by which the paging controls access to 3934* user-mode addresses based on the value in the PKRU register. Protection 3935* key violations are reported through a bit in the page fault error code. 3936* Unlike other bits of the error code, the PK bit is not known at the 3937* call site of e.g. gva_to_gpa; it must be computed directly in 3938* permission_fault based on two bits of PKRU, on some machine state (CR4, 3939* CR0, EFER, CPL), and on other bits of the error code and the page tables. 3940* 3941* In particular the following conditions come from the error code, the 3942* page tables and the machine state: 3943* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 3944* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) 3945* - PK is always zero if U=0 in the page tables 3946* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. 3947* 3948* The PKRU bitmask caches the result of these four conditions. The error 3949* code (minus the P bit) and the page table's U bit form an index into the 3950* PKRU bitmask.
Two bits of the PKRU bitmask are then extracted and ANDed 3951* with the two bits of the PKRU register corresponding to the protection key. 3952* For the first three conditions above the bits will be 00, thus masking 3953* away both AD and WD. For all reads or if the last condition holds, WD 3954* only will be masked away. 3955*/ 3956static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 3957 bool ept) 3958{ 3959 unsigned bit; 3960 bool wp; 3961 3962 if (ept) { 3963 mmu->pkru_mask = 0; 3964 return; 3965 } 3966 3967 /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ 3968 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { 3969 mmu->pkru_mask = 0; 3970 return; 3971 } 3972 3973 wp = is_write_protection(vcpu); 3974 3975 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { 3976 unsigned pfec, pkey_bits; 3977 bool check_pkey, check_write, ff, uf, wf, pte_user; 3978 3979 pfec = bit << 1; 3980 ff = pfec & PFERR_FETCH_MASK; 3981 uf = pfec & PFERR_USER_MASK; 3982 wf = pfec & PFERR_WRITE_MASK; 3983 3984 /* PFEC.RSVD is replaced by ACC_USER_MASK. */ 3985 pte_user = pfec & PFERR_RSVD_MASK; 3986 3987 /* 3988 * Only need to check the access which is not an 3989 * instruction fetch and is to a user page. 3990 */ 3991 check_pkey = (!ff && pte_user); 3992 /* 3993 * write access is controlled by PKRU if it is a 3994 * user access or CR0.WP = 1. 3995 */ 3996 check_write = check_pkey && wf && (uf || wp); 3997 3998 /* PKRU.AD stops both read and write access. */ 3999 pkey_bits = !!check_pkey; 4000 /* PKRU.WD stops write access. */ 4001 pkey_bits |= (!!check_write) << 1; 4002 4003 mmu->pkru_mask |= (pkey_bits & 3) << pfec; 4004 } 4005} 4006 4007static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 4008{ 4009 unsigned root_level = mmu->root_level; 4010 4011 mmu->last_nonleaf_level = root_level; 4012 if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) 4013 mmu->last_nonleaf_level++; 4014} 4015 4016static void paging64_init_context_common(struct kvm_vcpu *vcpu, 4017 struct kvm_mmu *context, 4018 int level) 4019{ 4020 context->nx = is_nx(vcpu); 4021 context->root_level = level; 4022 4023 reset_rsvds_bits_mask(vcpu, context); 4024 update_permission_bitmask(vcpu, context, false); 4025 update_pkru_bitmask(vcpu, context, false); 4026 update_last_nonleaf_level(vcpu, context); 4027 4028 MMU_WARN_ON(!is_pae(vcpu)); 4029 context->page_fault = paging64_page_fault; 4030 context->gva_to_gpa = paging64_gva_to_gpa; 4031 context->sync_page = paging64_sync_page; 4032 context->invlpg = paging64_invlpg; 4033 context->update_pte = paging64_update_pte; 4034 context->shadow_root_level = level; 4035 context->root_hpa = INVALID_PAGE; 4036 context->direct_map = false; 4037} 4038 4039static void paging64_init_context(struct kvm_vcpu *vcpu, 4040 struct kvm_mmu *context) 4041{ 4042 paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); 4043} 4044 4045static void paging32_init_context(struct kvm_vcpu *vcpu, 4046 struct kvm_mmu *context) 4047{ 4048 context->nx = false; 4049 context->root_level = PT32_ROOT_LEVEL; 4050 4051 reset_rsvds_bits_mask(vcpu, context); 4052 update_permission_bitmask(vcpu, context, false); 4053 update_pkru_bitmask(vcpu, context, false); 4054 update_last_nonleaf_level(vcpu, context); 4055 4056 context->page_fault = paging32_page_fault; 4057 context->gva_to_gpa = paging32_gva_to_gpa; 4058 context->sync_page = paging32_sync_page; 4059 context->invlpg = paging32_invlpg; 4060 context->update_pte = paging32_update_pte; 4061 
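 /* A 32-bit non-PAE guest is still shadowed with a PAE page table, so guest 4MB pages can only be mapped by 2MB or 4kB sptes. */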
static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
	unsigned root_level = mmu->root_level;

	mmu->last_nonleaf_level = root_level;
	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
		mmu->last_nonleaf_level++;
}

static void paging64_init_context_common(struct kvm_vcpu *vcpu,
					 struct kvm_mmu *context,
					 int level)
{
	context->nx = is_nx(vcpu);
	context->root_level = level;

	reset_rsvds_bits_mask(vcpu, context);
	update_permission_bitmask(vcpu, context, false);
	update_pkru_bitmask(vcpu, context, false);
	update_last_nonleaf_level(vcpu, context);

	MMU_WARN_ON(!is_pae(vcpu));
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->sync_page = paging64_sync_page;
	context->invlpg = paging64_invlpg;
	context->update_pte = paging64_update_pte;
	context->shadow_root_level = level;
	context->root_hpa = INVALID_PAGE;
	context->direct_map = false;
}

static void paging64_init_context(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
	paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
}

static void paging32_init_context(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
	context->nx = false;
	context->root_level = PT32_ROOT_LEVEL;

	reset_rsvds_bits_mask(vcpu, context);
	update_permission_bitmask(vcpu, context, false);
	update_pkru_bitmask(vcpu, context, false);
	update_last_nonleaf_level(vcpu, context);

	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->sync_page = paging32_sync_page;
	context->invlpg = paging32_invlpg;
	context->update_pte = paging32_update_pte;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	context->direct_map = false;
}

static void paging32E_init_context(struct kvm_vcpu *vcpu,
				   struct kvm_mmu *context)
{
	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}

static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	context->base_role.word = 0;
	context->base_role.smm = is_smm(vcpu);
	context->page_fault = tdp_page_fault;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->update_pte = nonpaging_update_pte;
	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
	context->root_hpa = INVALID_PAGE;
	context->direct_map = true;
	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
	context->get_cr3 = get_cr3;
	context->get_pdptr = kvm_pdptr_read;
	context->inject_page_fault = kvm_inject_page_fault;

	if (!is_paging(vcpu)) {
		context->nx = false;
		context->gva_to_gpa = nonpaging_gva_to_gpa;
		context->root_level = 0;
	} else if (is_long_mode(vcpu)) {
		context->nx = is_nx(vcpu);
		context->root_level = PT64_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, context);
		context->gva_to_gpa = paging64_gva_to_gpa;
	} else if (is_pae(vcpu)) {
		context->nx = is_nx(vcpu);
		context->root_level = PT32E_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, context);
		context->gva_to_gpa = paging64_gva_to_gpa;
	} else {
		context->nx = false;
		context->root_level = PT32_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, context);
		context->gva_to_gpa = paging32_gva_to_gpa;
	}

	update_permission_bitmask(vcpu, context, false);
	update_pkru_bitmask(vcpu, context, false);
	update_last_nonleaf_level(vcpu, context);
	reset_tdp_shadow_zero_bits_mask(vcpu, context);
}

void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
	struct kvm_mmu *context = &vcpu->arch.mmu;

	MMU_WARN_ON(VALID_PAGE(context->root_hpa));

	if (!is_paging(vcpu))
		nonpaging_init_context(vcpu, context);
	else if (is_long_mode(vcpu))
		paging64_init_context(vcpu, context);
	else if (is_pae(vcpu))
		paging32E_init_context(vcpu, context);
	else
		paging32_init_context(vcpu, context);

	context->base_role.nxe = is_nx(vcpu);
	context->base_role.cr4_pae = !!is_pae(vcpu);
	context->base_role.cr0_wp = is_write_protection(vcpu);
	context->base_role.smep_andnot_wp
		= smep && !is_write_protection(vcpu);
	context->base_role.smap_andnot_wp
		= smap && !is_write_protection(vcpu);
	context->base_role.smm = is_smm(vcpu);
	reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);

void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	MMU_WARN_ON(VALID_PAGE(context->root_hpa));

	context->shadow_root_level = kvm_x86_ops->get_tdp_level();

	context->nx = true;
	context->page_fault = ept_page_fault;
	context->gva_to_gpa = ept_gva_to_gpa;
	context->sync_page = ept_sync_page;
	context->invlpg = ept_invlpg;
	context->update_pte = ept_update_pte;
	context->root_level = context->shadow_root_level;
	context->root_hpa = INVALID_PAGE;
	context->direct_map = false;

	update_permission_bitmask(vcpu, context, true);
	update_pkru_bitmask(vcpu, context, true);
	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);

static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;

	kvm_init_shadow_mmu(vcpu);
	context->set_cr3 = kvm_x86_ops->set_cr3;
	context->get_cr3 = get_cr3;
	context->get_pdptr = kvm_pdptr_read;
	context->inject_page_fault = kvm_inject_page_fault;
}

static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;

	g_context->get_cr3 = get_cr3;
	g_context->get_pdptr = kvm_pdptr_read;
	g_context->inject_page_fault = kvm_inject_page_fault;

	/*
	 * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
	 * L1's nested page tables (e.g. EPT12). The nested translation
	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
	 * L2's page tables as the first level of translation and L1's
	 * nested page tables as the second level of translation. Basically
	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
	 */
	if (!is_paging(vcpu)) {
		g_context->nx = false;
		g_context->root_level = 0;
		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
	} else if (is_long_mode(vcpu)) {
		g_context->nx = is_nx(vcpu);
		g_context->root_level = PT64_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, g_context);
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
	} else if (is_pae(vcpu)) {
		g_context->nx = is_nx(vcpu);
		g_context->root_level = PT32E_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, g_context);
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
	} else {
		g_context->nx = false;
		g_context->root_level = PT32_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, g_context);
		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
	}

	update_permission_bitmask(vcpu, g_context, false);
	update_pkru_bitmask(vcpu, g_context, false);
	update_last_nonleaf_level(vcpu, g_context);
}

static void init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	if (mmu_is_nested(vcpu))
		init_kvm_nested_mmu(vcpu);
	else if (tdp_enabled)
		init_kvm_tdp_mmu(vcpu);
	else
		init_kvm_softmmu(vcpu);
}

void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;
	r = mmu_alloc_roots(vcpu);
	kvm_mmu_sync_roots(vcpu);
	if (r)
		goto out;
	/* set_cr3() should ensure TLB has been flushed */
	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);

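/*
 * For reference (defined in mmu.h, shown here approximately): callers on
 * the vcpu entry path normally go through kvm_mmu_reload(), which only
 * falls back to the full kvm_mmu_load() when no root is active:
 *
 *	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
 *		return 0;
 *	return kvm_mmu_load(vcpu);
 */
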
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  const void *new)
{
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
		++vcpu->kvm->stat.mmu_pde_zapped;
		return;
	}

	++vcpu->kvm->stat.mmu_pte_updated;
	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
}

static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= shadow_nx_mask;
	new ^= shadow_nx_mask;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
				    const u8 *new, int *bytes)
{
	u64 gentry;
	int r;

	/*
	 * Assume that the pte write is on a page table of the same type
	 * as the current vcpu paging mode, since we update the sptes only
	 * when they have the same mode.
	 */
	if (is_pae(vcpu) && *bytes == 4) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		*gpa &= ~(gpa_t)7;
		*bytes = 8;
		r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
		if (r)
			gentry = 0;
		new = (const u8 *)&gentry;
	}

	switch (*bytes) {
	case 4:
		gentry = *(const u32 *)new;
		break;
	case 8:
		gentry = *(const u64 *)new;
		break;
	default:
		gentry = 0;
		break;
	}

	return gentry;
}

/*
 * If we're seeing too many writes to a page, it may no longer be a page table,
 * or we may be forking, in which case it is better to unmap the page.
 */
static bool detect_write_flooding(struct kvm_mmu_page *sp)
{
	/*
	 * Skip write-flooding detection for sps whose level is 1, because
	 * they can become unsync, in which case the guest page is no
	 * longer write-protected.
	 */
	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
		return false;

	atomic_inc(&sp->write_flooding_count);
	return atomic_read(&sp->write_flooding_count) >= 3;
}

/*
 * Misaligned accesses are too much trouble to fix up; also, they usually
 * indicate a page is not used as a page table.
 */
static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				    int bytes)
{
	unsigned offset, pte_size, misaligned;

	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
		 gpa, bytes, sp->role.word);

	offset = offset_in_page(gpa);
	pte_size = sp->role.cr4_pae ? 8 : 4;

	/*
	 * Sometimes the OS writes only a single byte of the gpte to update
	 * status bits; for example, Linux's clear_bit() uses the andb
	 * instruction.
	 */
	if (!(offset & (pte_size - 1)) && bytes == 1)
		return false;

	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	misaligned |= bytes < 4;

	return misaligned;
}

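/*
 * Worked example for detect_write_misaligned() above (illustrative):
 * with 8-byte gptes (pte_size = 8), a 4-byte write at page offset 6
 * gives
 *	(6 ^ (6 + 4 - 1)) & ~7  ==  (0x6 ^ 0x9) & ~7  ==  0xf & ~7  ==  8
 * which is nonzero: the write straddles two gptes and is treated as
 * misaligned. Writes shorter than 4 bytes are flagged as well, except
 * for the aligned single-byte status-bit update special-cased above.
 */
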
static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
{
	unsigned page_offset, quadrant;
	u64 *spte;
	int level;

	page_offset = offset_in_page(gpa);
	level = sp->role.level;
	*nspte = 1;
	if (!sp->role.cr4_pae) {
		page_offset <<= 1;	/* 32->64 */
		/*
		 * A 32-bit pde maps 4MB while the shadow pdes map
		 * only 2MB. So we need to double the offset again
		 * and zap two pdes instead of one.
		 */
		if (level == PT32_ROOT_LEVEL) {
			page_offset &= ~7; /* kill rounding error */
			page_offset <<= 1;
			*nspte = 2;
		}
		quadrant = page_offset >> PAGE_SHIFT;
		page_offset &= ~PAGE_MASK;
		if (quadrant != sp->role.quadrant)
			return NULL;
	}

	spte = &sp->spt[page_offset / sizeof(*spte)];
	return spte;
}

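/*
 * Worked example for get_written_sptes() above (illustrative): for a
 * non-PAE guest (role.cr4_pae == 0), a 4-byte gpte at page offset 0x14
 * (index 5) lands at shadow offset 0x28 after the 32->64 doubling, i.e.
 * at 8-byte spte index 5 of the matching quadrant. Offsets that double
 * past PAGE_SIZE select quadrant 1 instead, which is why sps whose
 * quadrant does not match are skipped (NULL is returned).
 */
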
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
			      const u8 *new, int bytes)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	u64 entry, gentry, *spte;
	int npte;
	bool remote_flush, local_flush;
	union kvm_mmu_page_role mask = { };

	mask.cr0_wp = 1;
	mask.cr4_pae = 1;
	mask.nxe = 1;
	mask.smep_andnot_wp = 1;
	mask.smap_andnot_wp = 1;
	mask.smm = 1;

	/*
	 * If we don't have indirect shadow pages, it means no page is
	 * write-protected, so we can simply exit.
	 */
	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
		return;

	remote_flush = local_flush = false;

	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);

	/*
	 * No need to check whether the memory allocation succeeded, since
	 * pte prefetch is skipped if the cache does not have enough
	 * objects.
	 */
	mmu_topup_memory_caches(vcpu);

	spin_lock(&vcpu->kvm->mmu_lock);
	++vcpu->kvm->stat.mmu_pte_write;
	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);

	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
		if (detect_write_misaligned(sp, gpa, bytes) ||
		    detect_write_flooding(sp)) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
			++vcpu->kvm->stat.mmu_flooded;
			continue;
		}

		spte = get_written_sptes(sp, gpa, &npte);
		if (!spte)
			continue;

		local_flush = true;
		while (npte--) {
			entry = *spte;
			mmu_page_zap_pte(vcpu->kvm, sp, spte);
			if (gentry &&
			    !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
			      & mask.word) && rmap_can_add(vcpu))
				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
			if (need_remote_flush(entry, *spte))
				remote_flush = true;
			++spte;
		}
	}
	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
	spin_unlock(&vcpu->kvm->mmu_lock);
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;
	int r;

	if (vcpu->arch.mmu.direct_map)
		return 0;

	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);

static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
	LIST_HEAD(invalid_list);

	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
		return;

	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
		if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
			break;

		++vcpu->kvm->stat.mmu_recycled;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
		       void *insn, int insn_len)
{
	int r, emulation_type = EMULTYPE_RETRY;
	enum emulation_result er;
	bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);

	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		r = handle_mmio_page_fault(vcpu, cr2, direct);
		if (r == RET_MMIO_PF_EMULATE) {
			emulation_type = 0;
			goto emulate;
		}
		if (r == RET_MMIO_PF_RETRY)
			return 1;
		if (r < 0)
			return r;
	}

	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
	if (r < 0)
		return r;
	if (!r)
		return 1;

	if (mmio_info_in_cache(vcpu, cr2, direct))
		emulation_type = 0;
emulate:
	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_USER_EXIT:
		++vcpu->stat.mmio_exits;
		/* fall through */
	case EMULATE_FAIL:
		return 0;
	default:
		BUG();
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	vcpu->arch.mmu.invlpg(vcpu, gva);
	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

void kvm_disable_tdp(void)
{
	tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
	if (vcpu->arch.mmu.lm_root != NULL)
		free_page((unsigned long)vcpu->arch.mmu.lm_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct page *page;
	int i;

	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		return -ENOMEM;

	vcpu->arch.mmu.pae_root = page_address(page);
	for (i = 0; i < 4; ++i)
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;

	return 0;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
	vcpu->arch.mmu.translate_gpa = translate_gpa;
	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;

	return alloc_mmu_pages(vcpu);
}

void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	init_kvm_mmu(vcpu);
}

void kvm_mmu_init_vm(struct kvm *kvm)
{
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

	node->track_write = kvm_mmu_pte_write;
	kvm_page_track_register_notifier(kvm, node);
}

void kvm_mmu_uninit_vm(struct kvm *kvm)
{
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

	kvm_page_track_unregister_notifier(kvm, node);
}

/* The return value indicates if tlb flush on all vcpus is needed. */
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);

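/*
 * Illustrative handler (hypothetical, not used by KVM): the contract is
 * to process one rmap bucket and return true iff the caller must flush
 * TLBs on all vcpus, e.g.
 *
 *	static bool slot_rmap_nop(struct kvm *kvm,
 *				  struct kvm_rmap_head *rmap_head)
 *	{
 *		return false;	(nothing changed, so no flush is needed)
 *	}
 *
 * which could then be applied to a whole memslot with
 *	slot_handle_all_level(kvm, memslot, slot_rmap_nop, false);
 */
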
/* The caller should hold mmu-lock before calling this function. */
static bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
			slot_level_handler fn, int start_level, int end_level,
			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
{
	struct slot_rmap_walk_iterator iterator;
	bool flush = false;

	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
				 end_gfn, &iterator) {
		if (iterator.rmap)
			flush |= fn(kvm, iterator.rmap);

		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
			if (flush && lock_flush_tlb) {
				kvm_flush_remote_tlbs(kvm);
				flush = false;
			}
			cond_resched_lock(&kvm->mmu_lock);
		}
	}

	if (flush && lock_flush_tlb) {
		kvm_flush_remote_tlbs(kvm);
		flush = false;
	}

	return flush;
}

static bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  slot_level_handler fn, int start_level, int end_level,
		  bool lock_flush_tlb)
{
	return slot_handle_level_range(kvm, memslot, fn, start_level,
			end_level, memslot->base_gfn,
			memslot->base_gfn + memslot->npages - 1,
			lock_flush_tlb);
}

static bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
		      slot_level_handler fn, bool lock_flush_tlb)
{
	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}

static bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
			slot_level_handler fn, bool lock_flush_tlb)
{
	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}

static bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
		 slot_level_handler fn, bool lock_flush_tlb)
{
	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
}

void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i;

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(memslot, slots) {
			gfn_t start, end;

			start = max(gfn_start, memslot->base_gfn);
			end = min(gfn_end, memslot->base_gfn + memslot->npages);
			if (start >= end)
				continue;

			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
						start, end - 1, true);
		}
	}

	spin_unlock(&kvm->mmu_lock);
}

static bool slot_rmap_write_protect(struct kvm *kvm,
				    struct kvm_rmap_head *rmap_head)
{
	return __rmap_write_protect(kvm, rmap_head, false);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				      struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
				      false);
	spin_unlock(&kvm->mmu_lock);

	/*
	 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log(),
	 * which do a tlb flush out of mmu-lock, must be serialized by
	 * kvm->slots_lock; otherwise the tlb flush could be missed.
	 */
	lockdep_assert_held(&kvm->slots_lock);

	/*
	 * We can flush all the TLBs out of the mmu lock without TLB
	 * corruption, since we only change sptes from writable to
	 * read-only; thus the only case to care about is a spte changing
	 * from present to present (a change from present to nonpresent
	 * flushes the TLBs immediately). In other words, the only relevant
	 * path is mmu_spte_update(), and it checks
	 * SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE instead of
	 * PT_WRITABLE_MASK, so it no longer depends on PT_WRITABLE_MASK.
	 */
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_tlb_flush = 0;
	kvm_pfn_t pfn;
	struct kvm_mmu_page *sp;

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		sp = page_header(__pa(sptep));
		pfn = spte_to_pfn(*sptep);

		/*
		 * We cannot do huge page mapping for indirect shadow pages,
		 * which are found on the last rmap (level = 1) when not using
		 * tdp; such shadow pages are synced with the page table in
		 * the guest, and the guest page table uses a 4K page size
		 * mapping if the indirect sp has level = 1.
		 */
		if (sp->role.direct &&
			!kvm_is_reserved_pfn(pfn) &&
			PageTransCompoundMap(pfn_to_page(pfn))) {
			drop_spte(kvm, sptep);
			need_tlb_flush = 1;
			goto restart;
		}
	}

	return need_tlb_flush;
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/* FIXME: const-ify all uses of struct kvm_memory_slot. */
	spin_lock(&kvm->mmu_lock);
	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
			 kvm_mmu_zap_collapsible_spte, true);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				   struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
	spin_unlock(&kvm->mmu_lock);

	lockdep_assert_held(&kvm->slots_lock);

	/*
	 * It's also safe to flush TLBs out of mmu lock here, as currently
	 * this function is only used for dirty logging, in which case
	 * flushing TLB out of mmu lock also guarantees no dirty pages will
	 * be lost in dirty_bitmap.
	 */
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);

void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
					struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
					false);
	spin_unlock(&kvm->mmu_lock);

	/* see kvm_mmu_slot_remove_write_access */
	lockdep_assert_held(&kvm->slots_lock);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);

void kvm_mmu_slot_set_dirty(struct kvm *kvm,
			    struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
	spin_unlock(&kvm->mmu_lock);

	lockdep_assert_held(&kvm->slots_lock);

	/* see kvm_mmu_slot_leaf_clear_dirty */
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);

#define BATCH_ZAP_PAGES	10
static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
	int batch = 0;

restart:
	list_for_each_entry_safe_reverse(sp, node,
	      &kvm->arch.active_mmu_pages, link) {
		int ret;

		/*
		 * No obsolete page exists before a newly created page,
		 * since active_mmu_pages is a FIFO list.
		 */
		if (!is_obsolete_sp(kvm, sp))
			break;

		/*
		 * Since we walk the list in reverse and invalid pages are
		 * moved to its head, skipping invalid pages avoids walking
		 * the list forever.
		 */
		if (sp->role.invalid)
			continue;

		/*
		 * No need to flush the tlb, since we only zap sps with an
		 * invalid generation number.
		 */
		if (batch >= BATCH_ZAP_PAGES &&
		      cond_resched_lock(&kvm->mmu_lock)) {
			batch = 0;
			goto restart;
		}

		ret = kvm_mmu_prepare_zap_page(kvm, sp,
				&kvm->arch.zapped_obsolete_pages);
		batch += ret;

		if (ret)
			goto restart;
	}

	/*
	 * We should flush the tlb before freeing the page tables, since
	 * lockless walkers may still be using the pages.
	 */
	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}

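/*
 * For reference: obsolescence is a generation check. is_obsolete_sp(),
 * defined earlier in this file, is essentially
 *	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
 * so bumping kvm->arch.mmu_valid_gen (below) instantly obsoletes every
 * existing shadow page without touching them individually.
 */
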
/*
 * Fast invalidate all shadow pages and use lock-break technique
 * to zap obsolete pages.
 *
 * It's required when a memslot is being deleted or the VM is being
 * destroyed; in these cases, we must ensure that the KVM MMU does
 * not use any resource of the slot being deleted (or of any slot)
 * after this function returns.
 */
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
{
	spin_lock(&kvm->mmu_lock);
	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
	kvm->arch.mmu_valid_gen++;

	/*
	 * Notify all vcpus to reload their shadow page tables
	 * and flush the TLB. Then all vcpus will switch to a new
	 * shadow page table with the new mmu_valid_gen.
	 *
	 * Note: we must do this under the protection of mmu-lock;
	 * otherwise, a vcpu could purge a shadow page but miss the
	 * tlb flush.
	 */
	kvm_reload_remote_mmus(kvm);

	kvm_zap_obsolete_pages(kvm);
	spin_unlock(&kvm->mmu_lock);
}

static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}

void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
{
	/*
	 * The very rare case: if the generation number wraps around,
	 * zap all shadow pages.
	 */
	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
		printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
		kvm_mmu_invalidate_zap_all_pages(kvm);
	}
}

static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct kvm *kvm;
	int nr_to_scan = sc->nr_to_scan;
	unsigned long freed = 0;

	spin_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int idx;
		LIST_HEAD(invalid_list);

		/*
		 * Never scan more than sc->nr_to_scan VM instances.
		 * In practice this condition is never hit, since we do not
		 * try to shrink more than one VM and it is very unlikely
		 * to see !n_used_mmu_pages so many times.
		 */
		if (!nr_to_scan--)
			break;
		/*
		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
		 * here. We may skip a VM instance erroneously, but we do not
		 * want to shrink a VM that only started to populate its MMU
		 * anyway.
		 */
		if (!kvm->arch.n_used_mmu_pages &&
		    !kvm_has_zapped_obsolete_pages(kvm))
			continue;

		idx = srcu_read_lock(&kvm->srcu);
		spin_lock(&kvm->mmu_lock);

		if (kvm_has_zapped_obsolete_pages(kvm)) {
			kvm_mmu_commit_zap_page(kvm,
			      &kvm->arch.zapped_obsolete_pages);
			goto unlock;
		}

		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
			freed++;
		kvm_mmu_commit_zap_page(kvm, &invalid_list);

unlock:
		spin_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);

		/*
		 * unfair on small ones
		 * per-vm shrinkers cry out
		 * sadness comes quickly
		 */
		list_move_tail(&kvm->vm_list, &vm_list);
		break;
	}

	spin_unlock(&kvm_lock);
	return freed;
}

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
	.count_objects = mmu_shrink_count,
	.scan_objects = mmu_shrink_scan,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	if (pte_list_desc_cache)
		kmem_cache_destroy(pte_list_desc_cache);
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
}

int kvm_mmu_module_init(void)
{
	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
					    sizeof(struct pte_list_desc),
					    0, 0, NULL);
	if (!pte_list_desc_cache)
		goto nomem;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
		goto nomem;

	register_shrinker(&mmu_shrinker);

	return 0;

nomem:
	mmu_destroy_caches();
	return -ENOMEM;
}

/*
 * Calculate the number of mmu pages needed for kvm.
 */
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
	unsigned int nr_mmu_pages;
	unsigned int nr_pages = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);

		kvm_for_each_memslot(memslot, slots)
			nr_pages += memslot->npages;
	}

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages,
			   (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

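/*
 * Worked example (illustrative): a guest with 4GiB of memory backed by
 * 4KiB pages has nr_pages = 1048576; with KVM_PERMILLE_MMU_PAGES = 20
 * (its value at the time of writing) this yields
 *	1048576 * 20 / 1000 = 20971
 * shadow pages, well above the KVM_MIN_ALLOC_MMU_PAGES floor of 64.
 */
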
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}

void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	unregister_shrinker(&mmu_shrinker);
	mmu_audit_disable();
}