Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.6-rc7 (1952 lines, 52 kB)
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
#define kvm_pud_huge(_x)	pud_huge(_x)

#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	/*
	 * This function also gets called when dealing with HYP page
	 * tables. As HYP doesn't have an associated struct kvm (and
	 * the HYP page tables are fairly static), we don't do
	 * anything there.
	 */
	if (kvm)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
	__kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
	__kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
	__kvm_flush_dcache_pud(pud);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}

/**
 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 * @kvm:	pointer to kvm structure.
 * @addr:	IPA
 * @pmd:	pmd pointer for IPA
 *
 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
 * pages in the range dirty.
 */
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
	if (!kvm_pmd_huge(*pmd))
		return;

	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	put_page(virt_to_page(pmd));
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(PGALLOC_GFP);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
	pgd_clear(pgd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pud_free(NULL, pud_table);
	put_page(virt_to_page(pgd));
}

static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
	pmd_t *pmd_table = pmd_offset(pud, 0);
	VM_BUG_ON(pud_huge(*pud));
	pud_clear(pud);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pmd_free(NULL, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	VM_BUG_ON(kvm_pmd_huge(*pmd));
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 */
static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
		       phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t start_addr = addr;
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			pte_t old_pte = *pte;

			kvm_set_pte(pte, __pte(0));
			kvm_tlb_flush_vmid_ipa(kvm, addr);

			/* No need to invalidate the cache for device mappings */
			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
				kvm_flush_dcache_pte(old_pte);

			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (kvm_pte_table_empty(kvm, start_pte))
		clear_pmd_entry(kvm, pmd, start_addr);
}

static void unmap_pmds(struct kvm *kvm, pud_t *pud,
		       phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = pmd_offset(pud, addr);
	do {
		next = kvm_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (kvm_pmd_huge(*pmd)) {
				pmd_t old_pmd = *pmd;

				pmd_clear(pmd);
				kvm_tlb_flush_vmid_ipa(kvm, addr);

				kvm_flush_dcache_pmd(old_pmd);

				put_page(virt_to_page(pmd));
			} else {
				unmap_ptes(kvm, pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);

	if (kvm_pmd_table_empty(kvm, start_pmd))
		clear_pud_entry(kvm, pud, start_addr);
}

static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
		       phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pud_t *pud, *start_pud;

	start_pud = pud = pud_offset(pgd, addr);
	do {
		next = kvm_pud_addr_end(addr, end);
		if (!pud_none(*pud)) {
			if (pud_huge(*pud)) {
				pud_t old_pud = *pud;

				pud_clear(pud);
				kvm_tlb_flush_vmid_ipa(kvm, addr);

				kvm_flush_dcache_pud(old_pud);

				put_page(virt_to_page(pud));
			} else {
				unmap_pmds(kvm, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);

	if (kvm_pud_table_empty(kvm, start_pud))
		clear_pgd_entry(kvm, pgd, start_addr);
}


static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
			phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	pgd = pgdp + kvm_pgd_index(addr);
	do {
		next = kvm_pgd_addr_end(addr, end);
		if (!pgd_none(*pgd))
			unmap_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
			kvm_flush_dcache_pte(*pte);
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = pmd_offset(pud, addr);
	do {
		next = kvm_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (kvm_pmd_huge(*pmd))
				kvm_flush_dcache_pmd(*pmd);
			else
				stage2_flush_ptes(kvm, pmd, addr, next);
		}
	} while (pmd++, addr = next, addr != end);
}

static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = pud_offset(pgd, addr);
	do {
		next = kvm_pud_addr_end(addr, end);
		if (!pud_none(*pud)) {
			if (pud_huge(*pud))
				kvm_flush_dcache_pud(*pud);
			else
				stage2_flush_pmds(kvm, pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
	phys_addr_t next;
	pgd_t *pgd;

	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
	do {
		next = kvm_pgd_addr_end(addr, end);
		stage2_flush_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_boot_hyp_pgd - free HYP boot page tables
 *
 * Free the HYP boot page tables. The bounce page is also freed.
 */
void free_boot_hyp_pgd(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);

	if (boot_hyp_pgd) {
		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd)
		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 * VMALLOC_START to VMALLOC_END).
 *
 * boot_hyp_pgd should only map two pages for the init code.
 */
void free_hyp_pgds(void)
{
	unsigned long addr;

	free_boot_hyp_pgd();

	mutex_lock(&kvm_hyp_pgd_mutex);

	if (hyp_pgd) {
		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);

		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
		hyp_pgd = NULL;
	}
	if (merged_hyp_pgd) {
		clear_page(merged_hyp_pgd);
		free_page((unsigned long)merged_hyp_pgd);
		merged_hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL, addr);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			pmd_populate_kernel(NULL, pmd, pte);
			get_page(virt_to_page(pmd));
			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int ret;

	addr = start;
	do {
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				return -ENOMEM;
			}
			pud_populate(NULL, pud, pmd);
			get_page(virt_to_page(pud));
			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
		}

		next = pud_addr_end(addr, end);
		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + pgd_index(addr);

		if (pgd_none(*pgd)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				err = -ENOMEM;
				goto out;
			}
			pgd_populate(NULL, pgd, pud);
			get_page(virt_to_page(pgd));
			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, virt_addr,
					    virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    PAGE_HYP);
		if (err)
			return err;
	}

	return 0;
}

/**
 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 * @from:	The kernel start VA of the range
 * @to:		The kernel end VA of the range (exclusive)
 * @phys_addr:	The physical start address which gets mapped
 *
 * The resulting HYP VA is the same as the kernel VA, modulo
 * HYP_PAGE_OFFSET.
 */
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	/* Check for a valid kernel IO mapping */
	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
		return -EINVAL;

	return __create_hyp_mappings(hyp_pgd, start, end,
				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}

/* Free the HW pgd, one page at a time */
static void kvm_free_hwpgd(void *hwpgd)
{
	free_pages_exact(hwpgd, kvm_get_hwpgd_size());
}

/* Allocate the HW PGD, making sure that each page gets its own refcount */
static void *kvm_alloc_hwpgd(void)
{
	unsigned int size = kvm_get_hwpgd_size();

	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates only the stage-2 HW PGD level table(s) (can support either full
 * 40-bit input addresses or limited to 32-bit input addresses). Clears the
 * allocated pages.
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	pgd_t *pgd;
	void *hwpgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	hwpgd = kvm_alloc_hwpgd();
	if (!hwpgd)
		return -ENOMEM;

	/* When the kernel uses more levels of page tables than the
	 * guest, we allocate a fake PGD and pre-populate it to point
	 * to the next-level page table, which will be the real
	 * initial page table pointed to by the VTTBR.
	 *
	 * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
	 * the PMD and the kernel will use folded pud.
	 * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
	 * pages.
	 */
	if (KVM_PREALLOC_LEVEL > 0) {
		int i;

		/*
		 * Allocate fake pgd for the page table manipulation macros to
		 * work. This is not used by the hardware and we have no
		 * alignment requirement for this allocation.
		 */
		pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
				GFP_KERNEL | __GFP_ZERO);

		if (!pgd) {
			kvm_free_hwpgd(hwpgd);
			return -ENOMEM;
		}

		/* Plug the HW PGD into the fake one. */
		for (i = 0; i < PTRS_PER_S2_PGD; i++) {
			if (KVM_PREALLOC_LEVEL == 1)
				pgd_populate(NULL, pgd + i,
					     (pud_t *)hwpgd + i * PTRS_PER_PUD);
			else if (KVM_PREALLOC_LEVEL == 2)
				pud_populate(NULL, pud_offset(pgd, 0) + i,
					     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
		}
	} else {
		/*
		 * Allocate actual first-level Stage-2 page table used by the
		 * hardware for Stage-2 page table walks.
		 */
		pgd = (pgd_t *)hwpgd;
	}

	kvm_clean_pgd(pgd);
	kvm->arch.pgd = pgd;
	return 0;
}

/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm:   The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	unmap_range(kvm, kvm->arch.pgd, start, size);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 *
 * Note we don't need locking here as this is only called when the VM is
 * destroyed, which can only be done once.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	if (kvm->arch.pgd == NULL)
		return;

	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
	if (KVM_PREALLOC_LEVEL > 0)
		kfree(kvm->arch.pgd);

	kvm->arch.pgd = NULL;
}

static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
	if (WARN_ON(pgd_none(*pgd))) {
		if (!cache)
			return NULL;
		pud = mmu_memory_cache_alloc(cache);
		pgd_populate(NULL, pgd, pud);
		get_page(virt_to_page(pgd));
	}

	return pud_offset(pgd, addr);
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(kvm, cache, addr);
	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return pmd_offset(pud, addr);
}

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	/*
	 * Mapping in huge pages should only happen through a fault. If a
	 * page is merged into a transparent huge page, the individual
	 * subpages of that huge page should be unmapped through MMU
	 * notifiers before we get here.
	 *
	 * Merging of CompoundPages is not supported; they should be
	 * split first, unmapped, merged, and mapped back in on-demand.
	 */
	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));

	old_pmd = *pmd;
	kvm_set_pmd(pmd, *new_pmd);
	if (pmd_present(old_pmd))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pmd));
	return 0;
}

static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte,
			  unsigned long flags)
{
	pmd_t *pmd;
	pte_t *pte, old_pte;
	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;

	VM_BUG_ON(logging_active && !cache);

	/* Create stage-2 page table mapping - Levels 0 and 1 */
	pmd = stage2_get_pmd(kvm, cache, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/*
	 * While dirty page logging - dissolve huge PMD, then continue on to
	 * allocate page.
	 */
	if (logging_active)
		stage2_dissolve_pmd(kvm, addr, pmd);

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		kvm_clean_pte(pte);
		pmd_populate_kernel(NULL, pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	kvm_set_pte(pte, *new_pte);
	if (pte_present(old_pte))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pte));

	return 0;
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);

		if (writable)
			kvm_set_s2pte_writable(&pte);

		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
						KVM_NR_MEM_OBJS);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte,
						KVM_S2PTE_FLAG_IS_IOMAP);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}

static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;
	gfn_t gfn = *ipap >> PAGE_SHIFT;

	if (PageTransCompoundMap(pfn_to_page(pfn))) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*ipap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_trap_is_iabt(vcpu))
		return false;

	return kvm_vcpu_dabt_iswrite(vcpu);
}

/**
 * stage2_wp_ptes - write protect PMD range
 * @pmd:	pointer to pmd entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			if (!kvm_s2pte_readonly(pte))
				kvm_set_s2pte_readonly(pte);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

/**
 * stage2_wp_pmds - write protect PUD range
 * @pud:	pointer to pud entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = pmd_offset(pud, addr);

	do {
		next = kvm_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (kvm_pmd_huge(*pmd)) {
				if (!kvm_s2pmd_readonly(pmd))
					kvm_set_s2pmd_readonly(pmd);
			} else {
				stage2_wp_ptes(pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);
}

/**
 * stage2_wp_puds - write protect PGD range
 * @pgd:	pointer to pgd entry
 * @addr:	range start address
 * @end:	range end address
 *
 * Process PUD entries, for a huge PUD we cause a panic.
 */
static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = pud_offset(pgd, addr);
	do {
		next = kvm_pud_addr_end(addr, end);
		if (!pud_none(*pud)) {
			/* TODO:PUD not supported, revisit later if supported */
			BUG_ON(kvm_pud_huge(*pud));
			stage2_wp_pmds(pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}

/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @kvm:	The KVM pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
	pgd_t *pgd;
	phys_addr_t next;

	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
	do {
		/*
		 * Release kvm_mmu_lock periodically if the memory region is
		 * large. Otherwise, we may see kernel panics with
		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
		 * will also starve other vCPUs.
		 */
		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
			cond_resched_lock(&kvm->mmu_lock);

		next = kvm_pgd_addr_end(addr, end);
		if (pgd_present(*pgd))
			stage2_wp_puds(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks bits set in mask write protects the associated pte's. Caller must
 * acquire kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(kvm, start, end);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * dirty pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
				      unsigned long size, bool uncached)
{
	__coherent_cache_guest_page(vcpu, pfn, size, uncached);
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	kvm_pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool fault_ipa_uncached;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long flags = 0;

	write_fault = kvm_is_write_fault(vcpu);
	if (fault_status == FSC_PERM && !write_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma) && !logging_active) {
		hugetlb = true;
		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
	} else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment for userspace and IPA cannot be mapped using
		 * block descriptors even if the pages belong to a THP for
		 * the process, because the stage-2 block descriptor will
		 * cover more than a single THP and we lose atomicity for
		 * unmapping, updates, and splits of the THP or other pages
		 * in the stage-2 block range.
		 */
		if ((memslot->userspace_addr & ~PMD_MASK) !=
		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
			force_pte = true;
	}
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
				     KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to gets unmapped before we have a
	 * chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		mem_type = PAGE_S2_DEVICE;
		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
	} else if (logging_active) {
		/*
		 * Faults on pages in a memslot with logging enabled
		 * should not be mapped with huge pages (it introduces churn
		 * and performance degradation), so force a pte mapping.
		 */
		force_pte = true;
		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;

		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		if (!write_fault)
			writable = false;
	}

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	if (!hugetlb && !force_pte)
		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);

	fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;

	if (hugetlb) {
		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
		new_pmd = pmd_mkhuge(new_pmd);
		if (writable) {
			kvm_set_s2pmd_writable(&new_pmd);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, mem_type);

		if (writable) {
			kvm_set_s2pte_writable(&new_pte);
			kvm_set_pfn_dirty(pfn);
			mark_page_dirty(kvm, gfn);
		}
		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret;
}

/*
 * Resolve the access fault by making the page young again.
 * Note that because the faulting entry is guaranteed not to be
 * cached in the TLB, we don't need to invalidate anything.
 */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pmd_t *pmd;
	pte_t *pte;
	kvm_pfn_t pfn;
	bool pfn_valid = false;

	trace_kvm_access_fault(fault_ipa);

	spin_lock(&vcpu->kvm->mmu_lock);

	pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
	if (!pmd || pmd_none(*pmd))	/* Nothing there */
		goto out;

	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
		*pmd = pmd_mkyoung(*pmd);
		pfn = pmd_pfn(*pmd);
		pfn_valid = true;
		goto out;
	}

	pte = pte_offset_kernel(pmd, fault_ipa);
	if (pte_none(*pte))		/* Nothing there either */
		goto out;

	*pte = pte_mkyoung(*pte);	/* Just a page... */
	pfn = pte_pfn(*pte);
	pfn_valid = true;
out:
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 * @run:	the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
	    fault_status != FSC_ACCESS) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_hsr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);

	if (fault_status == FSC_ACCESS) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm,
					    gpa_t gpa, void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gpa_t gpa = gfn << PAGE_SHIFT;
			ret |= handler(kvm, gpa, data);
		}
	}

	return ret;
}

static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	unmap_stage2_range(kvm, gpa, PAGE_SIZE);
	return 0;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	unsigned long end = hva + PAGE_SIZE;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva(hva);
	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	/*
	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
	 * flag clear because MMU notifiers will have unmapped a huge PMD before
	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
	 * therefore stage2_set_pte() never needs to clear out a huge PMD
	 * through this calling path.
	 */
	stage2_set_pte(kvm, NULL, gpa, pte, 0);
	return 0;
}


void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	trace_kvm_set_spte_hva(hva);
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}

static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pmd_t *pmd;
	pte_t *pte;

	pmd = stage2_get_pmd(kvm, NULL, gpa);
	if (!pmd || pmd_none(*pmd))	/* Nothing there */
		return 0;

	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
		if (pmd_young(*pmd)) {
			*pmd = pmd_mkold(*pmd);
			return 1;
		}

		return 0;
	}

	pte = pte_offset_kernel(pmd, gpa);
	if (pte_none(*pte))
		return 0;

	if (pte_young(*pte)) {
		*pte = pte_mkold(*pte);	/* Just a page... */
		return 1;
	}

	return 0;
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pmd_t *pmd;
	pte_t *pte;

	pmd = stage2_get_pmd(kvm, NULL, gpa);
	if (!pmd || pmd_none(*pmd))	/* Nothing there */
		return 0;

	if (kvm_pmd_huge(*pmd))		/* THP, HugeTLB */
		return pmd_young(*pmd);

	pte = pte_offset_kernel(pmd, gpa);
	if (!pte_none(*pte))		/* Just a page... */
		return pte_young(*pte);

	return 0;
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	trace_kvm_age_hva(start, end);
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	trace_kvm_test_age_hva(hva);
	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	if (__kvm_cpu_uses_extended_idmap())
		return virt_to_phys(merged_hyp_pgd);
	else
		return virt_to_phys(hyp_pgd);
}

phys_addr_t kvm_mmu_get_boot_httbr(void)
{
	if (__kvm_cpu_uses_extended_idmap())
		return virt_to_phys(merged_hyp_pgd);
	else
		return virt_to_phys(boot_hyp_pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
	boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);

	if (!hyp_pgd || !boot_hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);

	if (err) {
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);
		goto out;
	}

	if (__kvm_cpu_uses_extended_idmap()) {
		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		if (!merged_hyp_pgd) {
			kvm_err("Failed to allocate extra HYP pgd\n");
			goto out;
		}
		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
				    hyp_idmap_start);
		return 0;
	}

	/* Map the very same page at the trampoline VA */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	/* Map the same page again into the runtime page tables */
	err = __create_hyp_mappings(hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	return 0;
out:
	free_hyp_pgds();
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   const struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
		kvm_mmu_wp_memory_region(kvm, mem->slot);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
			change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest IPA space.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = mem->guest_phys_addr +
				    (vm_start - mem->userspace_addr);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
				return -EINVAL;

			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
						    vm_end - vm_start,
						    writable);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		return ret;

	spin_lock(&kvm->mmu_lock);
	if (ret)
		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
	else
		stage2_flush_memslot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
			   struct kvm_memory_slot *dont)
{
}

int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
			    unsigned long npages)
{
	/*
	 * Readonly memslots are not incoherent with the caches by definition,
	 * but in practice, they are used mostly to emulate ROMs or NOR flashes
	 * that the guest may consider devices and hence map as uncached.
	 * To prevent incoherency issues in these cases, tag all readonly
	 * regions as incoherent.
	 */
	if (slot->flags & KVM_MEM_READONLY)
		slot->flags |= KVM_MEMSLOT_INCOHERENT;
	return 0;
}

void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(kvm, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = vcpu_get_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		vcpu_set_hcr(vcpu, hcr | HCR_TVM);
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}
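
For context on how the memslot paths above (kvm_arch_prepare_memory_region and kvm_arch_commit_memory_region) get exercised, here is a minimal userspace sketch that registers a block of guest RAM through the standard KVM ioctl interface. It is not part of mmu.c; the 16 MiB size and the 0x40000000 guest base address are arbitrary example values, and error handling is kept to a minimum.

/* Minimal sketch: registering a memory slot with KVM from userspace.
 * Illustrative only; sizes and the guest physical base are example values.
 */
#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	if (vm_fd < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	/* Anonymous mapping that backs the guest physical range. */
	size_t ram_size = 16 * 1024 * 1024;
	void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ram == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = 0,			/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0x40000000,	/* example guest IPA base */
		.memory_size = ram_size,
		.userspace_addr = (unsigned long)ram,
	};

	/* On ARM this ioctl reaches kvm_arch_prepare_memory_region() and,
	 * once committed, kvm_arch_commit_memory_region() above. */
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		perror("KVM_SET_USER_MEMORY_REGION");
		return 1;
	}
	return 0;
}

Faults taken by the guest inside this slot then flow through kvm_handle_guest_abort() and user_mem_abort() in the listing above; enabling KVM_MEM_LOG_DIRTY_PAGES on the slot is what triggers the write-protection paths (kvm_mmu_wp_memory_region and friends).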