Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.15-rc1 (2026 lines, 54 kB)
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
#include <asm/system_misc.h>

#include "trace.h"

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
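/*
 * Worked example (assuming 4 KiB pages and a 512-entry level-1 table
 * of 8-byte descriptors): PTRS_PER_PGD * sizeof(pgd_t) == 4096, so
 * hyp_pgd_order is 0 and the Hyp PGD fits in a single page. The order
 * only grows when the level-1 table needs more than one page.
 */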
#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
	__kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
	__kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
	__kvm_flush_dcache_pud(pud);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}

/**
 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 * @kvm:	pointer to kvm structure.
 * @addr:	IPA
 * @pmd:	pmd pointer for IPA
 *
 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
 * pages in the range dirty.
 */
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
	if (!pmd_thp_or_huge(*pmd))
		return;

	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	put_page(virt_to_page(pmd));
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(PGALLOC_GFP);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}
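/*
 * Reference counting convention for table pages: installing an entry
 * takes a get_page() on the page backing the table, and clearing an
 * entry drops it with put_page(). When the count falls back to the
 * bare allocation reference the table is empty, and the clear_*_entry()
 * helpers below unlink and free it.
 */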
static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
	pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
	stage2_pgd_clear(pgd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	stage2_pud_free(pud_table);
	put_page(virt_to_page(pgd));
}

static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
	VM_BUG_ON(stage2_pud_huge(*pud));
	stage2_pud_clear(pud);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	stage2_pmd_free(pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	VM_BUG_ON(pmd_thp_or_huge(*pmd));
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 */
static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t start_addr = addr;
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			pte_t old_pte = *pte;

			kvm_set_pte(pte, __pte(0));
			kvm_tlb_flush_vmid_ipa(kvm, addr);

			/* No need to invalidate the cache for device mappings */
			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
				kvm_flush_dcache_pte(old_pte);

			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (stage2_pte_table_empty(start_pte))
		clear_stage2_pmd_entry(kvm, pmd, start_addr);
}

static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = stage2_pmd_offset(pud, addr);
	do {
		next = stage2_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd)) {
				pmd_t old_pmd = *pmd;

				pmd_clear(pmd);
				kvm_tlb_flush_vmid_ipa(kvm, addr);

				kvm_flush_dcache_pmd(old_pmd);

				put_page(virt_to_page(pmd));
			} else {
				unmap_stage2_ptes(kvm, pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);

	if (stage2_pmd_table_empty(start_pmd))
		clear_stage2_pud_entry(kvm, pud, start_addr);
}

static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pud_t *pud, *start_pud;

	start_pud = pud = stage2_pud_offset(pgd, addr);
	do {
		next = stage2_pud_addr_end(addr, end);
		if (!stage2_pud_none(*pud)) {
			if (stage2_pud_huge(*pud)) {
				pud_t old_pud = *pud;

				stage2_pud_clear(pud);
				kvm_tlb_flush_vmid_ipa(kvm, addr);
				kvm_flush_dcache_pud(old_pud);
				put_page(virt_to_page(pud));
			} else {
				unmap_stage2_pmds(kvm, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);

	if (stage2_pud_table_empty(start_pud))
		clear_stage2_pgd_entry(kvm, pgd, start_addr);
}

/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm:   The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	assert_spin_locked(&kvm->mmu_lock);
	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
	do {
		/*
		 * Make sure the page table is still active, as another thread
		 * could have possibly freed the page table, while we released
		 * the lock.
		 */
		if (!READ_ONCE(kvm->arch.pgd))
			break;
		next = stage2_pgd_addr_end(addr, end);
		if (!stage2_pgd_none(*pgd))
			unmap_stage2_puds(kvm, pgd, addr, next);
		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (next != end)
			cond_resched_lock(&kvm->mmu_lock);
	} while (pgd++, addr = next, addr != end);
}
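/*
 * unmap_stage2_range() serves both the MMU notifier unmap path (via
 * kvm_unmap_hva_handler() further down) and full stage-2 teardown,
 * where kvm_free_stage2_pgd() calls it over the whole
 * 0..KVM_PHYS_SIZE range.
 */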
static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
			kvm_flush_dcache_pte(*pte);
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = stage2_pmd_offset(pud, addr);
	do {
		next = stage2_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd))
				kvm_flush_dcache_pmd(*pmd);
			else
				stage2_flush_ptes(kvm, pmd, addr, next);
		}
	} while (pmd++, addr = next, addr != end);
}

static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = stage2_pud_offset(pgd, addr);
	do {
		next = stage2_pud_addr_end(addr, end);
		if (!stage2_pud_none(*pud)) {
			if (stage2_pud_huge(*pud))
				kvm_flush_dcache_pud(*pud);
			else
				stage2_flush_pmds(kvm, pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
	phys_addr_t next;
	pgd_t *pgd;

	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
	do {
		next = stage2_pgd_addr_end(addr, end);
		stage2_flush_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}
/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void clear_hyp_pgd_entry(pgd_t *pgd)
{
	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
	pgd_clear(pgd);
	pud_free(NULL, pud_table);
	put_page(virt_to_page(pgd));
}

static void clear_hyp_pud_entry(pud_t *pud)
{
	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
	VM_BUG_ON(pud_huge(*pud));
	pud_clear(pud);
	pmd_free(NULL, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_hyp_pmd_entry(pmd_t *pmd)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	VM_BUG_ON(pmd_thp_or_huge(*pmd));
	pmd_clear(pmd);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}

static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			kvm_set_pte(pte, __pte(0));
			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (hyp_pte_table_empty(start_pte))
		clear_hyp_pmd_entry(pmd);
}

static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		/* Hyp doesn't use huge pmds */
		if (!pmd_none(*pmd))
			unmap_hyp_ptes(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);

	if (hyp_pmd_table_empty(start_pmd))
		clear_hyp_pud_entry(pud);
}

static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	pud_t *pud, *start_pud;

	start_pud = pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		/* Hyp doesn't use huge puds */
		if (!pud_none(*pud))
			unmap_hyp_pmds(pud, addr, next);
	} while (pud++, addr = next, addr != end);

	if (hyp_pud_table_empty(start_pud))
		clear_hyp_pgd_entry(pgd);
}

static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	/*
	 * We don't unmap anything from HYP, except at the hyp tear down.
	 * Hence, we don't have to invalidate the TLBs here.
	 */
	pgd = pgdp + pgd_index(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!pgd_none(*pgd))
			unmap_hyp_puds(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 * VMALLOC_START to VMALLOC_END).
 *
 * boot_hyp_pgd should only map two pages for the init code.
 */
void free_hyp_pgds(void)
{
	unsigned long addr;

	mutex_lock(&kvm_hyp_pgd_mutex);

	if (boot_hyp_pgd) {
		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd) {
		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);

		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
		hyp_pgd = NULL;
	}
	if (merged_hyp_pgd) {
		clear_page(merged_hyp_pgd);
		free_page((unsigned long)merged_hyp_pgd);
		merged_hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL, addr);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			pmd_populate_kernel(NULL, pmd, pte);
			get_page(virt_to_page(pmd));
			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int ret;

	addr = start;
	do {
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				return -ENOMEM;
			}
			pud_populate(NULL, pud, pmd);
			get_page(virt_to_page(pud));
			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
		}

		next = pud_addr_end(addr, end);
		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + pgd_index(addr);

		if (pgd_none(*pgd)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				err = -ENOMEM;
				goto out;
			}
			pgd_populate(NULL, pgd, pud);
			get_page(virt_to_page(pgd));
			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, virt_addr,
					    virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    prot);
		if (err)
			return err;
	}

	return 0;
}
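/*
 * Typical use, shown here only as an illustration: the arch init code
 * duplicates the kernel's HYP text into the Hyp tables along the lines
 * of
 *
 *	create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
 *			    kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
 */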
/**
 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 * @from:	The kernel start VA of the range
 * @to:		The kernel end VA of the range (exclusive)
 * @phys_addr:	The physical start address which gets mapped
 *
 * The resulting HYP VA is the same as the kernel VA, modulo
 * HYP_PAGE_OFFSET.
 */
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	/* Check for a valid kernel IO mapping */
	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
		return -EINVAL;

	return __create_hyp_mappings(hyp_pgd, start, end,
				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates only the stage-2 HW PGD level table(s) (can support either full
 * 40-bit input addresses or limited to 32-bit input addresses). Clears the
 * allocated pages.
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	pgd_t *pgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	/* Allocate the HW PGD, making sure that each page gets its own refcount */
	pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
	if (!pgd)
		return -ENOMEM;

	kvm->arch.pgd = pgd;
	return 0;
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	down_read(&current->mm->mmap_sem);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	up_read(&current->mm->mmap_sem);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
	}
	spin_unlock(&kvm->mmu_lock);

	/* Free the HW pgd, one page at a time */
	if (pgd)
		free_pages_exact(pgd, S2_PGD_SIZE);
}

static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
	if (WARN_ON(stage2_pgd_none(*pgd))) {
		if (!cache)
			return NULL;
		pud = mmu_memory_cache_alloc(cache);
		stage2_pgd_populate(pgd, pud);
		get_page(virt_to_page(pgd));
	}

	return stage2_pud_offset(pgd, addr);
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(kvm, cache, addr);
	if (!pud)
		return NULL;

	if (stage2_pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		stage2_pud_populate(pud, pmd);
		get_page(virt_to_page(pud));
	}

	return stage2_pmd_offset(pud, addr);
}

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	/*
	 * Mapping in huge pages should only happen through a fault.  If a
	 * page is merged into a transparent huge page, the individual
	 * subpages of that huge page should be unmapped through MMU
	 * notifiers before we get here.
	 *
	 * Merging of CompoundPages is not supported; they should instead be
	 * split first, unmapped, merged, and mapped back in on-demand.
	 */
	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));

	old_pmd = *pmd;
	if (pmd_present(old_pmd)) {
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		get_page(virt_to_page(pmd));
	}

	kvm_set_pmd(pmd, *new_pmd);
	return 0;
}
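/*
 * In the function above, a present old PMD means an existing block
 * mapping for the same PFN is being replaced in place (e.g. a
 * permission upgrade, as the VM_BUG_ON checks), so only a TLB flush is
 * needed; installing a brand new entry instead takes a reference on
 * the page backing the PMD table.
 */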
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte,
			  unsigned long flags)
{
	pmd_t *pmd;
	pte_t *pte, old_pte;
	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;

	VM_BUG_ON(logging_active && !cache);

	/* Create stage-2 page table mapping - Levels 0 and 1 */
	pmd = stage2_get_pmd(kvm, cache, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/*
	 * When dirty page logging is active, dissolve any huge PMD first,
	 * then continue on to allocate a page-level mapping.
	 */
	if (logging_active)
		stage2_dissolve_pmd(kvm, addr, pmd);

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		pmd_populate_kernel(NULL, pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	if (pte_present(old_pte)) {
		kvm_set_pte(pte, __pte(0));
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		get_page(virt_to_page(pte));
	}

	kvm_set_pte(pte, *new_pte);
	return 0;
}

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
	if (pte_young(*pte)) {
		*pte = pte_mkold(*pte);
		return 1;
	}
	return 0;
}
#else
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
	return __ptep_test_and_clear_young(pte);
}
#endif

static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
{
	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);

		if (writable)
			pte = kvm_s2pte_mkwrite(pte);

		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
					     KVM_NR_MEM_OBJS);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte,
				     KVM_S2PTE_FLAG_IS_IOMAP);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}
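/*
 * kvm_phys_addr_ioremap() is used by kvm_arch_prepare_memory_region()
 * below to map VM_PFNMAP regions (device memory) up front, one page at
 * a time, with PAGE_S2_DEVICE attributes.
 */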
static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;
	gfn_t gfn = *ipap >> PAGE_SHIFT;

	if (PageTransCompoundMap(pfn_to_page(pfn))) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page.  However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page.  We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*ipap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}
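/*
 * Worked example for the adjustment above (assuming 4 KiB pages and
 * 2 MiB PMDs, so mask == 0x1ff): a fault on pfn 0x1234 has
 * (pfn & mask) == 0x34, so the head page is pfn 0x1200 and the IPA is
 * rounded down to the matching 2 MiB boundary by PMD_MASK. The
 * VM_BUG_ON guarantees that gfn and pfn are offset identically within
 * the huge page, so both roundings stay in sync.
 */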
static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_trap_is_iabt(vcpu))
		return false;

	return kvm_vcpu_dabt_iswrite(vcpu);
}

/**
 * stage2_wp_ptes - write protect PMD range
 * @pmd:	pointer to pmd entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			if (!kvm_s2pte_readonly(pte))
				kvm_set_s2pte_readonly(pte);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

/**
 * stage2_wp_pmds - write protect PUD range
 * @pud:	pointer to pud entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = stage2_pmd_offset(pud, addr);

	do {
		next = stage2_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd)) {
				if (!kvm_s2pmd_readonly(pmd))
					kvm_set_s2pmd_readonly(pmd);
			} else {
				stage2_wp_ptes(pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);
}

/**
 * stage2_wp_puds - write protect PGD range
 * @pgd:	pointer to pgd entry
 * @addr:	range start address
 * @end:	range end address
 *
 * Process PUD entries, for a huge PUD we cause a panic.
 */
static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = stage2_pud_offset(pgd, addr);
	do {
		next = stage2_pud_addr_end(addr, end);
		if (!stage2_pud_none(*pud)) {
			/* TODO:PUD not supported, revisit later if supported */
			BUG_ON(stage2_pud_huge(*pud));
			stage2_wp_pmds(pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}

/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @kvm:	The KVM pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
	pgd_t *pgd;
	phys_addr_t next;

	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
	do {
		/*
		 * Release kvm_mmu_lock periodically if the memory region is
		 * large. Otherwise, we may see kernel panics with
		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
		 * will also starve other vCPUs. We have to also make sure
		 * that the page tables are not freed while we released
		 * the lock.
		 */
		cond_resched_lock(&kvm->mmu_lock);
		if (!READ_ONCE(kvm->arch.pgd))
			break;
		next = stage2_pgd_addr_end(addr, end);
		if (stage2_pgd_present(*pgd))
			stage2_wp_puds(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write-protects the associated PTEs. The
 * caller must hold kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(kvm, start, end);
}
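/*
 * Example for the function above: with gfn_offset 64 and mask 0b1111,
 * __ffs(mask) == 0 and __fls(mask) == 3, so gfns base_gfn + 64 through
 * base_gfn + 67 are write-protected in a single stage2_wp_range() call.
 */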
/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * dirty pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
				      unsigned long size)
{
	__coherent_cache_guest_page(vcpu, pfn, size);
}

static void kvm_send_hwpoison_signal(unsigned long address,
				     struct vm_area_struct *vma)
{
	siginfo_t info;

	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_MCEERR_AR;
	info.si_addr = (void __user *)address;

	if (is_vm_hugetlb_page(vma))
		info.si_addr_lsb = huge_page_shift(hstate_vma(vma));
	else
		info.si_addr_lsb = PAGE_SHIFT;

	send_sig_info(SIGBUS, &info, current);
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	kvm_pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long flags = 0;

	write_fault = kvm_is_write_fault(vcpu);
	if (fault_status == FSC_PERM && !write_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma) && !logging_active) {
		hugetlb = true;
		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
	} else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment for userspace and IPA cannot be mapped using
		 * block descriptors even if the pages belong to a THP for
		 * the process, because the stage-2 block descriptor will
		 * cover more than a single THP and we lose atomicity for
		 * unmapping, updates, and splits of the THP or other pages
		 * in the stage-2 block range.
		 */
		if ((memslot->userspace_addr & ~PMD_MASK) !=
		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
			force_pte = true;
	}
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
				     KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to gets unmapped before we have a
	 * chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		mem_type = PAGE_S2_DEVICE;
		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
	} else if (logging_active) {
		/*
		 * Faults on pages in a memslot with logging enabled
		 * should not be mapped with huge pages (it introduces churn
		 * and performance degradation), so force a pte mapping.
		 */
		force_pte = true;
		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;

		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		if (!write_fault)
			writable = false;
	}

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	if (!hugetlb && !force_pte)
		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);

	if (hugetlb) {
		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
		new_pmd = pmd_mkhuge(new_pmd);
		if (writable) {
			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, mem_type);

		if (writable) {
			new_pte = kvm_s2pte_mkwrite(new_pte);
			kvm_set_pfn_dirty(pfn);
			mark_page_dirty(kvm, gfn);
		}
		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE);
		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret;
}

/*
 * Resolve the access fault by making the page young again.
 * Note that because the faulting entry is guaranteed not to be
 * cached in the TLB, we don't need to invalidate anything.
 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
 * so there is no need for atomic (pte|pmd)_mkyoung operations.
 */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pmd_t *pmd;
	pte_t *pte;
	kvm_pfn_t pfn;
	bool pfn_valid = false;

	trace_kvm_access_fault(fault_ipa);

	spin_lock(&vcpu->kvm->mmu_lock);

	pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
	if (!pmd || pmd_none(*pmd))	/* Nothing there */
		goto out;

	if (pmd_thp_or_huge(*pmd)) {	/* THP, HugeTLB */
		*pmd = pmd_mkyoung(*pmd);
		pfn = pmd_pfn(*pmd);
		pfn_valid = true;
		goto out;
	}

	pte = pte_offset_kernel(pmd, fault_ipa);
	if (pte_none(*pte))		/* Nothing there either */
		goto out;

	*pte = pte_mkyoung(*pte);	/* Just a page... */
	pfn = pte_pfn(*pte);
	pfn_valid = true;
out:
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
}
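/*
 * handle_access_fault() is the flip side of the aging handlers further
 * down: kvm_age_hva_handler() clears the access flag with the
 * *_test_and_clear_young() helpers, and the next guest access then
 * traps with FSC_ACCESS and is resolved here by marking the entry
 * young again.
 */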
/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 * @run:	the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that
 * the guest simply needs more memory and we must allocate an appropriate
 * page, or that the guest tried to access I/O memory, which is emulated by
 * user space. The distinction is based on the IPA causing the fault and
 * whether this memory region has been registered as standard RAM by user
 * space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);

	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	/* Synchronous External Abort? */
	if (kvm_vcpu_dabt_isextabt(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
			return 1;

		if (unlikely(!is_iabt)) {
			kvm_inject_vabt(vcpu);
			return 1;
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
	    fault_status != FSC_ACCESS) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_hsr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
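		/*
		 * For example, a faulting VA ending in 0x234 turns a
		 * page-aligned fault_ipa of 0x40001000 into the exact
		 * fault address 0x40001234 (addresses are illustrative).
		 */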
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);

	if (fault_status == FSC_ACCESS) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm,
					    gpa_t gpa, u64 size,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gpa;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
	}

	return ret;
}
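/*
 * The per-range handler results are OR-ed together, so any handler
 * reporting "young" (1) makes the whole walk report young; the unmap
 * and set-pte handlers below always return 0.
 */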
static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	unmap_stage2_range(kvm, gpa, size);
	return 0;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	unsigned long end = hva + PAGE_SIZE;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva(hva);
	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pte_t *pte = (pte_t *)data;

	WARN_ON(size != PAGE_SIZE);
	/*
	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
	 * flag clear because MMU notifiers will have unmapped a huge PMD before
	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
	 * therefore stage2_set_pte() never needs to clear out a huge PMD
	 * through this calling path.
	 */
	stage2_set_pte(kvm, NULL, gpa, pte, 0);
	return 0;
}


void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	trace_kvm_set_spte_hva(hva);
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}

static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pmd_t *pmd;
	pte_t *pte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
	pmd = stage2_get_pmd(kvm, NULL, gpa);
	if (!pmd || pmd_none(*pmd))	/* Nothing there */
		return 0;

	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
		return stage2_pmdp_test_and_clear_young(pmd);

	pte = pte_offset_kernel(pmd, gpa);
	if (pte_none(*pte))
		return 0;

	return stage2_ptep_test_and_clear_young(pte);
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pmd_t *pmd;
	pte_t *pte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
	pmd = stage2_get_pmd(kvm, NULL, gpa);
	if (!pmd || pmd_none(*pmd))	/* Nothing there */
		return 0;

	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
		return pmd_young(*pmd);

	pte = pte_offset_kernel(pmd, gpa);
	if (!pte_none(*pte))		/* Just a page... */
		return pte_young(*pte);

	return 0;
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;
	trace_kvm_age_hva(start, end);
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.pgd)
		return 0;
	trace_kvm_test_age_hva(hva);
	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	if (__kvm_cpu_uses_extended_idmap())
		return virt_to_phys(merged_hyp_pgd);
	else
		return virt_to_phys(hyp_pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(pgd_t *pgd)
{
	int err;

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(pgd,
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_info("HYP VA range: %lx:%lx\n",
		 kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start <  kern_hyp_va(~0UL) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space,
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
	if (!hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	if (__kvm_cpu_uses_extended_idmap()) {
		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
							 hyp_pgd_order);
		if (!boot_hyp_pgd) {
			kvm_err("Hyp boot PGD not allocated\n");
			err = -ENOMEM;
			goto out;
		}

		err = kvm_map_idmap_text(boot_hyp_pgd);
		if (err)
			goto out;

		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		if (!merged_hyp_pgd) {
			kvm_err("Failed to allocate extra HYP pgd\n");
			goto out;
		}
		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
				    hyp_idmap_start);
	} else {
		err = kvm_map_idmap_text(hyp_pgd);
		if (err)
			goto out;
	}

	return 0;
out:
	free_hyp_pgds();
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   const struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
		kvm_mmu_wp_memory_region(kvm, mem->slot);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
			change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the
	 * IPA space addressable by the KVM guest.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
		return -EFAULT;

	down_read(&current->mm->mmap_sem);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = mem->guest_phys_addr +
				    (vm_start - mem->userspace_addr);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
						    vm_end - vm_start,
						    writable);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	spin_lock(&kvm->mmu_lock);
	if (ret)
		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
	else
		stage2_flush_memslot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
out:
	up_read(&current->mm->mmap_sem);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
			   struct kvm_memory_slot *dont)
{
}

int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
			    unsigned long npages)
{
	return 0;
}

void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_free_stage2_pgd(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(kvm, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = vcpu_get_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		vcpu_set_hcr(vcpu, hcr | HCR_TVM);
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}