Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.18
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
#include <asm/system_misc.h>

#include "trace.h"

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;

#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define KVM_S2PTE_FLAG_IS_IOMAP    (1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
        return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm: pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
        kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
        __kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
        __kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
        __kvm_flush_dcache_pud(pud);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
        return !pfn_valid(pfn);
}

/**
 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 * @kvm: pointer to kvm structure.
 * @addr: IPA
 * @pmd: pmd pointer for IPA
 *
 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
 * pages in the range dirty.
108 */ 109static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) 110{ 111 if (!pmd_thp_or_huge(*pmd)) 112 return; 113 114 pmd_clear(pmd); 115 kvm_tlb_flush_vmid_ipa(kvm, addr); 116 put_page(virt_to_page(pmd)); 117} 118 119static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 120 int min, int max) 121{ 122 void *page; 123 124 BUG_ON(max > KVM_NR_MEM_OBJS); 125 if (cache->nobjs >= min) 126 return 0; 127 while (cache->nobjs < max) { 128 page = (void *)__get_free_page(PGALLOC_GFP); 129 if (!page) 130 return -ENOMEM; 131 cache->objects[cache->nobjs++] = page; 132 } 133 return 0; 134} 135 136static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 137{ 138 while (mc->nobjs) 139 free_page((unsigned long)mc->objects[--mc->nobjs]); 140} 141 142static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 143{ 144 void *p; 145 146 BUG_ON(!mc || !mc->nobjs); 147 p = mc->objects[--mc->nobjs]; 148 return p; 149} 150 151static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) 152{ 153 pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); 154 stage2_pgd_clear(pgd); 155 kvm_tlb_flush_vmid_ipa(kvm, addr); 156 stage2_pud_free(pud_table); 157 put_page(virt_to_page(pgd)); 158} 159 160static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) 161{ 162 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); 163 VM_BUG_ON(stage2_pud_huge(*pud)); 164 stage2_pud_clear(pud); 165 kvm_tlb_flush_vmid_ipa(kvm, addr); 166 stage2_pmd_free(pmd_table); 167 put_page(virt_to_page(pud)); 168} 169 170static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) 171{ 172 pte_t *pte_table = pte_offset_kernel(pmd, 0); 173 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 174 pmd_clear(pmd); 175 kvm_tlb_flush_vmid_ipa(kvm, addr); 176 pte_free_kernel(NULL, pte_table); 177 put_page(virt_to_page(pmd)); 178} 179 180/* 181 * Unmapping vs dcache management: 182 * 183 * If a guest maps certain memory pages as uncached, all writes will 184 * bypass the data cache and go directly to RAM. However, the CPUs 185 * can still speculate reads (not writes) and fill cache lines with 186 * data. 187 * 188 * Those cache lines will be *clean* cache lines though, so a 189 * clean+invalidate operation is equivalent to an invalidate 190 * operation, because no cache lines are marked dirty. 191 * 192 * Those clean cache lines could be filled prior to an uncached write 193 * by the guest, and the cache coherent IO subsystem would therefore 194 * end up writing old data to disk. 195 * 196 * This is why right after unmapping a page/section and invalidating 197 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure 198 * the IO subsystem will never hit in the cache. 
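
/*
 * Illustrative user-space sketch (not part of this file): the
 * mmu_topup_memory_cache()/mmu_memory_cache_alloc() pair earlier in this file
 * pre-fills a small array of pages while sleeping allocations are still
 * legal, so that table allocations later done under the mmu_lock can never
 * fail or sleep. Names and sizes below are made up for the example.
 */
#include <assert.h>
#include <stdlib.h>

#define CACHE_OBJS 8

struct obj_cache {
        int nobjs;
        void *objects[CACHE_OBJS];
};

/* Fill the cache up front, where failure can still be handled gracefully. */
static int cache_topup(struct obj_cache *c, int min)
{
        while (c->nobjs < min) {
                void *p = malloc(4096);
                if (!p)
                        return -1;
                c->objects[c->nobjs++] = p;
        }
        return 0;
}

/* Inside the critical section, allocation is just a pop and cannot fail. */
static void *cache_alloc(struct obj_cache *c)
{
        assert(c->nobjs > 0);
        return c->objects[--c->nobjs];
}

int main(void)
{
        struct obj_cache cache = { 0 };

        if (cache_topup(&cache, 3) == 0)
                free(cache_alloc(&cache));
        while (cache.nobjs)
                free(cache.objects[--cache.nobjs]);
        return 0;
}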
199 */ 200static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, 201 phys_addr_t addr, phys_addr_t end) 202{ 203 phys_addr_t start_addr = addr; 204 pte_t *pte, *start_pte; 205 206 start_pte = pte = pte_offset_kernel(pmd, addr); 207 do { 208 if (!pte_none(*pte)) { 209 pte_t old_pte = *pte; 210 211 kvm_set_pte(pte, __pte(0)); 212 kvm_tlb_flush_vmid_ipa(kvm, addr); 213 214 /* No need to invalidate the cache for device mappings */ 215 if (!kvm_is_device_pfn(pte_pfn(old_pte))) 216 kvm_flush_dcache_pte(old_pte); 217 218 put_page(virt_to_page(pte)); 219 } 220 } while (pte++, addr += PAGE_SIZE, addr != end); 221 222 if (stage2_pte_table_empty(start_pte)) 223 clear_stage2_pmd_entry(kvm, pmd, start_addr); 224} 225 226static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, 227 phys_addr_t addr, phys_addr_t end) 228{ 229 phys_addr_t next, start_addr = addr; 230 pmd_t *pmd, *start_pmd; 231 232 start_pmd = pmd = stage2_pmd_offset(pud, addr); 233 do { 234 next = stage2_pmd_addr_end(addr, end); 235 if (!pmd_none(*pmd)) { 236 if (pmd_thp_or_huge(*pmd)) { 237 pmd_t old_pmd = *pmd; 238 239 pmd_clear(pmd); 240 kvm_tlb_flush_vmid_ipa(kvm, addr); 241 242 kvm_flush_dcache_pmd(old_pmd); 243 244 put_page(virt_to_page(pmd)); 245 } else { 246 unmap_stage2_ptes(kvm, pmd, addr, next); 247 } 248 } 249 } while (pmd++, addr = next, addr != end); 250 251 if (stage2_pmd_table_empty(start_pmd)) 252 clear_stage2_pud_entry(kvm, pud, start_addr); 253} 254 255static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, 256 phys_addr_t addr, phys_addr_t end) 257{ 258 phys_addr_t next, start_addr = addr; 259 pud_t *pud, *start_pud; 260 261 start_pud = pud = stage2_pud_offset(pgd, addr); 262 do { 263 next = stage2_pud_addr_end(addr, end); 264 if (!stage2_pud_none(*pud)) { 265 if (stage2_pud_huge(*pud)) { 266 pud_t old_pud = *pud; 267 268 stage2_pud_clear(pud); 269 kvm_tlb_flush_vmid_ipa(kvm, addr); 270 kvm_flush_dcache_pud(old_pud); 271 put_page(virt_to_page(pud)); 272 } else { 273 unmap_stage2_pmds(kvm, pud, addr, next); 274 } 275 } 276 } while (pud++, addr = next, addr != end); 277 278 if (stage2_pud_table_empty(start_pud)) 279 clear_stage2_pgd_entry(kvm, pgd, start_addr); 280} 281 282/** 283 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 284 * @kvm: The VM pointer 285 * @start: The intermediate physical base address of the range to unmap 286 * @size: The size of the area to unmap 287 * 288 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 289 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 290 * destroying the VM), otherwise another faulting VCPU may come in and mess 291 * with things behind our backs. 292 */ 293static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 294{ 295 pgd_t *pgd; 296 phys_addr_t addr = start, end = start + size; 297 phys_addr_t next; 298 299 assert_spin_locked(&kvm->mmu_lock); 300 WARN_ON(size & ~PAGE_MASK); 301 302 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 303 do { 304 /* 305 * Make sure the page table is still active, as another thread 306 * could have possibly freed the page table, while we released 307 * the lock. 308 */ 309 if (!READ_ONCE(kvm->arch.pgd)) 310 break; 311 next = stage2_pgd_addr_end(addr, end); 312 if (!stage2_pgd_none(*pgd)) 313 unmap_stage2_puds(kvm, pgd, addr, next); 314 /* 315 * If the range is too large, release the kvm->mmu_lock 316 * to prevent starvation and lockup detector warnings. 
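
/*
 * Illustrative sketch (not part of this file) of the addr_end()-style walk
 * used by unmap_stage2_range() and the other table walkers above: advance
 * through the range in block-sized steps, clamping the last step to 'end'.
 * The 2M block size is an example value, not a statement about any
 * particular page-table configuration.
 */
#include <stdio.h>

typedef unsigned long long ex_addr_t;

#define EX_BLOCK_SIZE (2ULL << 20)
#define EX_BLOCK_MASK (~(EX_BLOCK_SIZE - 1))

static ex_addr_t ex_block_addr_end(ex_addr_t addr, ex_addr_t end)
{
        ex_addr_t boundary = (addr + EX_BLOCK_SIZE) & EX_BLOCK_MASK;

        return boundary < end ? boundary : end;
}

int main(void)
{
        ex_addr_t addr = 0x100000, end = 0x500000, next;

        do {
                next = ex_block_addr_end(addr, end);
                printf("visit [%#llx, %#llx)\n", addr, next);
        } while (addr = next, addr != end);
        return 0;
}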
317 */ 318 if (next != end) 319 cond_resched_lock(&kvm->mmu_lock); 320 } while (pgd++, addr = next, addr != end); 321} 322 323static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, 324 phys_addr_t addr, phys_addr_t end) 325{ 326 pte_t *pte; 327 328 pte = pte_offset_kernel(pmd, addr); 329 do { 330 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) 331 kvm_flush_dcache_pte(*pte); 332 } while (pte++, addr += PAGE_SIZE, addr != end); 333} 334 335static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, 336 phys_addr_t addr, phys_addr_t end) 337{ 338 pmd_t *pmd; 339 phys_addr_t next; 340 341 pmd = stage2_pmd_offset(pud, addr); 342 do { 343 next = stage2_pmd_addr_end(addr, end); 344 if (!pmd_none(*pmd)) { 345 if (pmd_thp_or_huge(*pmd)) 346 kvm_flush_dcache_pmd(*pmd); 347 else 348 stage2_flush_ptes(kvm, pmd, addr, next); 349 } 350 } while (pmd++, addr = next, addr != end); 351} 352 353static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, 354 phys_addr_t addr, phys_addr_t end) 355{ 356 pud_t *pud; 357 phys_addr_t next; 358 359 pud = stage2_pud_offset(pgd, addr); 360 do { 361 next = stage2_pud_addr_end(addr, end); 362 if (!stage2_pud_none(*pud)) { 363 if (stage2_pud_huge(*pud)) 364 kvm_flush_dcache_pud(*pud); 365 else 366 stage2_flush_pmds(kvm, pud, addr, next); 367 } 368 } while (pud++, addr = next, addr != end); 369} 370 371static void stage2_flush_memslot(struct kvm *kvm, 372 struct kvm_memory_slot *memslot) 373{ 374 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 375 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 376 phys_addr_t next; 377 pgd_t *pgd; 378 379 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 380 do { 381 next = stage2_pgd_addr_end(addr, end); 382 stage2_flush_puds(kvm, pgd, addr, next); 383 } while (pgd++, addr = next, addr != end); 384} 385 386/** 387 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 388 * @kvm: The struct kvm pointer 389 * 390 * Go through the stage 2 page tables and invalidate any cache lines 391 * backing memory already mapped to the VM. 
392 */ 393static void stage2_flush_vm(struct kvm *kvm) 394{ 395 struct kvm_memslots *slots; 396 struct kvm_memory_slot *memslot; 397 int idx; 398 399 idx = srcu_read_lock(&kvm->srcu); 400 spin_lock(&kvm->mmu_lock); 401 402 slots = kvm_memslots(kvm); 403 kvm_for_each_memslot(memslot, slots) 404 stage2_flush_memslot(kvm, memslot); 405 406 spin_unlock(&kvm->mmu_lock); 407 srcu_read_unlock(&kvm->srcu, idx); 408} 409 410static void clear_hyp_pgd_entry(pgd_t *pgd) 411{ 412 pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); 413 pgd_clear(pgd); 414 pud_free(NULL, pud_table); 415 put_page(virt_to_page(pgd)); 416} 417 418static void clear_hyp_pud_entry(pud_t *pud) 419{ 420 pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); 421 VM_BUG_ON(pud_huge(*pud)); 422 pud_clear(pud); 423 pmd_free(NULL, pmd_table); 424 put_page(virt_to_page(pud)); 425} 426 427static void clear_hyp_pmd_entry(pmd_t *pmd) 428{ 429 pte_t *pte_table = pte_offset_kernel(pmd, 0); 430 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 431 pmd_clear(pmd); 432 pte_free_kernel(NULL, pte_table); 433 put_page(virt_to_page(pmd)); 434} 435 436static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 437{ 438 pte_t *pte, *start_pte; 439 440 start_pte = pte = pte_offset_kernel(pmd, addr); 441 do { 442 if (!pte_none(*pte)) { 443 kvm_set_pte(pte, __pte(0)); 444 put_page(virt_to_page(pte)); 445 } 446 } while (pte++, addr += PAGE_SIZE, addr != end); 447 448 if (hyp_pte_table_empty(start_pte)) 449 clear_hyp_pmd_entry(pmd); 450} 451 452static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) 453{ 454 phys_addr_t next; 455 pmd_t *pmd, *start_pmd; 456 457 start_pmd = pmd = pmd_offset(pud, addr); 458 do { 459 next = pmd_addr_end(addr, end); 460 /* Hyp doesn't use huge pmds */ 461 if (!pmd_none(*pmd)) 462 unmap_hyp_ptes(pmd, addr, next); 463 } while (pmd++, addr = next, addr != end); 464 465 if (hyp_pmd_table_empty(start_pmd)) 466 clear_hyp_pud_entry(pud); 467} 468 469static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) 470{ 471 phys_addr_t next; 472 pud_t *pud, *start_pud; 473 474 start_pud = pud = pud_offset(pgd, addr); 475 do { 476 next = pud_addr_end(addr, end); 477 /* Hyp doesn't use huge puds */ 478 if (!pud_none(*pud)) 479 unmap_hyp_pmds(pud, addr, next); 480 } while (pud++, addr = next, addr != end); 481 482 if (hyp_pud_table_empty(start_pud)) 483 clear_hyp_pgd_entry(pgd); 484} 485 486static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) 487{ 488 return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); 489} 490 491static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, 492 phys_addr_t start, u64 size) 493{ 494 pgd_t *pgd; 495 phys_addr_t addr = start, end = start + size; 496 phys_addr_t next; 497 498 /* 499 * We don't unmap anything from HYP, except at the hyp tear down. 500 * Hence, we don't have to invalidate the TLBs here. 
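
/*
 * Worked example (not part of this file) of the kvm_pgd_index() arithmetic
 * above: the top-level index is just the address shifted down by PGDIR_SHIFT
 * and masked by the number of PGD entries. The shift and entry count below
 * are illustrative, not the values of any particular kernel configuration.
 */
#include <assert.h>

#define EX_PGDIR_SHIFT  30
#define EX_PTRS_PER_PGD 512U

static unsigned int ex_pgd_index(unsigned long long addr, unsigned int ptrs_per_pgd)
{
        return (addr >> EX_PGDIR_SHIFT) & (ptrs_per_pgd - 1);
}

int main(void)
{
        /* 0x40000000 is 1 GiB, i.e. the second 1 GiB slot of the table. */
        assert(ex_pgd_index(0x40000000ULL, EX_PTRS_PER_PGD) == 1);
        /* Higher address bits wrap modulo the number of PGD entries. */
        assert(ex_pgd_index(0x8040000000ULL, EX_PTRS_PER_PGD) == 1);
        return 0;
}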
501 */ 502 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); 503 do { 504 next = pgd_addr_end(addr, end); 505 if (!pgd_none(*pgd)) 506 unmap_hyp_puds(pgd, addr, next); 507 } while (pgd++, addr = next, addr != end); 508} 509 510static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) 511{ 512 __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); 513} 514 515static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) 516{ 517 __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); 518} 519 520/** 521 * free_hyp_pgds - free Hyp-mode page tables 522 * 523 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and 524 * therefore contains either mappings in the kernel memory area (above 525 * PAGE_OFFSET), or device mappings in the idmap range. 526 * 527 * boot_hyp_pgd should only map the idmap range, and is only used in 528 * the extended idmap case. 529 */ 530void free_hyp_pgds(void) 531{ 532 pgd_t *id_pgd; 533 534 mutex_lock(&kvm_hyp_pgd_mutex); 535 536 id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; 537 538 if (id_pgd) { 539 /* In case we never called hyp_mmu_init() */ 540 if (!io_map_base) 541 io_map_base = hyp_idmap_start; 542 unmap_hyp_idmap_range(id_pgd, io_map_base, 543 hyp_idmap_start + PAGE_SIZE - io_map_base); 544 } 545 546 if (boot_hyp_pgd) { 547 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); 548 boot_hyp_pgd = NULL; 549 } 550 551 if (hyp_pgd) { 552 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), 553 (uintptr_t)high_memory - PAGE_OFFSET); 554 555 free_pages((unsigned long)hyp_pgd, hyp_pgd_order); 556 hyp_pgd = NULL; 557 } 558 if (merged_hyp_pgd) { 559 clear_page(merged_hyp_pgd); 560 free_page((unsigned long)merged_hyp_pgd); 561 merged_hyp_pgd = NULL; 562 } 563 564 mutex_unlock(&kvm_hyp_pgd_mutex); 565} 566 567static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 568 unsigned long end, unsigned long pfn, 569 pgprot_t prot) 570{ 571 pte_t *pte; 572 unsigned long addr; 573 574 addr = start; 575 do { 576 pte = pte_offset_kernel(pmd, addr); 577 kvm_set_pte(pte, pfn_pte(pfn, prot)); 578 get_page(virt_to_page(pte)); 579 kvm_flush_dcache_to_poc(pte, sizeof(*pte)); 580 pfn++; 581 } while (addr += PAGE_SIZE, addr != end); 582} 583 584static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 585 unsigned long end, unsigned long pfn, 586 pgprot_t prot) 587{ 588 pmd_t *pmd; 589 pte_t *pte; 590 unsigned long addr, next; 591 592 addr = start; 593 do { 594 pmd = pmd_offset(pud, addr); 595 596 BUG_ON(pmd_sect(*pmd)); 597 598 if (pmd_none(*pmd)) { 599 pte = pte_alloc_one_kernel(NULL, addr); 600 if (!pte) { 601 kvm_err("Cannot allocate Hyp pte\n"); 602 return -ENOMEM; 603 } 604 pmd_populate_kernel(NULL, pmd, pte); 605 get_page(virt_to_page(pmd)); 606 kvm_flush_dcache_to_poc(pmd, sizeof(*pmd)); 607 } 608 609 next = pmd_addr_end(addr, end); 610 611 create_hyp_pte_mappings(pmd, addr, next, pfn, prot); 612 pfn += (next - addr) >> PAGE_SHIFT; 613 } while (addr = next, addr != end); 614 615 return 0; 616} 617 618static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, 619 unsigned long end, unsigned long pfn, 620 pgprot_t prot) 621{ 622 pud_t *pud; 623 pmd_t *pmd; 624 unsigned long addr, next; 625 int ret; 626 627 addr = start; 628 do { 629 pud = pud_offset(pgd, addr); 630 631 if (pud_none_or_clear_bad(pud)) { 632 pmd = pmd_alloc_one(NULL, addr); 633 if (!pmd) { 634 kvm_err("Cannot allocate Hyp pmd\n"); 635 return -ENOMEM; 636 } 637 pud_populate(NULL, pud, pmd); 638 get_page(virt_to_page(pud)); 639 
kvm_flush_dcache_to_poc(pud, sizeof(*pud)); 640 } 641 642 next = pud_addr_end(addr, end); 643 ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); 644 if (ret) 645 return ret; 646 pfn += (next - addr) >> PAGE_SHIFT; 647 } while (addr = next, addr != end); 648 649 return 0; 650} 651 652static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, 653 unsigned long start, unsigned long end, 654 unsigned long pfn, pgprot_t prot) 655{ 656 pgd_t *pgd; 657 pud_t *pud; 658 unsigned long addr, next; 659 int err = 0; 660 661 mutex_lock(&kvm_hyp_pgd_mutex); 662 addr = start & PAGE_MASK; 663 end = PAGE_ALIGN(end); 664 do { 665 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); 666 667 if (pgd_none(*pgd)) { 668 pud = pud_alloc_one(NULL, addr); 669 if (!pud) { 670 kvm_err("Cannot allocate Hyp pud\n"); 671 err = -ENOMEM; 672 goto out; 673 } 674 pgd_populate(NULL, pgd, pud); 675 get_page(virt_to_page(pgd)); 676 kvm_flush_dcache_to_poc(pgd, sizeof(*pgd)); 677 } 678 679 next = pgd_addr_end(addr, end); 680 err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); 681 if (err) 682 goto out; 683 pfn += (next - addr) >> PAGE_SHIFT; 684 } while (addr = next, addr != end); 685out: 686 mutex_unlock(&kvm_hyp_pgd_mutex); 687 return err; 688} 689 690static phys_addr_t kvm_kaddr_to_phys(void *kaddr) 691{ 692 if (!is_vmalloc_addr(kaddr)) { 693 BUG_ON(!virt_addr_valid(kaddr)); 694 return __pa(kaddr); 695 } else { 696 return page_to_phys(vmalloc_to_page(kaddr)) + 697 offset_in_page(kaddr); 698 } 699} 700 701/** 702 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 703 * @from: The virtual kernel start address of the range 704 * @to: The virtual kernel end address of the range (exclusive) 705 * @prot: The protection to be applied to this range 706 * 707 * The same virtual address as the kernel virtual address is also used 708 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 709 * physical pages. 710 */ 711int create_hyp_mappings(void *from, void *to, pgprot_t prot) 712{ 713 phys_addr_t phys_addr; 714 unsigned long virt_addr; 715 unsigned long start = kern_hyp_va((unsigned long)from); 716 unsigned long end = kern_hyp_va((unsigned long)to); 717 718 if (is_kernel_in_hyp_mode()) 719 return 0; 720 721 start = start & PAGE_MASK; 722 end = PAGE_ALIGN(end); 723 724 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 725 int err; 726 727 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); 728 err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, 729 virt_addr, virt_addr + PAGE_SIZE, 730 __phys_to_pfn(phys_addr), 731 prot); 732 if (err) 733 return err; 734 } 735 736 return 0; 737} 738 739static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, 740 unsigned long *haddr, pgprot_t prot) 741{ 742 pgd_t *pgd = hyp_pgd; 743 unsigned long base; 744 int ret = 0; 745 746 mutex_lock(&kvm_hyp_pgd_mutex); 747 748 /* 749 * This assumes that we we have enough space below the idmap 750 * page to allocate our VAs. If not, the check below will 751 * kick. A potential alternative would be to detect that 752 * overflow and switch to an allocation above the idmap. 753 * 754 * The allocated size is always a multiple of PAGE_SIZE. 755 */ 756 size = PAGE_ALIGN(size + offset_in_page(phys_addr)); 757 base = io_map_base - size; 758 759 /* 760 * Verify that BIT(VA_BITS - 1) hasn't been flipped by 761 * allocating the new area, as it would indicate we've 762 * overflowed the idmap/IO address range. 
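
/*
 * Illustrative user-space sketch (not part of this file) of the downward VA
 * allocator in __create_hyp_private_mapping(): private HYP VAs are carved
 * out below io_map_base, and the XOR test just below catches the allocation
 * underflowing out of the idmap/IO half of the address space. VA_BITS and
 * the starting io_map_base value are made up for the example and assume
 * 64-bit longs.
 */
#include <stdio.h>

#define EX_VA_BITS 39
#define EX_BIT(n)  (1UL << (n))

static unsigned long ex_io_map_base = EX_BIT(EX_VA_BITS - 1) + 0x200000;

static int ex_alloc_private_va(unsigned long size, unsigned long *out)
{
        unsigned long base = ex_io_map_base - size;

        /* Bit (VA_BITS - 1) flipping means we dropped below the IO range. */
        if ((base ^ ex_io_map_base) & EX_BIT(EX_VA_BITS - 1))
                return -1;

        ex_io_map_base = base;
        *out = base;
        return 0;
}

int main(void)
{
        unsigned long va;

        if (ex_alloc_private_va(0x100000, &va) == 0)
                printf("first allocation at %#lx\n", va);
        if (ex_alloc_private_va(0x200000, &va) != 0)
                printf("second allocation would overflow the range\n");
        return 0;
}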
763 */ 764 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 765 ret = -ENOMEM; 766 else 767 io_map_base = base; 768 769 mutex_unlock(&kvm_hyp_pgd_mutex); 770 771 if (ret) 772 goto out; 773 774 if (__kvm_cpu_uses_extended_idmap()) 775 pgd = boot_hyp_pgd; 776 777 ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 778 base, base + size, 779 __phys_to_pfn(phys_addr), prot); 780 if (ret) 781 goto out; 782 783 *haddr = base + offset_in_page(phys_addr); 784 785out: 786 return ret; 787} 788 789/** 790 * create_hyp_io_mappings - Map IO into both kernel and HYP 791 * @phys_addr: The physical start address which gets mapped 792 * @size: Size of the region being mapped 793 * @kaddr: Kernel VA for this mapping 794 * @haddr: HYP VA for this mapping 795 */ 796int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 797 void __iomem **kaddr, 798 void __iomem **haddr) 799{ 800 unsigned long addr; 801 int ret; 802 803 *kaddr = ioremap(phys_addr, size); 804 if (!*kaddr) 805 return -ENOMEM; 806 807 if (is_kernel_in_hyp_mode()) { 808 *haddr = *kaddr; 809 return 0; 810 } 811 812 ret = __create_hyp_private_mapping(phys_addr, size, 813 &addr, PAGE_HYP_DEVICE); 814 if (ret) { 815 iounmap(*kaddr); 816 *kaddr = NULL; 817 *haddr = NULL; 818 return ret; 819 } 820 821 *haddr = (void __iomem *)addr; 822 return 0; 823} 824 825/** 826 * create_hyp_exec_mappings - Map an executable range into HYP 827 * @phys_addr: The physical start address which gets mapped 828 * @size: Size of the region being mapped 829 * @haddr: HYP VA for this mapping 830 */ 831int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 832 void **haddr) 833{ 834 unsigned long addr; 835 int ret; 836 837 BUG_ON(is_kernel_in_hyp_mode()); 838 839 ret = __create_hyp_private_mapping(phys_addr, size, 840 &addr, PAGE_HYP_EXEC); 841 if (ret) { 842 *haddr = NULL; 843 return ret; 844 } 845 846 *haddr = (void *)addr; 847 return 0; 848} 849 850/** 851 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. 852 * @kvm: The KVM struct pointer for the VM. 853 * 854 * Allocates only the stage-2 HW PGD level table(s) (can support either full 855 * 40-bit input addresses or limited to 32-bit input addresses). Clears the 856 * allocated pages. 857 * 858 * Note we don't need locking here as this is only called when the VM is 859 * created, which can only be done once. 860 */ 861int kvm_alloc_stage2_pgd(struct kvm *kvm) 862{ 863 pgd_t *pgd; 864 865 if (kvm->arch.pgd != NULL) { 866 kvm_err("kvm_arch already initialized?\n"); 867 return -EINVAL; 868 } 869 870 /* Allocate the HW PGD, making sure that each page gets its own refcount */ 871 pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); 872 if (!pgd) 873 return -ENOMEM; 874 875 kvm->arch.pgd = pgd; 876 return 0; 877} 878 879static void stage2_unmap_memslot(struct kvm *kvm, 880 struct kvm_memory_slot *memslot) 881{ 882 hva_t hva = memslot->userspace_addr; 883 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 884 phys_addr_t size = PAGE_SIZE * memslot->npages; 885 hva_t reg_end = hva + size; 886 887 /* 888 * A memory region could potentially cover multiple VMAs, and any holes 889 * between them, so iterate over all of them to find out if we should 890 * unmap any of them. 
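
/*
 * Illustrative sketch (not part of this file): the VMA walk in
 * stage2_unmap_memslot() below boils down to clamping each VMA against the
 * memory region, i.e. [max(hva, vm_start), min(reg_end, vm_end)), and
 * skipping empty intersections. The intervals used here are invented.
 */
#include <stdio.h>

struct ex_range { unsigned long start, end; };

static int ex_intersect(struct ex_range a, struct ex_range b, struct ex_range *out)
{
        out->start = a.start > b.start ? a.start : b.start;
        out->end = a.end < b.end ? a.end : b.end;
        return out->start < out->end;
}

int main(void)
{
        struct ex_range region = { 0x10000, 0x40000 };
        struct ex_range vmas[] = { { 0x08000, 0x18000 }, { 0x20000, 0x50000 } };
        struct ex_range hit;

        for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
                if (ex_intersect(region, vmas[i], &hit))
                        printf("unmap [%#lx, %#lx)\n", hit.start, hit.end);
        return 0;
}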
891 * 892 * +--------------------------------------------+ 893 * +---------------+----------------+ +----------------+ 894 * | : VMA 1 | VMA 2 | | VMA 3 : | 895 * +---------------+----------------+ +----------------+ 896 * | memory region | 897 * +--------------------------------------------+ 898 */ 899 do { 900 struct vm_area_struct *vma = find_vma(current->mm, hva); 901 hva_t vm_start, vm_end; 902 903 if (!vma || vma->vm_start >= reg_end) 904 break; 905 906 /* 907 * Take the intersection of this VMA with the memory region 908 */ 909 vm_start = max(hva, vma->vm_start); 910 vm_end = min(reg_end, vma->vm_end); 911 912 if (!(vma->vm_flags & VM_PFNMAP)) { 913 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 914 unmap_stage2_range(kvm, gpa, vm_end - vm_start); 915 } 916 hva = vm_end; 917 } while (hva < reg_end); 918} 919 920/** 921 * stage2_unmap_vm - Unmap Stage-2 RAM mappings 922 * @kvm: The struct kvm pointer 923 * 924 * Go through the memregions and unmap any reguler RAM 925 * backing memory already mapped to the VM. 926 */ 927void stage2_unmap_vm(struct kvm *kvm) 928{ 929 struct kvm_memslots *slots; 930 struct kvm_memory_slot *memslot; 931 int idx; 932 933 idx = srcu_read_lock(&kvm->srcu); 934 down_read(&current->mm->mmap_sem); 935 spin_lock(&kvm->mmu_lock); 936 937 slots = kvm_memslots(kvm); 938 kvm_for_each_memslot(memslot, slots) 939 stage2_unmap_memslot(kvm, memslot); 940 941 spin_unlock(&kvm->mmu_lock); 942 up_read(&current->mm->mmap_sem); 943 srcu_read_unlock(&kvm->srcu, idx); 944} 945 946/** 947 * kvm_free_stage2_pgd - free all stage-2 tables 948 * @kvm: The KVM struct pointer for the VM. 949 * 950 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all 951 * underlying level-2 and level-3 tables before freeing the actual level-1 table 952 * and setting the struct pointer to NULL. 953 */ 954void kvm_free_stage2_pgd(struct kvm *kvm) 955{ 956 void *pgd = NULL; 957 958 spin_lock(&kvm->mmu_lock); 959 if (kvm->arch.pgd) { 960 unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); 961 pgd = READ_ONCE(kvm->arch.pgd); 962 kvm->arch.pgd = NULL; 963 } 964 spin_unlock(&kvm->mmu_lock); 965 966 /* Free the HW pgd, one page at a time */ 967 if (pgd) 968 free_pages_exact(pgd, S2_PGD_SIZE); 969} 970 971static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 972 phys_addr_t addr) 973{ 974 pgd_t *pgd; 975 pud_t *pud; 976 977 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 978 if (WARN_ON(stage2_pgd_none(*pgd))) { 979 if (!cache) 980 return NULL; 981 pud = mmu_memory_cache_alloc(cache); 982 stage2_pgd_populate(pgd, pud); 983 get_page(virt_to_page(pgd)); 984 } 985 986 return stage2_pud_offset(pgd, addr); 987} 988 989static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 990 phys_addr_t addr) 991{ 992 pud_t *pud; 993 pmd_t *pmd; 994 995 pud = stage2_get_pud(kvm, cache, addr); 996 if (!pud) 997 return NULL; 998 999 if (stage2_pud_none(*pud)) { 1000 if (!cache) 1001 return NULL; 1002 pmd = mmu_memory_cache_alloc(cache); 1003 stage2_pud_populate(pud, pmd); 1004 get_page(virt_to_page(pud)); 1005 } 1006 1007 return stage2_pmd_offset(pud, addr); 1008} 1009 1010static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache 1011 *cache, phys_addr_t addr, const pmd_t *new_pmd) 1012{ 1013 pmd_t *pmd, old_pmd; 1014 1015 pmd = stage2_get_pmd(kvm, cache, addr); 1016 VM_BUG_ON(!pmd); 1017 1018 /* 1019 * Mapping in huge pages should only happen through a fault. 
If a 1020 * page is merged into a transparent huge page, the individual 1021 * subpages of that huge page should be unmapped through MMU 1022 * notifiers before we get here. 1023 * 1024 * Merging of CompoundPages is not supported; they should become 1025 * splitting first, unmapped, merged, and mapped back in on-demand. 1026 */ 1027 VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); 1028 1029 old_pmd = *pmd; 1030 if (pmd_present(old_pmd)) { 1031 pmd_clear(pmd); 1032 kvm_tlb_flush_vmid_ipa(kvm, addr); 1033 } else { 1034 get_page(virt_to_page(pmd)); 1035 } 1036 1037 kvm_set_pmd(pmd, *new_pmd); 1038 return 0; 1039} 1040 1041static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) 1042{ 1043 pmd_t *pmdp; 1044 pte_t *ptep; 1045 1046 pmdp = stage2_get_pmd(kvm, NULL, addr); 1047 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) 1048 return false; 1049 1050 if (pmd_thp_or_huge(*pmdp)) 1051 return kvm_s2pmd_exec(pmdp); 1052 1053 ptep = pte_offset_kernel(pmdp, addr); 1054 if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) 1055 return false; 1056 1057 return kvm_s2pte_exec(ptep); 1058} 1059 1060static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1061 phys_addr_t addr, const pte_t *new_pte, 1062 unsigned long flags) 1063{ 1064 pmd_t *pmd; 1065 pte_t *pte, old_pte; 1066 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; 1067 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; 1068 1069 VM_BUG_ON(logging_active && !cache); 1070 1071 /* Create stage-2 page table mapping - Levels 0 and 1 */ 1072 pmd = stage2_get_pmd(kvm, cache, addr); 1073 if (!pmd) { 1074 /* 1075 * Ignore calls from kvm_set_spte_hva for unallocated 1076 * address ranges. 1077 */ 1078 return 0; 1079 } 1080 1081 /* 1082 * While dirty page logging - dissolve huge PMD, then continue on to 1083 * allocate page. 
1084 */ 1085 if (logging_active) 1086 stage2_dissolve_pmd(kvm, addr, pmd); 1087 1088 /* Create stage-2 page mappings - Level 2 */ 1089 if (pmd_none(*pmd)) { 1090 if (!cache) 1091 return 0; /* ignore calls from kvm_set_spte_hva */ 1092 pte = mmu_memory_cache_alloc(cache); 1093 pmd_populate_kernel(NULL, pmd, pte); 1094 get_page(virt_to_page(pmd)); 1095 } 1096 1097 pte = pte_offset_kernel(pmd, addr); 1098 1099 if (iomap && pte_present(*pte)) 1100 return -EFAULT; 1101 1102 /* Create 2nd stage page table mapping - Level 3 */ 1103 old_pte = *pte; 1104 if (pte_present(old_pte)) { 1105 kvm_set_pte(pte, __pte(0)); 1106 kvm_tlb_flush_vmid_ipa(kvm, addr); 1107 } else { 1108 get_page(virt_to_page(pte)); 1109 } 1110 1111 kvm_set_pte(pte, *new_pte); 1112 return 0; 1113} 1114 1115#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 1116static int stage2_ptep_test_and_clear_young(pte_t *pte) 1117{ 1118 if (pte_young(*pte)) { 1119 *pte = pte_mkold(*pte); 1120 return 1; 1121 } 1122 return 0; 1123} 1124#else 1125static int stage2_ptep_test_and_clear_young(pte_t *pte) 1126{ 1127 return __ptep_test_and_clear_young(pte); 1128} 1129#endif 1130 1131static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) 1132{ 1133 return stage2_ptep_test_and_clear_young((pte_t *)pmd); 1134} 1135 1136/** 1137 * kvm_phys_addr_ioremap - map a device range to guest IPA 1138 * 1139 * @kvm: The KVM pointer 1140 * @guest_ipa: The IPA at which to insert the mapping 1141 * @pa: The physical address of the device 1142 * @size: The size of the mapping 1143 */ 1144int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1145 phys_addr_t pa, unsigned long size, bool writable) 1146{ 1147 phys_addr_t addr, end; 1148 int ret = 0; 1149 unsigned long pfn; 1150 struct kvm_mmu_memory_cache cache = { 0, }; 1151 1152 end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; 1153 pfn = __phys_to_pfn(pa); 1154 1155 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { 1156 pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); 1157 1158 if (writable) 1159 pte = kvm_s2pte_mkwrite(pte); 1160 1161 ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, 1162 KVM_NR_MEM_OBJS); 1163 if (ret) 1164 goto out; 1165 spin_lock(&kvm->mmu_lock); 1166 ret = stage2_set_pte(kvm, &cache, addr, &pte, 1167 KVM_S2PTE_FLAG_IS_IOMAP); 1168 spin_unlock(&kvm->mmu_lock); 1169 if (ret) 1170 goto out; 1171 1172 pfn++; 1173 } 1174 1175out: 1176 mmu_free_memory_cache(&cache); 1177 return ret; 1178} 1179 1180static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) 1181{ 1182 kvm_pfn_t pfn = *pfnp; 1183 gfn_t gfn = *ipap >> PAGE_SHIFT; 1184 1185 if (PageTransCompoundMap(pfn_to_page(pfn))) { 1186 unsigned long mask; 1187 /* 1188 * The address we faulted on is backed by a transparent huge 1189 * page. However, because we map the compound huge page and 1190 * not the individual tail page, we need to transfer the 1191 * refcount to the head page. We have to be careful that the 1192 * THP doesn't start to split while we are adjusting the 1193 * refcounts. 1194 * 1195 * We are sure this doesn't happen, because mmu_notifier_retry 1196 * was successful and we are holding the mmu_lock, so if this 1197 * THP is trying to split, it will be blocked in the mmu 1198 * notifier before touching any of the pages, specifically 1199 * before being able to call __split_huge_page_refcount(). 1200 * 1201 * We can therefore safely transfer the refcount from PG_tail 1202 * to PG_head and switch the pfn from a tail page to the head 1203 * page accordingly. 
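
/*
 * Worked example (not part of this file) of the pfn/IPA realignment that
 * transparent_hugepage_adjust() performs just below, assuming 4K pages and
 * 512 PTEs per PMD (a 2M huge page). The fault address and pfn are invented.
 */
#include <assert.h>

#define EX_PTRS_PER_PMD 512UL
#define EX_PAGE_SHIFT   12
#define EX_PMD_MASK     (~((EX_PTRS_PER_PMD << EX_PAGE_SHIFT) - 1))

int main(void)
{
        unsigned long pfn = 0x12345;            /* some tail page of a THP */
        unsigned long ipa = 0x80345000;         /* faulting guest IPA */
        unsigned long mask = EX_PTRS_PER_PMD - 1;

        /* The fault sits at the same offset within both huge pages. */
        assert(((ipa >> EX_PAGE_SHIFT) & mask) == (pfn & mask));

        ipa &= EX_PMD_MASK;     /* IPA of the start of the 2M block */
        pfn &= ~mask;           /* pfn of the THP head page */

        assert(ipa == 0x80200000);
        assert(pfn == 0x12200);
        return 0;
}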
1204 */ 1205 mask = PTRS_PER_PMD - 1; 1206 VM_BUG_ON((gfn & mask) != (pfn & mask)); 1207 if (pfn & mask) { 1208 *ipap &= PMD_MASK; 1209 kvm_release_pfn_clean(pfn); 1210 pfn &= ~mask; 1211 kvm_get_pfn(pfn); 1212 *pfnp = pfn; 1213 } 1214 1215 return true; 1216 } 1217 1218 return false; 1219} 1220 1221static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) 1222{ 1223 if (kvm_vcpu_trap_is_iabt(vcpu)) 1224 return false; 1225 1226 return kvm_vcpu_dabt_iswrite(vcpu); 1227} 1228 1229/** 1230 * stage2_wp_ptes - write protect PMD range 1231 * @pmd: pointer to pmd entry 1232 * @addr: range start address 1233 * @end: range end address 1234 */ 1235static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 1236{ 1237 pte_t *pte; 1238 1239 pte = pte_offset_kernel(pmd, addr); 1240 do { 1241 if (!pte_none(*pte)) { 1242 if (!kvm_s2pte_readonly(pte)) 1243 kvm_set_s2pte_readonly(pte); 1244 } 1245 } while (pte++, addr += PAGE_SIZE, addr != end); 1246} 1247 1248/** 1249 * stage2_wp_pmds - write protect PUD range 1250 * @pud: pointer to pud entry 1251 * @addr: range start address 1252 * @end: range end address 1253 */ 1254static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) 1255{ 1256 pmd_t *pmd; 1257 phys_addr_t next; 1258 1259 pmd = stage2_pmd_offset(pud, addr); 1260 1261 do { 1262 next = stage2_pmd_addr_end(addr, end); 1263 if (!pmd_none(*pmd)) { 1264 if (pmd_thp_or_huge(*pmd)) { 1265 if (!kvm_s2pmd_readonly(pmd)) 1266 kvm_set_s2pmd_readonly(pmd); 1267 } else { 1268 stage2_wp_ptes(pmd, addr, next); 1269 } 1270 } 1271 } while (pmd++, addr = next, addr != end); 1272} 1273 1274/** 1275 * stage2_wp_puds - write protect PGD range 1276 * @pgd: pointer to pgd entry 1277 * @addr: range start address 1278 * @end: range end address 1279 * 1280 * Process PUD entries, for a huge PUD we cause a panic. 1281 */ 1282static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) 1283{ 1284 pud_t *pud; 1285 phys_addr_t next; 1286 1287 pud = stage2_pud_offset(pgd, addr); 1288 do { 1289 next = stage2_pud_addr_end(addr, end); 1290 if (!stage2_pud_none(*pud)) { 1291 /* TODO:PUD not supported, revisit later if supported */ 1292 BUG_ON(stage2_pud_huge(*pud)); 1293 stage2_wp_pmds(pud, addr, next); 1294 } 1295 } while (pud++, addr = next, addr != end); 1296} 1297 1298/** 1299 * stage2_wp_range() - write protect stage2 memory region range 1300 * @kvm: The KVM pointer 1301 * @addr: Start address of range 1302 * @end: End address of range 1303 */ 1304static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) 1305{ 1306 pgd_t *pgd; 1307 phys_addr_t next; 1308 1309 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 1310 do { 1311 /* 1312 * Release kvm_mmu_lock periodically if the memory region is 1313 * large. Otherwise, we may see kernel panics with 1314 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, 1315 * CONFIG_LOCKDEP. Additionally, holding the lock too long 1316 * will also starve other vCPUs. We have to also make sure 1317 * that the page tables are not freed while we released 1318 * the lock. 
1319 */ 1320 cond_resched_lock(&kvm->mmu_lock); 1321 if (!READ_ONCE(kvm->arch.pgd)) 1322 break; 1323 next = stage2_pgd_addr_end(addr, end); 1324 if (stage2_pgd_present(*pgd)) 1325 stage2_wp_puds(pgd, addr, next); 1326 } while (pgd++, addr = next, addr != end); 1327} 1328 1329/** 1330 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1331 * @kvm: The KVM pointer 1332 * @slot: The memory slot to write protect 1333 * 1334 * Called to start logging dirty pages after memory region 1335 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1336 * all present PMD and PTEs are write protected in the memory region. 1337 * Afterwards read of dirty page log can be called. 1338 * 1339 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1340 * serializing operations for VM memory regions. 1341 */ 1342void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1343{ 1344 struct kvm_memslots *slots = kvm_memslots(kvm); 1345 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1346 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; 1347 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1348 1349 spin_lock(&kvm->mmu_lock); 1350 stage2_wp_range(kvm, start, end); 1351 spin_unlock(&kvm->mmu_lock); 1352 kvm_flush_remote_tlbs(kvm); 1353} 1354 1355/** 1356 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages 1357 * @kvm: The KVM pointer 1358 * @slot: The memory slot associated with mask 1359 * @gfn_offset: The gfn offset in memory slot 1360 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 1361 * slot to be write protected 1362 * 1363 * Walks bits set in mask write protects the associated pte's. Caller must 1364 * acquire kvm_mmu_lock. 1365 */ 1366static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1367 struct kvm_memory_slot *slot, 1368 gfn_t gfn_offset, unsigned long mask) 1369{ 1370 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1371 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1372 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1373 1374 stage2_wp_range(kvm, start, end); 1375} 1376 1377/* 1378 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1379 * dirty pages. 1380 * 1381 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1382 * enable dirty logging for them. 
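
/*
 * Worked example (not part of this file) of how
 * kvm_mmu_write_protect_pt_masked() above turns one word of the dirty bitmap
 * into an address range. __ffs()/__fls() are the kernel's bit helpers; the
 * compiler builtins below stand in for them and assume a 64-bit long.
 */
#include <assert.h>

#define EX_PAGE_SHIFT 12

int main(void)
{
        unsigned long base_gfn = 0x1000;
        unsigned long mask = 0x0f0;     /* pages 4..7 of this word are dirty */

        unsigned long first = __builtin_ctzl(mask);      /* __ffs(mask) == 4 */
        unsigned long last = 63 - __builtin_clzl(mask);  /* __fls(mask) == 7 */

        unsigned long start = (base_gfn + first) << EX_PAGE_SHIFT;
        unsigned long end = (base_gfn + last + 1) << EX_PAGE_SHIFT;

        assert(start == 0x1004000);
        assert(end == 0x1008000);
        return 0;
}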
1383 */ 1384void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1385 struct kvm_memory_slot *slot, 1386 gfn_t gfn_offset, unsigned long mask) 1387{ 1388 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1389} 1390 1391static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) 1392{ 1393 __clean_dcache_guest_page(pfn, size); 1394} 1395 1396static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) 1397{ 1398 __invalidate_icache_guest_page(pfn, size); 1399} 1400 1401static void kvm_send_hwpoison_signal(unsigned long address, 1402 struct vm_area_struct *vma) 1403{ 1404 siginfo_t info; 1405 1406 clear_siginfo(&info); 1407 info.si_signo = SIGBUS; 1408 info.si_errno = 0; 1409 info.si_code = BUS_MCEERR_AR; 1410 info.si_addr = (void __user *)address; 1411 1412 if (is_vm_hugetlb_page(vma)) 1413 info.si_addr_lsb = huge_page_shift(hstate_vma(vma)); 1414 else 1415 info.si_addr_lsb = PAGE_SHIFT; 1416 1417 send_sig_info(SIGBUS, &info, current); 1418} 1419 1420static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1421 struct kvm_memory_slot *memslot, unsigned long hva, 1422 unsigned long fault_status) 1423{ 1424 int ret; 1425 bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; 1426 unsigned long mmu_seq; 1427 gfn_t gfn = fault_ipa >> PAGE_SHIFT; 1428 struct kvm *kvm = vcpu->kvm; 1429 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 1430 struct vm_area_struct *vma; 1431 kvm_pfn_t pfn; 1432 pgprot_t mem_type = PAGE_S2; 1433 bool logging_active = memslot_is_logging(memslot); 1434 unsigned long flags = 0; 1435 1436 write_fault = kvm_is_write_fault(vcpu); 1437 exec_fault = kvm_vcpu_trap_is_iabt(vcpu); 1438 VM_BUG_ON(write_fault && exec_fault); 1439 1440 if (fault_status == FSC_PERM && !write_fault && !exec_fault) { 1441 kvm_err("Unexpected L2 read permission error\n"); 1442 return -EFAULT; 1443 } 1444 1445 /* Let's check if we will get back a huge page backed by hugetlbfs */ 1446 down_read(&current->mm->mmap_sem); 1447 vma = find_vma_intersection(current->mm, hva, hva + 1); 1448 if (unlikely(!vma)) { 1449 kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1450 up_read(&current->mm->mmap_sem); 1451 return -EFAULT; 1452 } 1453 1454 if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { 1455 hugetlb = true; 1456 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; 1457 } else { 1458 /* 1459 * Pages belonging to memslots that don't have the same 1460 * alignment for userspace and IPA cannot be mapped using 1461 * block descriptors even if the pages belong to a THP for 1462 * the process, because the stage-2 block descriptor will 1463 * cover more than a single THP and we loose atomicity for 1464 * unmapping, updates, and splits of the THP or other pages 1465 * in the stage-2 block range. 
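
/*
 * Worked example (not part of this file) of the alignment test that follows
 * in user_mem_abort(): a stage-2 block mapping is only possible when the
 * userspace address and the guest IPA share the same offset within a 2M
 * region. Addresses are invented and 64-bit longs are assumed.
 */
#include <assert.h>
#include <stdbool.h>

#define EX_PAGE_SHIFT 12
#define EX_PMD_MASK   (~0x1fffffUL)     /* 2M blocks */

static bool ex_can_use_block_mapping(unsigned long userspace_addr,
                                     unsigned long base_gfn)
{
        return (userspace_addr & ~EX_PMD_MASK) ==
               ((base_gfn << EX_PAGE_SHIFT) & ~EX_PMD_MASK);
}

int main(void)
{
        /* hva and IPA both start exactly on a 2M boundary: block mapping OK. */
        assert(ex_can_use_block_mapping(0x7f0000400000UL, 0x80400000UL >> EX_PAGE_SHIFT));
        /* A 1M skew between hva and IPA forces 4K pages (force_pte). */
        assert(!ex_can_use_block_mapping(0x7f0000500000UL, 0x80400000UL >> EX_PAGE_SHIFT));
        return 0;
}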
1466 */ 1467 if ((memslot->userspace_addr & ~PMD_MASK) != 1468 ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) 1469 force_pte = true; 1470 } 1471 up_read(&current->mm->mmap_sem); 1472 1473 /* We need minimum second+third level pages */ 1474 ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, 1475 KVM_NR_MEM_OBJS); 1476 if (ret) 1477 return ret; 1478 1479 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1480 /* 1481 * Ensure the read of mmu_notifier_seq happens before we call 1482 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk 1483 * the page we just got a reference to gets unmapped before we have a 1484 * chance to grab the mmu_lock, which ensure that if the page gets 1485 * unmapped afterwards, the call to kvm_unmap_hva will take it away 1486 * from us again properly. This smp_rmb() interacts with the smp_wmb() 1487 * in kvm_mmu_notifier_invalidate_<page|range_end>. 1488 */ 1489 smp_rmb(); 1490 1491 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 1492 if (pfn == KVM_PFN_ERR_HWPOISON) { 1493 kvm_send_hwpoison_signal(hva, vma); 1494 return 0; 1495 } 1496 if (is_error_noslot_pfn(pfn)) 1497 return -EFAULT; 1498 1499 if (kvm_is_device_pfn(pfn)) { 1500 mem_type = PAGE_S2_DEVICE; 1501 flags |= KVM_S2PTE_FLAG_IS_IOMAP; 1502 } else if (logging_active) { 1503 /* 1504 * Faults on pages in a memslot with logging enabled 1505 * should not be mapped with huge pages (it introduces churn 1506 * and performance degradation), so force a pte mapping. 1507 */ 1508 force_pte = true; 1509 flags |= KVM_S2_FLAG_LOGGING_ACTIVE; 1510 1511 /* 1512 * Only actually map the page as writable if this was a write 1513 * fault. 1514 */ 1515 if (!write_fault) 1516 writable = false; 1517 } 1518 1519 spin_lock(&kvm->mmu_lock); 1520 if (mmu_notifier_retry(kvm, mmu_seq)) 1521 goto out_unlock; 1522 1523 if (!hugetlb && !force_pte) 1524 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); 1525 1526 if (hugetlb) { 1527 pmd_t new_pmd = pfn_pmd(pfn, mem_type); 1528 new_pmd = pmd_mkhuge(new_pmd); 1529 if (writable) { 1530 new_pmd = kvm_s2pmd_mkwrite(new_pmd); 1531 kvm_set_pfn_dirty(pfn); 1532 } 1533 1534 if (fault_status != FSC_PERM) 1535 clean_dcache_guest_page(pfn, PMD_SIZE); 1536 1537 if (exec_fault) { 1538 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1539 invalidate_icache_guest_page(pfn, PMD_SIZE); 1540 } else if (fault_status == FSC_PERM) { 1541 /* Preserve execute if XN was already cleared */ 1542 if (stage2_is_exec(kvm, fault_ipa)) 1543 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1544 } 1545 1546 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1547 } else { 1548 pte_t new_pte = pfn_pte(pfn, mem_type); 1549 1550 if (writable) { 1551 new_pte = kvm_s2pte_mkwrite(new_pte); 1552 kvm_set_pfn_dirty(pfn); 1553 mark_page_dirty(kvm, gfn); 1554 } 1555 1556 if (fault_status != FSC_PERM) 1557 clean_dcache_guest_page(pfn, PAGE_SIZE); 1558 1559 if (exec_fault) { 1560 new_pte = kvm_s2pte_mkexec(new_pte); 1561 invalidate_icache_guest_page(pfn, PAGE_SIZE); 1562 } else if (fault_status == FSC_PERM) { 1563 /* Preserve execute if XN was already cleared */ 1564 if (stage2_is_exec(kvm, fault_ipa)) 1565 new_pte = kvm_s2pte_mkexec(new_pte); 1566 } 1567 1568 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); 1569 } 1570 1571out_unlock: 1572 spin_unlock(&kvm->mmu_lock); 1573 kvm_set_pfn_accessed(pfn); 1574 kvm_release_pfn_clean(pfn); 1575 return ret; 1576} 1577 1578/* 1579 * Resolve the access fault by making the page young again. 
1580 * Note that because the faulting entry is guaranteed not to be 1581 * cached in the TLB, we don't need to invalidate anything. 1582 * Only the HW Access Flag updates are supported for Stage 2 (no DBM), 1583 * so there is no need for atomic (pte|pmd)_mkyoung operations. 1584 */ 1585static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1586{ 1587 pmd_t *pmd; 1588 pte_t *pte; 1589 kvm_pfn_t pfn; 1590 bool pfn_valid = false; 1591 1592 trace_kvm_access_fault(fault_ipa); 1593 1594 spin_lock(&vcpu->kvm->mmu_lock); 1595 1596 pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); 1597 if (!pmd || pmd_none(*pmd)) /* Nothing there */ 1598 goto out; 1599 1600 if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ 1601 *pmd = pmd_mkyoung(*pmd); 1602 pfn = pmd_pfn(*pmd); 1603 pfn_valid = true; 1604 goto out; 1605 } 1606 1607 pte = pte_offset_kernel(pmd, fault_ipa); 1608 if (pte_none(*pte)) /* Nothing there either */ 1609 goto out; 1610 1611 *pte = pte_mkyoung(*pte); /* Just a page... */ 1612 pfn = pte_pfn(*pte); 1613 pfn_valid = true; 1614out: 1615 spin_unlock(&vcpu->kvm->mmu_lock); 1616 if (pfn_valid) 1617 kvm_set_pfn_accessed(pfn); 1618} 1619 1620/** 1621 * kvm_handle_guest_abort - handles all 2nd stage aborts 1622 * @vcpu: the VCPU pointer 1623 * @run: the kvm_run structure 1624 * 1625 * Any abort that gets to the host is almost guaranteed to be caused by a 1626 * missing second stage translation table entry, which can mean that either the 1627 * guest simply needs more memory and we must allocate an appropriate page or it 1628 * can mean that the guest tried to access I/O memory, which is emulated by user 1629 * space. The distinction is based on the IPA causing the fault and whether this 1630 * memory region has been registered as standard RAM by user space. 1631 */ 1632int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) 1633{ 1634 unsigned long fault_status; 1635 phys_addr_t fault_ipa; 1636 struct kvm_memory_slot *memslot; 1637 unsigned long hva; 1638 bool is_iabt, write_fault, writable; 1639 gfn_t gfn; 1640 int ret, idx; 1641 1642 fault_status = kvm_vcpu_trap_get_fault_type(vcpu); 1643 1644 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1645 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1646 1647 /* Synchronous External Abort? */ 1648 if (kvm_vcpu_dabt_isextabt(vcpu)) { 1649 /* 1650 * For RAS the host kernel may handle this abort. 1651 * There is no need to pass the error into the guest. 1652 */ 1653 if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) 1654 return 1; 1655 1656 if (unlikely(!is_iabt)) { 1657 kvm_inject_vabt(vcpu); 1658 return 1; 1659 } 1660 } 1661 1662 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), 1663 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1664 1665 /* Check the stage-2 fault is trans. 
fault or write fault */ 1666 if (fault_status != FSC_FAULT && fault_status != FSC_PERM && 1667 fault_status != FSC_ACCESS) { 1668 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1669 kvm_vcpu_trap_get_class(vcpu), 1670 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1671 (unsigned long)kvm_vcpu_get_hsr(vcpu)); 1672 return -EFAULT; 1673 } 1674 1675 idx = srcu_read_lock(&vcpu->kvm->srcu); 1676 1677 gfn = fault_ipa >> PAGE_SHIFT; 1678 memslot = gfn_to_memslot(vcpu->kvm, gfn); 1679 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 1680 write_fault = kvm_is_write_fault(vcpu); 1681 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 1682 if (is_iabt) { 1683 /* Prefetch Abort on I/O address */ 1684 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 1685 ret = 1; 1686 goto out_unlock; 1687 } 1688 1689 /* 1690 * Check for a cache maintenance operation. Since we 1691 * ended-up here, we know it is outside of any memory 1692 * slot. But we can't find out if that is for a device, 1693 * or if the guest is just being stupid. The only thing 1694 * we know for sure is that this range cannot be cached. 1695 * 1696 * So let's assume that the guest is just being 1697 * cautious, and skip the instruction. 1698 */ 1699 if (kvm_vcpu_dabt_is_cm(vcpu)) { 1700 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 1701 ret = 1; 1702 goto out_unlock; 1703 } 1704 1705 /* 1706 * The IPA is reported as [MAX:12], so we need to 1707 * complement it with the bottom 12 bits from the 1708 * faulting VA. This is always 12 bits, irrespective 1709 * of the page size. 1710 */ 1711 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); 1712 ret = io_mem_abort(vcpu, run, fault_ipa); 1713 goto out_unlock; 1714 } 1715 1716 /* Userspace should not be able to register out-of-bounds IPAs */ 1717 VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); 1718 1719 if (fault_status == FSC_ACCESS) { 1720 handle_access_fault(vcpu, fault_ipa); 1721 ret = 1; 1722 goto out_unlock; 1723 } 1724 1725 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); 1726 if (ret == 0) 1727 ret = 1; 1728out_unlock: 1729 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1730 return ret; 1731} 1732 1733static int handle_hva_to_gpa(struct kvm *kvm, 1734 unsigned long start, 1735 unsigned long end, 1736 int (*handler)(struct kvm *kvm, 1737 gpa_t gpa, u64 size, 1738 void *data), 1739 void *data) 1740{ 1741 struct kvm_memslots *slots; 1742 struct kvm_memory_slot *memslot; 1743 int ret = 0; 1744 1745 slots = kvm_memslots(kvm); 1746 1747 /* we only care about the pages that the guest sees */ 1748 kvm_for_each_memslot(memslot, slots) { 1749 unsigned long hva_start, hva_end; 1750 gfn_t gpa; 1751 1752 hva_start = max(start, memslot->userspace_addr); 1753 hva_end = min(end, memslot->userspace_addr + 1754 (memslot->npages << PAGE_SHIFT)); 1755 if (hva_start >= hva_end) 1756 continue; 1757 1758 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; 1759 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); 1760 } 1761 1762 return ret; 1763} 1764 1765static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1766{ 1767 unmap_stage2_range(kvm, gpa, size); 1768 return 0; 1769} 1770 1771int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1772{ 1773 unsigned long end = hva + PAGE_SIZE; 1774 1775 if (!kvm->arch.pgd) 1776 return 0; 1777 1778 trace_kvm_unmap_hva(hva); 1779 handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); 1780 return 0; 1781} 1782 1783int kvm_unmap_hva_range(struct kvm *kvm, 1784 unsigned long start, 
unsigned long end) 1785{ 1786 if (!kvm->arch.pgd) 1787 return 0; 1788 1789 trace_kvm_unmap_hva_range(start, end); 1790 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); 1791 return 0; 1792} 1793 1794static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1795{ 1796 pte_t *pte = (pte_t *)data; 1797 1798 WARN_ON(size != PAGE_SIZE); 1799 /* 1800 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE 1801 * flag clear because MMU notifiers will have unmapped a huge PMD before 1802 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and 1803 * therefore stage2_set_pte() never needs to clear out a huge PMD 1804 * through this calling path. 1805 */ 1806 stage2_set_pte(kvm, NULL, gpa, pte, 0); 1807 return 0; 1808} 1809 1810 1811void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1812{ 1813 unsigned long end = hva + PAGE_SIZE; 1814 pte_t stage2_pte; 1815 1816 if (!kvm->arch.pgd) 1817 return; 1818 1819 trace_kvm_set_spte_hva(hva); 1820 stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2); 1821 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); 1822} 1823 1824static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1825{ 1826 pmd_t *pmd; 1827 pte_t *pte; 1828 1829 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); 1830 pmd = stage2_get_pmd(kvm, NULL, gpa); 1831 if (!pmd || pmd_none(*pmd)) /* Nothing there */ 1832 return 0; 1833 1834 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ 1835 return stage2_pmdp_test_and_clear_young(pmd); 1836 1837 pte = pte_offset_kernel(pmd, gpa); 1838 if (pte_none(*pte)) 1839 return 0; 1840 1841 return stage2_ptep_test_and_clear_young(pte); 1842} 1843 1844static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1845{ 1846 pmd_t *pmd; 1847 pte_t *pte; 1848 1849 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); 1850 pmd = stage2_get_pmd(kvm, NULL, gpa); 1851 if (!pmd || pmd_none(*pmd)) /* Nothing there */ 1852 return 0; 1853 1854 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ 1855 return pmd_young(*pmd); 1856 1857 pte = pte_offset_kernel(pmd, gpa); 1858 if (!pte_none(*pte)) /* Just a page... 
*/ 1859 return pte_young(*pte); 1860 1861 return 0; 1862} 1863 1864int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 1865{ 1866 if (!kvm->arch.pgd) 1867 return 0; 1868 trace_kvm_age_hva(start, end); 1869 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); 1870} 1871 1872int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1873{ 1874 if (!kvm->arch.pgd) 1875 return 0; 1876 trace_kvm_test_age_hva(hva); 1877 return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); 1878} 1879 1880void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) 1881{ 1882 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); 1883} 1884 1885phys_addr_t kvm_mmu_get_httbr(void) 1886{ 1887 if (__kvm_cpu_uses_extended_idmap()) 1888 return virt_to_phys(merged_hyp_pgd); 1889 else 1890 return virt_to_phys(hyp_pgd); 1891} 1892 1893phys_addr_t kvm_get_idmap_vector(void) 1894{ 1895 return hyp_idmap_vector; 1896} 1897 1898static int kvm_map_idmap_text(pgd_t *pgd) 1899{ 1900 int err; 1901 1902 /* Create the idmap in the boot page tables */ 1903 err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 1904 hyp_idmap_start, hyp_idmap_end, 1905 __phys_to_pfn(hyp_idmap_start), 1906 PAGE_HYP_EXEC); 1907 if (err) 1908 kvm_err("Failed to idmap %lx-%lx\n", 1909 hyp_idmap_start, hyp_idmap_end); 1910 1911 return err; 1912} 1913 1914int kvm_mmu_init(void) 1915{ 1916 int err; 1917 1918 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); 1919 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 1920 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); 1921 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 1922 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); 1923 1924 /* 1925 * We rely on the linker script to ensure at build time that the HYP 1926 * init code does not cross a page boundary. 1927 */ 1928 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 1929 1930 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 1931 kvm_debug("HYP VA range: %lx:%lx\n", 1932 kern_hyp_va(PAGE_OFFSET), 1933 kern_hyp_va((unsigned long)high_memory - 1)); 1934 1935 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 1936 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 1937 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 1938 /* 1939 * The idmap page is intersecting with the VA space, 1940 * it is not safe to continue further. 
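
/*
 * Worked example (not part of this file) of the BUG_ON() in kvm_mmu_init()
 * above: if the first and last byte of the HYP init code live in the same
 * page, XORing their addresses leaves only in-page bits, so masking with
 * PAGE_MASK yields zero. 4K pages and invented addresses are assumed.
 */
#include <assert.h>

#define EX_PAGE_MASK (~0xfffUL)

static int ex_crosses_page(unsigned long start, unsigned long end)
{
        return ((start ^ (end - 1)) & EX_PAGE_MASK) != 0;
}

int main(void)
{
        assert(!ex_crosses_page(0x40201000UL, 0x40201800UL));  /* fits in one page */
        assert(ex_crosses_page(0x40201800UL, 0x40202100UL));   /* spills into the next */
        return 0;
}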
1941 */ 1942 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); 1943 err = -EINVAL; 1944 goto out; 1945 } 1946 1947 hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); 1948 if (!hyp_pgd) { 1949 kvm_err("Hyp mode PGD not allocated\n"); 1950 err = -ENOMEM; 1951 goto out; 1952 } 1953 1954 if (__kvm_cpu_uses_extended_idmap()) { 1955 boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1956 hyp_pgd_order); 1957 if (!boot_hyp_pgd) { 1958 kvm_err("Hyp boot PGD not allocated\n"); 1959 err = -ENOMEM; 1960 goto out; 1961 } 1962 1963 err = kvm_map_idmap_text(boot_hyp_pgd); 1964 if (err) 1965 goto out; 1966 1967 merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 1968 if (!merged_hyp_pgd) { 1969 kvm_err("Failed to allocate extra HYP pgd\n"); 1970 goto out; 1971 } 1972 __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, 1973 hyp_idmap_start); 1974 } else { 1975 err = kvm_map_idmap_text(hyp_pgd); 1976 if (err) 1977 goto out; 1978 } 1979 1980 io_map_base = hyp_idmap_start; 1981 return 0; 1982out: 1983 free_hyp_pgds(); 1984 return err; 1985} 1986 1987void kvm_arch_commit_memory_region(struct kvm *kvm, 1988 const struct kvm_userspace_memory_region *mem, 1989 const struct kvm_memory_slot *old, 1990 const struct kvm_memory_slot *new, 1991 enum kvm_mr_change change) 1992{ 1993 /* 1994 * At this point memslot has been committed and there is an 1995 * allocated dirty_bitmap[], dirty pages will be be tracked while the 1996 * memory slot is write protected. 1997 */ 1998 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) 1999 kvm_mmu_wp_memory_region(kvm, mem->slot); 2000} 2001 2002int kvm_arch_prepare_memory_region(struct kvm *kvm, 2003 struct kvm_memory_slot *memslot, 2004 const struct kvm_userspace_memory_region *mem, 2005 enum kvm_mr_change change) 2006{ 2007 hva_t hva = mem->userspace_addr; 2008 hva_t reg_end = hva + mem->memory_size; 2009 bool writable = !(mem->flags & KVM_MEM_READONLY); 2010 int ret = 0; 2011 2012 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && 2013 change != KVM_MR_FLAGS_ONLY) 2014 return 0; 2015 2016 /* 2017 * Prevent userspace from creating a memory region outside of the IPA 2018 * space addressable by the KVM guest IPA space. 2019 */ 2020 if (memslot->base_gfn + memslot->npages >= 2021 (KVM_PHYS_SIZE >> PAGE_SHIFT)) 2022 return -EFAULT; 2023 2024 down_read(&current->mm->mmap_sem); 2025 /* 2026 * A memory region could potentially cover multiple VMAs, and any holes 2027 * between them, so iterate over all of them to find out if we can map 2028 * any of them right now. 2029 * 2030 * +--------------------------------------------+ 2031 * +---------------+----------------+ +----------------+ 2032 * | : VMA 1 | VMA 2 | | VMA 3 : | 2033 * +---------------+----------------+ +----------------+ 2034 * | memory region | 2035 * +--------------------------------------------+ 2036 */ 2037 do { 2038 struct vm_area_struct *vma = find_vma(current->mm, hva); 2039 hva_t vm_start, vm_end; 2040 2041 if (!vma || vma->vm_start >= reg_end) 2042 break; 2043 2044 /* 2045 * Mapping a read-only VMA is only allowed if the 2046 * memory region is configured as read-only. 
2047 */ 2048 if (writable && !(vma->vm_flags & VM_WRITE)) { 2049 ret = -EPERM; 2050 break; 2051 } 2052 2053 /* 2054 * Take the intersection of this VMA with the memory region 2055 */ 2056 vm_start = max(hva, vma->vm_start); 2057 vm_end = min(reg_end, vma->vm_end); 2058 2059 if (vma->vm_flags & VM_PFNMAP) { 2060 gpa_t gpa = mem->guest_phys_addr + 2061 (vm_start - mem->userspace_addr); 2062 phys_addr_t pa; 2063 2064 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; 2065 pa += vm_start - vma->vm_start; 2066 2067 /* IO region dirty page logging not allowed */ 2068 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2069 ret = -EINVAL; 2070 goto out; 2071 } 2072 2073 ret = kvm_phys_addr_ioremap(kvm, gpa, pa, 2074 vm_end - vm_start, 2075 writable); 2076 if (ret) 2077 break; 2078 } 2079 hva = vm_end; 2080 } while (hva < reg_end); 2081 2082 if (change == KVM_MR_FLAGS_ONLY) 2083 goto out; 2084 2085 spin_lock(&kvm->mmu_lock); 2086 if (ret) 2087 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); 2088 else 2089 stage2_flush_memslot(kvm, memslot); 2090 spin_unlock(&kvm->mmu_lock); 2091out: 2092 up_read(&current->mm->mmap_sem); 2093 return ret; 2094} 2095 2096void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 2097 struct kvm_memory_slot *dont) 2098{ 2099} 2100 2101int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, 2102 unsigned long npages) 2103{ 2104 return 0; 2105} 2106 2107void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) 2108{ 2109} 2110 2111void kvm_arch_flush_shadow_all(struct kvm *kvm) 2112{ 2113 kvm_free_stage2_pgd(kvm); 2114} 2115 2116void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2117 struct kvm_memory_slot *slot) 2118{ 2119 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2120 phys_addr_t size = slot->npages << PAGE_SHIFT; 2121 2122 spin_lock(&kvm->mmu_lock); 2123 unmap_stage2_range(kvm, gpa, size); 2124 spin_unlock(&kvm->mmu_lock); 2125} 2126 2127/* 2128 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2129 * 2130 * Main problems: 2131 * - S/W ops are local to a CPU (not broadcast) 2132 * - We have line migration behind our back (speculation) 2133 * - System caches don't support S/W at all (damn!) 2134 * 2135 * In the face of the above, the best we can do is to try and convert 2136 * S/W ops to VA ops. Because the guest is not allowed to infer the 2137 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2138 * which is a rather good thing for us. 2139 * 2140 * Also, it is only used when turning caches on/off ("The expected 2141 * usage of the cache maintenance instructions that operate by set/way 2142 * is associated with the cache maintenance instructions associated 2143 * with the powerdown and powerup of caches, if this is required by 2144 * the implementation."). 2145 * 2146 * We use the following policy: 2147 * 2148 * - If we trap a S/W operation, we enable VM trapping to detect 2149 * caches being turned on/off, and do a full clean. 2150 * 2151 * - We flush the caches on both caches being turned on and off. 2152 * 2153 * - Once the caches are enabled, we stop trapping VM ops. 2154 */ 2155void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2156{ 2157 unsigned long hcr = *vcpu_hcr(vcpu); 2158 2159 /* 2160 * If this is the first time we do a S/W operation 2161 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2162 * VM trapping. 2163 * 2164 * Otherwise, rely on the VM trapping to wait for the MMU + 2165 * Caches to be turned off. 
At that point, we'll be able to 2166 * clean the caches again. 2167 */ 2168 if (!(hcr & HCR_TVM)) { 2169 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2170 vcpu_has_cache_enabled(vcpu)); 2171 stage2_flush_vm(vcpu->kvm); 2172 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2173 } 2174} 2175 2176void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2177{ 2178 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2179 2180 /* 2181 * If switching the MMU+caches on, need to invalidate the caches. 2182 * If switching it off, need to clean the caches. 2183 * Clean + invalidate does the trick always. 2184 */ 2185 if (now_enabled != was_enabled) 2186 stage2_flush_vm(vcpu->kvm); 2187 2188 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2189 if (now_enabled) 2190 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2191 2192 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2193}