Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.17-rc1, 2190 lines, 58 kB
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
#include <asm/system_misc.h>

#include "trace.h"

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;

#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
	__kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
	__kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
	__kvm_flush_dcache_pud(pud);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}

/**
 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 * @kvm:	pointer to kvm structure.
 * @addr:	IPA
 * @pmd:	pmd pointer for IPA
 *
 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
 * pages in the range dirty.
108 */ 109static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) 110{ 111 if (!pmd_thp_or_huge(*pmd)) 112 return; 113 114 pmd_clear(pmd); 115 kvm_tlb_flush_vmid_ipa(kvm, addr); 116 put_page(virt_to_page(pmd)); 117} 118 119static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 120 int min, int max) 121{ 122 void *page; 123 124 BUG_ON(max > KVM_NR_MEM_OBJS); 125 if (cache->nobjs >= min) 126 return 0; 127 while (cache->nobjs < max) { 128 page = (void *)__get_free_page(PGALLOC_GFP); 129 if (!page) 130 return -ENOMEM; 131 cache->objects[cache->nobjs++] = page; 132 } 133 return 0; 134} 135 136static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 137{ 138 while (mc->nobjs) 139 free_page((unsigned long)mc->objects[--mc->nobjs]); 140} 141 142static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 143{ 144 void *p; 145 146 BUG_ON(!mc || !mc->nobjs); 147 p = mc->objects[--mc->nobjs]; 148 return p; 149} 150 151static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) 152{ 153 pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); 154 stage2_pgd_clear(pgd); 155 kvm_tlb_flush_vmid_ipa(kvm, addr); 156 stage2_pud_free(pud_table); 157 put_page(virt_to_page(pgd)); 158} 159 160static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) 161{ 162 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); 163 VM_BUG_ON(stage2_pud_huge(*pud)); 164 stage2_pud_clear(pud); 165 kvm_tlb_flush_vmid_ipa(kvm, addr); 166 stage2_pmd_free(pmd_table); 167 put_page(virt_to_page(pud)); 168} 169 170static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) 171{ 172 pte_t *pte_table = pte_offset_kernel(pmd, 0); 173 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 174 pmd_clear(pmd); 175 kvm_tlb_flush_vmid_ipa(kvm, addr); 176 pte_free_kernel(NULL, pte_table); 177 put_page(virt_to_page(pmd)); 178} 179 180/* 181 * Unmapping vs dcache management: 182 * 183 * If a guest maps certain memory pages as uncached, all writes will 184 * bypass the data cache and go directly to RAM. However, the CPUs 185 * can still speculate reads (not writes) and fill cache lines with 186 * data. 187 * 188 * Those cache lines will be *clean* cache lines though, so a 189 * clean+invalidate operation is equivalent to an invalidate 190 * operation, because no cache lines are marked dirty. 191 * 192 * Those clean cache lines could be filled prior to an uncached write 193 * by the guest, and the cache coherent IO subsystem would therefore 194 * end up writing old data to disk. 195 * 196 * This is why right after unmapping a page/section and invalidating 197 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure 198 * the IO subsystem will never hit in the cache. 
199 */ 200static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, 201 phys_addr_t addr, phys_addr_t end) 202{ 203 phys_addr_t start_addr = addr; 204 pte_t *pte, *start_pte; 205 206 start_pte = pte = pte_offset_kernel(pmd, addr); 207 do { 208 if (!pte_none(*pte)) { 209 pte_t old_pte = *pte; 210 211 kvm_set_pte(pte, __pte(0)); 212 kvm_tlb_flush_vmid_ipa(kvm, addr); 213 214 /* No need to invalidate the cache for device mappings */ 215 if (!kvm_is_device_pfn(pte_pfn(old_pte))) 216 kvm_flush_dcache_pte(old_pte); 217 218 put_page(virt_to_page(pte)); 219 } 220 } while (pte++, addr += PAGE_SIZE, addr != end); 221 222 if (stage2_pte_table_empty(start_pte)) 223 clear_stage2_pmd_entry(kvm, pmd, start_addr); 224} 225 226static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, 227 phys_addr_t addr, phys_addr_t end) 228{ 229 phys_addr_t next, start_addr = addr; 230 pmd_t *pmd, *start_pmd; 231 232 start_pmd = pmd = stage2_pmd_offset(pud, addr); 233 do { 234 next = stage2_pmd_addr_end(addr, end); 235 if (!pmd_none(*pmd)) { 236 if (pmd_thp_or_huge(*pmd)) { 237 pmd_t old_pmd = *pmd; 238 239 pmd_clear(pmd); 240 kvm_tlb_flush_vmid_ipa(kvm, addr); 241 242 kvm_flush_dcache_pmd(old_pmd); 243 244 put_page(virt_to_page(pmd)); 245 } else { 246 unmap_stage2_ptes(kvm, pmd, addr, next); 247 } 248 } 249 } while (pmd++, addr = next, addr != end); 250 251 if (stage2_pmd_table_empty(start_pmd)) 252 clear_stage2_pud_entry(kvm, pud, start_addr); 253} 254 255static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, 256 phys_addr_t addr, phys_addr_t end) 257{ 258 phys_addr_t next, start_addr = addr; 259 pud_t *pud, *start_pud; 260 261 start_pud = pud = stage2_pud_offset(pgd, addr); 262 do { 263 next = stage2_pud_addr_end(addr, end); 264 if (!stage2_pud_none(*pud)) { 265 if (stage2_pud_huge(*pud)) { 266 pud_t old_pud = *pud; 267 268 stage2_pud_clear(pud); 269 kvm_tlb_flush_vmid_ipa(kvm, addr); 270 kvm_flush_dcache_pud(old_pud); 271 put_page(virt_to_page(pud)); 272 } else { 273 unmap_stage2_pmds(kvm, pud, addr, next); 274 } 275 } 276 } while (pud++, addr = next, addr != end); 277 278 if (stage2_pud_table_empty(start_pud)) 279 clear_stage2_pgd_entry(kvm, pgd, start_addr); 280} 281 282/** 283 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 284 * @kvm: The VM pointer 285 * @start: The intermediate physical base address of the range to unmap 286 * @size: The size of the area to unmap 287 * 288 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 289 * be called while holding mmu_lock (unless for freeing the stage2 pgd before 290 * destroying the VM), otherwise another faulting VCPU may come in and mess 291 * with things behind our backs. 292 */ 293static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 294{ 295 pgd_t *pgd; 296 phys_addr_t addr = start, end = start + size; 297 phys_addr_t next; 298 299 assert_spin_locked(&kvm->mmu_lock); 300 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 301 do { 302 /* 303 * Make sure the page table is still active, as another thread 304 * could have possibly freed the page table, while we released 305 * the lock. 306 */ 307 if (!READ_ONCE(kvm->arch.pgd)) 308 break; 309 next = stage2_pgd_addr_end(addr, end); 310 if (!stage2_pgd_none(*pgd)) 311 unmap_stage2_puds(kvm, pgd, addr, next); 312 /* 313 * If the range is too large, release the kvm->mmu_lock 314 * to prevent starvation and lockup detector warnings. 
315 */ 316 if (next != end) 317 cond_resched_lock(&kvm->mmu_lock); 318 } while (pgd++, addr = next, addr != end); 319} 320 321static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, 322 phys_addr_t addr, phys_addr_t end) 323{ 324 pte_t *pte; 325 326 pte = pte_offset_kernel(pmd, addr); 327 do { 328 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) 329 kvm_flush_dcache_pte(*pte); 330 } while (pte++, addr += PAGE_SIZE, addr != end); 331} 332 333static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, 334 phys_addr_t addr, phys_addr_t end) 335{ 336 pmd_t *pmd; 337 phys_addr_t next; 338 339 pmd = stage2_pmd_offset(pud, addr); 340 do { 341 next = stage2_pmd_addr_end(addr, end); 342 if (!pmd_none(*pmd)) { 343 if (pmd_thp_or_huge(*pmd)) 344 kvm_flush_dcache_pmd(*pmd); 345 else 346 stage2_flush_ptes(kvm, pmd, addr, next); 347 } 348 } while (pmd++, addr = next, addr != end); 349} 350 351static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, 352 phys_addr_t addr, phys_addr_t end) 353{ 354 pud_t *pud; 355 phys_addr_t next; 356 357 pud = stage2_pud_offset(pgd, addr); 358 do { 359 next = stage2_pud_addr_end(addr, end); 360 if (!stage2_pud_none(*pud)) { 361 if (stage2_pud_huge(*pud)) 362 kvm_flush_dcache_pud(*pud); 363 else 364 stage2_flush_pmds(kvm, pud, addr, next); 365 } 366 } while (pud++, addr = next, addr != end); 367} 368 369static void stage2_flush_memslot(struct kvm *kvm, 370 struct kvm_memory_slot *memslot) 371{ 372 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 373 phys_addr_t end = addr + PAGE_SIZE * memslot->npages; 374 phys_addr_t next; 375 pgd_t *pgd; 376 377 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 378 do { 379 next = stage2_pgd_addr_end(addr, end); 380 stage2_flush_puds(kvm, pgd, addr, next); 381 } while (pgd++, addr = next, addr != end); 382} 383 384/** 385 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 386 * @kvm: The struct kvm pointer 387 * 388 * Go through the stage 2 page tables and invalidate any cache lines 389 * backing memory already mapped to the VM. 
390 */ 391static void stage2_flush_vm(struct kvm *kvm) 392{ 393 struct kvm_memslots *slots; 394 struct kvm_memory_slot *memslot; 395 int idx; 396 397 idx = srcu_read_lock(&kvm->srcu); 398 spin_lock(&kvm->mmu_lock); 399 400 slots = kvm_memslots(kvm); 401 kvm_for_each_memslot(memslot, slots) 402 stage2_flush_memslot(kvm, memslot); 403 404 spin_unlock(&kvm->mmu_lock); 405 srcu_read_unlock(&kvm->srcu, idx); 406} 407 408static void clear_hyp_pgd_entry(pgd_t *pgd) 409{ 410 pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); 411 pgd_clear(pgd); 412 pud_free(NULL, pud_table); 413 put_page(virt_to_page(pgd)); 414} 415 416static void clear_hyp_pud_entry(pud_t *pud) 417{ 418 pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); 419 VM_BUG_ON(pud_huge(*pud)); 420 pud_clear(pud); 421 pmd_free(NULL, pmd_table); 422 put_page(virt_to_page(pud)); 423} 424 425static void clear_hyp_pmd_entry(pmd_t *pmd) 426{ 427 pte_t *pte_table = pte_offset_kernel(pmd, 0); 428 VM_BUG_ON(pmd_thp_or_huge(*pmd)); 429 pmd_clear(pmd); 430 pte_free_kernel(NULL, pte_table); 431 put_page(virt_to_page(pmd)); 432} 433 434static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 435{ 436 pte_t *pte, *start_pte; 437 438 start_pte = pte = pte_offset_kernel(pmd, addr); 439 do { 440 if (!pte_none(*pte)) { 441 kvm_set_pte(pte, __pte(0)); 442 put_page(virt_to_page(pte)); 443 } 444 } while (pte++, addr += PAGE_SIZE, addr != end); 445 446 if (hyp_pte_table_empty(start_pte)) 447 clear_hyp_pmd_entry(pmd); 448} 449 450static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) 451{ 452 phys_addr_t next; 453 pmd_t *pmd, *start_pmd; 454 455 start_pmd = pmd = pmd_offset(pud, addr); 456 do { 457 next = pmd_addr_end(addr, end); 458 /* Hyp doesn't use huge pmds */ 459 if (!pmd_none(*pmd)) 460 unmap_hyp_ptes(pmd, addr, next); 461 } while (pmd++, addr = next, addr != end); 462 463 if (hyp_pmd_table_empty(start_pmd)) 464 clear_hyp_pud_entry(pud); 465} 466 467static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) 468{ 469 phys_addr_t next; 470 pud_t *pud, *start_pud; 471 472 start_pud = pud = pud_offset(pgd, addr); 473 do { 474 next = pud_addr_end(addr, end); 475 /* Hyp doesn't use huge puds */ 476 if (!pud_none(*pud)) 477 unmap_hyp_pmds(pud, addr, next); 478 } while (pud++, addr = next, addr != end); 479 480 if (hyp_pud_table_empty(start_pud)) 481 clear_hyp_pgd_entry(pgd); 482} 483 484static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) 485{ 486 return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); 487} 488 489static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, 490 phys_addr_t start, u64 size) 491{ 492 pgd_t *pgd; 493 phys_addr_t addr = start, end = start + size; 494 phys_addr_t next; 495 496 /* 497 * We don't unmap anything from HYP, except at the hyp tear down. 498 * Hence, we don't have to invalidate the TLBs here. 
499 */ 500 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); 501 do { 502 next = pgd_addr_end(addr, end); 503 if (!pgd_none(*pgd)) 504 unmap_hyp_puds(pgd, addr, next); 505 } while (pgd++, addr = next, addr != end); 506} 507 508static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) 509{ 510 __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); 511} 512 513static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) 514{ 515 __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); 516} 517 518/** 519 * free_hyp_pgds - free Hyp-mode page tables 520 * 521 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and 522 * therefore contains either mappings in the kernel memory area (above 523 * PAGE_OFFSET), or device mappings in the idmap range. 524 * 525 * boot_hyp_pgd should only map the idmap range, and is only used in 526 * the extended idmap case. 527 */ 528void free_hyp_pgds(void) 529{ 530 pgd_t *id_pgd; 531 532 mutex_lock(&kvm_hyp_pgd_mutex); 533 534 id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd; 535 536 if (id_pgd) { 537 /* In case we never called hyp_mmu_init() */ 538 if (!io_map_base) 539 io_map_base = hyp_idmap_start; 540 unmap_hyp_idmap_range(id_pgd, io_map_base, 541 hyp_idmap_start + PAGE_SIZE - io_map_base); 542 } 543 544 if (boot_hyp_pgd) { 545 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); 546 boot_hyp_pgd = NULL; 547 } 548 549 if (hyp_pgd) { 550 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), 551 (uintptr_t)high_memory - PAGE_OFFSET); 552 553 free_pages((unsigned long)hyp_pgd, hyp_pgd_order); 554 hyp_pgd = NULL; 555 } 556 if (merged_hyp_pgd) { 557 clear_page(merged_hyp_pgd); 558 free_page((unsigned long)merged_hyp_pgd); 559 merged_hyp_pgd = NULL; 560 } 561 562 mutex_unlock(&kvm_hyp_pgd_mutex); 563} 564 565static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 566 unsigned long end, unsigned long pfn, 567 pgprot_t prot) 568{ 569 pte_t *pte; 570 unsigned long addr; 571 572 addr = start; 573 do { 574 pte = pte_offset_kernel(pmd, addr); 575 kvm_set_pte(pte, pfn_pte(pfn, prot)); 576 get_page(virt_to_page(pte)); 577 kvm_flush_dcache_to_poc(pte, sizeof(*pte)); 578 pfn++; 579 } while (addr += PAGE_SIZE, addr != end); 580} 581 582static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 583 unsigned long end, unsigned long pfn, 584 pgprot_t prot) 585{ 586 pmd_t *pmd; 587 pte_t *pte; 588 unsigned long addr, next; 589 590 addr = start; 591 do { 592 pmd = pmd_offset(pud, addr); 593 594 BUG_ON(pmd_sect(*pmd)); 595 596 if (pmd_none(*pmd)) { 597 pte = pte_alloc_one_kernel(NULL, addr); 598 if (!pte) { 599 kvm_err("Cannot allocate Hyp pte\n"); 600 return -ENOMEM; 601 } 602 pmd_populate_kernel(NULL, pmd, pte); 603 get_page(virt_to_page(pmd)); 604 kvm_flush_dcache_to_poc(pmd, sizeof(*pmd)); 605 } 606 607 next = pmd_addr_end(addr, end); 608 609 create_hyp_pte_mappings(pmd, addr, next, pfn, prot); 610 pfn += (next - addr) >> PAGE_SHIFT; 611 } while (addr = next, addr != end); 612 613 return 0; 614} 615 616static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, 617 unsigned long end, unsigned long pfn, 618 pgprot_t prot) 619{ 620 pud_t *pud; 621 pmd_t *pmd; 622 unsigned long addr, next; 623 int ret; 624 625 addr = start; 626 do { 627 pud = pud_offset(pgd, addr); 628 629 if (pud_none_or_clear_bad(pud)) { 630 pmd = pmd_alloc_one(NULL, addr); 631 if (!pmd) { 632 kvm_err("Cannot allocate Hyp pmd\n"); 633 return -ENOMEM; 634 } 635 pud_populate(NULL, pud, pmd); 636 get_page(virt_to_page(pud)); 637 
			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
		}

		next = pud_addr_end(addr, end);
		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);

		if (pgd_none(*pgd)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				err = -ENOMEM;
				goto out;
			}
			pgd_populate(NULL, pgd, pud);
			get_page(virt_to_page(pgd));
			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
					    virt_addr, virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr, pgprot_t prot)
{
	pgd_t *pgd = hyp_pgd;
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
761 */ 762 if ((base ^ io_map_base) & BIT(VA_BITS - 1)) 763 ret = -ENOMEM; 764 else 765 io_map_base = base; 766 767 mutex_unlock(&kvm_hyp_pgd_mutex); 768 769 if (ret) 770 goto out; 771 772 if (__kvm_cpu_uses_extended_idmap()) 773 pgd = boot_hyp_pgd; 774 775 ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 776 base, base + size, 777 __phys_to_pfn(phys_addr), prot); 778 if (ret) 779 goto out; 780 781 *haddr = base + offset_in_page(phys_addr); 782 783out: 784 return ret; 785} 786 787/** 788 * create_hyp_io_mappings - Map IO into both kernel and HYP 789 * @phys_addr: The physical start address which gets mapped 790 * @size: Size of the region being mapped 791 * @kaddr: Kernel VA for this mapping 792 * @haddr: HYP VA for this mapping 793 */ 794int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, 795 void __iomem **kaddr, 796 void __iomem **haddr) 797{ 798 unsigned long addr; 799 int ret; 800 801 *kaddr = ioremap(phys_addr, size); 802 if (!*kaddr) 803 return -ENOMEM; 804 805 if (is_kernel_in_hyp_mode()) { 806 *haddr = *kaddr; 807 return 0; 808 } 809 810 ret = __create_hyp_private_mapping(phys_addr, size, 811 &addr, PAGE_HYP_DEVICE); 812 if (ret) { 813 iounmap(*kaddr); 814 *kaddr = NULL; 815 *haddr = NULL; 816 return ret; 817 } 818 819 *haddr = (void __iomem *)addr; 820 return 0; 821} 822 823/** 824 * create_hyp_exec_mappings - Map an executable range into HYP 825 * @phys_addr: The physical start address which gets mapped 826 * @size: Size of the region being mapped 827 * @haddr: HYP VA for this mapping 828 */ 829int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, 830 void **haddr) 831{ 832 unsigned long addr; 833 int ret; 834 835 BUG_ON(is_kernel_in_hyp_mode()); 836 837 ret = __create_hyp_private_mapping(phys_addr, size, 838 &addr, PAGE_HYP_EXEC); 839 if (ret) { 840 *haddr = NULL; 841 return ret; 842 } 843 844 *haddr = (void *)addr; 845 return 0; 846} 847 848/** 849 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. 850 * @kvm: The KVM struct pointer for the VM. 851 * 852 * Allocates only the stage-2 HW PGD level table(s) (can support either full 853 * 40-bit input addresses or limited to 32-bit input addresses). Clears the 854 * allocated pages. 855 * 856 * Note we don't need locking here as this is only called when the VM is 857 * created, which can only be done once. 858 */ 859int kvm_alloc_stage2_pgd(struct kvm *kvm) 860{ 861 pgd_t *pgd; 862 863 if (kvm->arch.pgd != NULL) { 864 kvm_err("kvm_arch already initialized?\n"); 865 return -EINVAL; 866 } 867 868 /* Allocate the HW PGD, making sure that each page gets its own refcount */ 869 pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); 870 if (!pgd) 871 return -ENOMEM; 872 873 kvm->arch.pgd = pgd; 874 return 0; 875} 876 877static void stage2_unmap_memslot(struct kvm *kvm, 878 struct kvm_memory_slot *memslot) 879{ 880 hva_t hva = memslot->userspace_addr; 881 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; 882 phys_addr_t size = PAGE_SIZE * memslot->npages; 883 hva_t reg_end = hva + size; 884 885 /* 886 * A memory region could potentially cover multiple VMAs, and any holes 887 * between them, so iterate over all of them to find out if we should 888 * unmap any of them. 
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	down_read(&current->mm->mmap_sem);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	up_read(&current->mm->mmap_sem);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
	}
	spin_unlock(&kvm->mmu_lock);

	/* Free the HW pgd, one page at a time */
	if (pgd)
		free_pages_exact(pgd, S2_PGD_SIZE);
}

static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
	if (WARN_ON(stage2_pgd_none(*pgd))) {
		if (!cache)
			return NULL;
		pud = mmu_memory_cache_alloc(cache);
		stage2_pgd_populate(pgd, pud);
		get_page(virt_to_page(pgd));
	}

	return stage2_pud_offset(pgd, addr);
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(kvm, cache, addr);
	if (!pud)
		return NULL;

	if (stage2_pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		stage2_pud_populate(pud, pmd);
		get_page(virt_to_page(pud));
	}

	return stage2_pmd_offset(pud, addr);
}

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	/*
	 * Mapping in huge pages should only happen through a fault.
If a 1018 * page is merged into a transparent huge page, the individual 1019 * subpages of that huge page should be unmapped through MMU 1020 * notifiers before we get here. 1021 * 1022 * Merging of CompoundPages is not supported; they should become 1023 * splitting first, unmapped, merged, and mapped back in on-demand. 1024 */ 1025 VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); 1026 1027 old_pmd = *pmd; 1028 if (pmd_present(old_pmd)) { 1029 pmd_clear(pmd); 1030 kvm_tlb_flush_vmid_ipa(kvm, addr); 1031 } else { 1032 get_page(virt_to_page(pmd)); 1033 } 1034 1035 kvm_set_pmd(pmd, *new_pmd); 1036 return 0; 1037} 1038 1039static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) 1040{ 1041 pmd_t *pmdp; 1042 pte_t *ptep; 1043 1044 pmdp = stage2_get_pmd(kvm, NULL, addr); 1045 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) 1046 return false; 1047 1048 if (pmd_thp_or_huge(*pmdp)) 1049 return kvm_s2pmd_exec(pmdp); 1050 1051 ptep = pte_offset_kernel(pmdp, addr); 1052 if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) 1053 return false; 1054 1055 return kvm_s2pte_exec(ptep); 1056} 1057 1058static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 1059 phys_addr_t addr, const pte_t *new_pte, 1060 unsigned long flags) 1061{ 1062 pmd_t *pmd; 1063 pte_t *pte, old_pte; 1064 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; 1065 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; 1066 1067 VM_BUG_ON(logging_active && !cache); 1068 1069 /* Create stage-2 page table mapping - Levels 0 and 1 */ 1070 pmd = stage2_get_pmd(kvm, cache, addr); 1071 if (!pmd) { 1072 /* 1073 * Ignore calls from kvm_set_spte_hva for unallocated 1074 * address ranges. 1075 */ 1076 return 0; 1077 } 1078 1079 /* 1080 * While dirty page logging - dissolve huge PMD, then continue on to 1081 * allocate page. 
1082 */ 1083 if (logging_active) 1084 stage2_dissolve_pmd(kvm, addr, pmd); 1085 1086 /* Create stage-2 page mappings - Level 2 */ 1087 if (pmd_none(*pmd)) { 1088 if (!cache) 1089 return 0; /* ignore calls from kvm_set_spte_hva */ 1090 pte = mmu_memory_cache_alloc(cache); 1091 pmd_populate_kernel(NULL, pmd, pte); 1092 get_page(virt_to_page(pmd)); 1093 } 1094 1095 pte = pte_offset_kernel(pmd, addr); 1096 1097 if (iomap && pte_present(*pte)) 1098 return -EFAULT; 1099 1100 /* Create 2nd stage page table mapping - Level 3 */ 1101 old_pte = *pte; 1102 if (pte_present(old_pte)) { 1103 kvm_set_pte(pte, __pte(0)); 1104 kvm_tlb_flush_vmid_ipa(kvm, addr); 1105 } else { 1106 get_page(virt_to_page(pte)); 1107 } 1108 1109 kvm_set_pte(pte, *new_pte); 1110 return 0; 1111} 1112 1113#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 1114static int stage2_ptep_test_and_clear_young(pte_t *pte) 1115{ 1116 if (pte_young(*pte)) { 1117 *pte = pte_mkold(*pte); 1118 return 1; 1119 } 1120 return 0; 1121} 1122#else 1123static int stage2_ptep_test_and_clear_young(pte_t *pte) 1124{ 1125 return __ptep_test_and_clear_young(pte); 1126} 1127#endif 1128 1129static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) 1130{ 1131 return stage2_ptep_test_and_clear_young((pte_t *)pmd); 1132} 1133 1134/** 1135 * kvm_phys_addr_ioremap - map a device range to guest IPA 1136 * 1137 * @kvm: The KVM pointer 1138 * @guest_ipa: The IPA at which to insert the mapping 1139 * @pa: The physical address of the device 1140 * @size: The size of the mapping 1141 */ 1142int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 1143 phys_addr_t pa, unsigned long size, bool writable) 1144{ 1145 phys_addr_t addr, end; 1146 int ret = 0; 1147 unsigned long pfn; 1148 struct kvm_mmu_memory_cache cache = { 0, }; 1149 1150 end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; 1151 pfn = __phys_to_pfn(pa); 1152 1153 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { 1154 pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); 1155 1156 if (writable) 1157 pte = kvm_s2pte_mkwrite(pte); 1158 1159 ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, 1160 KVM_NR_MEM_OBJS); 1161 if (ret) 1162 goto out; 1163 spin_lock(&kvm->mmu_lock); 1164 ret = stage2_set_pte(kvm, &cache, addr, &pte, 1165 KVM_S2PTE_FLAG_IS_IOMAP); 1166 spin_unlock(&kvm->mmu_lock); 1167 if (ret) 1168 goto out; 1169 1170 pfn++; 1171 } 1172 1173out: 1174 mmu_free_memory_cache(&cache); 1175 return ret; 1176} 1177 1178static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) 1179{ 1180 kvm_pfn_t pfn = *pfnp; 1181 gfn_t gfn = *ipap >> PAGE_SHIFT; 1182 1183 if (PageTransCompoundMap(pfn_to_page(pfn))) { 1184 unsigned long mask; 1185 /* 1186 * The address we faulted on is backed by a transparent huge 1187 * page. However, because we map the compound huge page and 1188 * not the individual tail page, we need to transfer the 1189 * refcount to the head page. We have to be careful that the 1190 * THP doesn't start to split while we are adjusting the 1191 * refcounts. 1192 * 1193 * We are sure this doesn't happen, because mmu_notifier_retry 1194 * was successful and we are holding the mmu_lock, so if this 1195 * THP is trying to split, it will be blocked in the mmu 1196 * notifier before touching any of the pages, specifically 1197 * before being able to call __split_huge_page_refcount(). 1198 * 1199 * We can therefore safely transfer the refcount from PG_tail 1200 * to PG_head and switch the pfn from a tail page to the head 1201 * page accordingly. 
1202 */ 1203 mask = PTRS_PER_PMD - 1; 1204 VM_BUG_ON((gfn & mask) != (pfn & mask)); 1205 if (pfn & mask) { 1206 *ipap &= PMD_MASK; 1207 kvm_release_pfn_clean(pfn); 1208 pfn &= ~mask; 1209 kvm_get_pfn(pfn); 1210 *pfnp = pfn; 1211 } 1212 1213 return true; 1214 } 1215 1216 return false; 1217} 1218 1219static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) 1220{ 1221 if (kvm_vcpu_trap_is_iabt(vcpu)) 1222 return false; 1223 1224 return kvm_vcpu_dabt_iswrite(vcpu); 1225} 1226 1227/** 1228 * stage2_wp_ptes - write protect PMD range 1229 * @pmd: pointer to pmd entry 1230 * @addr: range start address 1231 * @end: range end address 1232 */ 1233static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) 1234{ 1235 pte_t *pte; 1236 1237 pte = pte_offset_kernel(pmd, addr); 1238 do { 1239 if (!pte_none(*pte)) { 1240 if (!kvm_s2pte_readonly(pte)) 1241 kvm_set_s2pte_readonly(pte); 1242 } 1243 } while (pte++, addr += PAGE_SIZE, addr != end); 1244} 1245 1246/** 1247 * stage2_wp_pmds - write protect PUD range 1248 * @pud: pointer to pud entry 1249 * @addr: range start address 1250 * @end: range end address 1251 */ 1252static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) 1253{ 1254 pmd_t *pmd; 1255 phys_addr_t next; 1256 1257 pmd = stage2_pmd_offset(pud, addr); 1258 1259 do { 1260 next = stage2_pmd_addr_end(addr, end); 1261 if (!pmd_none(*pmd)) { 1262 if (pmd_thp_or_huge(*pmd)) { 1263 if (!kvm_s2pmd_readonly(pmd)) 1264 kvm_set_s2pmd_readonly(pmd); 1265 } else { 1266 stage2_wp_ptes(pmd, addr, next); 1267 } 1268 } 1269 } while (pmd++, addr = next, addr != end); 1270} 1271 1272/** 1273 * stage2_wp_puds - write protect PGD range 1274 * @pgd: pointer to pgd entry 1275 * @addr: range start address 1276 * @end: range end address 1277 * 1278 * Process PUD entries, for a huge PUD we cause a panic. 1279 */ 1280static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) 1281{ 1282 pud_t *pud; 1283 phys_addr_t next; 1284 1285 pud = stage2_pud_offset(pgd, addr); 1286 do { 1287 next = stage2_pud_addr_end(addr, end); 1288 if (!stage2_pud_none(*pud)) { 1289 /* TODO:PUD not supported, revisit later if supported */ 1290 BUG_ON(stage2_pud_huge(*pud)); 1291 stage2_wp_pmds(pud, addr, next); 1292 } 1293 } while (pud++, addr = next, addr != end); 1294} 1295 1296/** 1297 * stage2_wp_range() - write protect stage2 memory region range 1298 * @kvm: The KVM pointer 1299 * @addr: Start address of range 1300 * @end: End address of range 1301 */ 1302static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) 1303{ 1304 pgd_t *pgd; 1305 phys_addr_t next; 1306 1307 pgd = kvm->arch.pgd + stage2_pgd_index(addr); 1308 do { 1309 /* 1310 * Release kvm_mmu_lock periodically if the memory region is 1311 * large. Otherwise, we may see kernel panics with 1312 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, 1313 * CONFIG_LOCKDEP. Additionally, holding the lock too long 1314 * will also starve other vCPUs. We have to also make sure 1315 * that the page tables are not freed while we released 1316 * the lock. 
1317 */ 1318 cond_resched_lock(&kvm->mmu_lock); 1319 if (!READ_ONCE(kvm->arch.pgd)) 1320 break; 1321 next = stage2_pgd_addr_end(addr, end); 1322 if (stage2_pgd_present(*pgd)) 1323 stage2_wp_puds(pgd, addr, next); 1324 } while (pgd++, addr = next, addr != end); 1325} 1326 1327/** 1328 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot 1329 * @kvm: The KVM pointer 1330 * @slot: The memory slot to write protect 1331 * 1332 * Called to start logging dirty pages after memory region 1333 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns 1334 * all present PMD and PTEs are write protected in the memory region. 1335 * Afterwards read of dirty page log can be called. 1336 * 1337 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 1338 * serializing operations for VM memory regions. 1339 */ 1340void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 1341{ 1342 struct kvm_memslots *slots = kvm_memslots(kvm); 1343 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); 1344 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; 1345 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1346 1347 spin_lock(&kvm->mmu_lock); 1348 stage2_wp_range(kvm, start, end); 1349 spin_unlock(&kvm->mmu_lock); 1350 kvm_flush_remote_tlbs(kvm); 1351} 1352 1353/** 1354 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages 1355 * @kvm: The KVM pointer 1356 * @slot: The memory slot associated with mask 1357 * @gfn_offset: The gfn offset in memory slot 1358 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 1359 * slot to be write protected 1360 * 1361 * Walks bits set in mask write protects the associated pte's. Caller must 1362 * acquire kvm_mmu_lock. 1363 */ 1364static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1365 struct kvm_memory_slot *slot, 1366 gfn_t gfn_offset, unsigned long mask) 1367{ 1368 phys_addr_t base_gfn = slot->base_gfn + gfn_offset; 1369 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1370 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1371 1372 stage2_wp_range(kvm, start, end); 1373} 1374 1375/* 1376 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1377 * dirty pages. 1378 * 1379 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1380 * enable dirty logging for them. 
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__clean_dcache_guest_page(pfn, size);
}

static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__invalidate_icache_guest_page(pfn, size);
}

static void kvm_send_hwpoison_signal(unsigned long address,
				     struct vm_area_struct *vma)
{
	siginfo_t info;

	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_MCEERR_AR;
	info.si_addr = (void __user *)address;

	if (is_vm_hugetlb_page(vma))
		info.si_addr_lsb = huge_page_shift(hstate_vma(vma));
	else
		info.si_addr_lsb = PAGE_SHIFT;

	send_sig_info(SIGBUS, &info, current);
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	kvm_pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long flags = 0;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}

	if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
		hugetlb = true;
		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
	} else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment for userspace and IPA cannot be mapped using
		 * block descriptors even if the pages belong to a THP for
		 * the process, because the stage-2 block descriptor will
		 * cover more than a single THP and we lose atomicity for
		 * unmapping, updates, and splits of the THP or other pages
		 * in the stage-2 block range.
1463 */ 1464 if ((memslot->userspace_addr & ~PMD_MASK) != 1465 ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) 1466 force_pte = true; 1467 } 1468 up_read(&current->mm->mmap_sem); 1469 1470 /* We need minimum second+third level pages */ 1471 ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, 1472 KVM_NR_MEM_OBJS); 1473 if (ret) 1474 return ret; 1475 1476 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1477 /* 1478 * Ensure the read of mmu_notifier_seq happens before we call 1479 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk 1480 * the page we just got a reference to gets unmapped before we have a 1481 * chance to grab the mmu_lock, which ensure that if the page gets 1482 * unmapped afterwards, the call to kvm_unmap_hva will take it away 1483 * from us again properly. This smp_rmb() interacts with the smp_wmb() 1484 * in kvm_mmu_notifier_invalidate_<page|range_end>. 1485 */ 1486 smp_rmb(); 1487 1488 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 1489 if (pfn == KVM_PFN_ERR_HWPOISON) { 1490 kvm_send_hwpoison_signal(hva, vma); 1491 return 0; 1492 } 1493 if (is_error_noslot_pfn(pfn)) 1494 return -EFAULT; 1495 1496 if (kvm_is_device_pfn(pfn)) { 1497 mem_type = PAGE_S2_DEVICE; 1498 flags |= KVM_S2PTE_FLAG_IS_IOMAP; 1499 } else if (logging_active) { 1500 /* 1501 * Faults on pages in a memslot with logging enabled 1502 * should not be mapped with huge pages (it introduces churn 1503 * and performance degradation), so force a pte mapping. 1504 */ 1505 force_pte = true; 1506 flags |= KVM_S2_FLAG_LOGGING_ACTIVE; 1507 1508 /* 1509 * Only actually map the page as writable if this was a write 1510 * fault. 1511 */ 1512 if (!write_fault) 1513 writable = false; 1514 } 1515 1516 spin_lock(&kvm->mmu_lock); 1517 if (mmu_notifier_retry(kvm, mmu_seq)) 1518 goto out_unlock; 1519 1520 if (!hugetlb && !force_pte) 1521 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); 1522 1523 if (hugetlb) { 1524 pmd_t new_pmd = pfn_pmd(pfn, mem_type); 1525 new_pmd = pmd_mkhuge(new_pmd); 1526 if (writable) { 1527 new_pmd = kvm_s2pmd_mkwrite(new_pmd); 1528 kvm_set_pfn_dirty(pfn); 1529 } 1530 1531 if (fault_status != FSC_PERM) 1532 clean_dcache_guest_page(pfn, PMD_SIZE); 1533 1534 if (exec_fault) { 1535 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1536 invalidate_icache_guest_page(pfn, PMD_SIZE); 1537 } else if (fault_status == FSC_PERM) { 1538 /* Preserve execute if XN was already cleared */ 1539 if (stage2_is_exec(kvm, fault_ipa)) 1540 new_pmd = kvm_s2pmd_mkexec(new_pmd); 1541 } 1542 1543 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1544 } else { 1545 pte_t new_pte = pfn_pte(pfn, mem_type); 1546 1547 if (writable) { 1548 new_pte = kvm_s2pte_mkwrite(new_pte); 1549 kvm_set_pfn_dirty(pfn); 1550 mark_page_dirty(kvm, gfn); 1551 } 1552 1553 if (fault_status != FSC_PERM) 1554 clean_dcache_guest_page(pfn, PAGE_SIZE); 1555 1556 if (exec_fault) { 1557 new_pte = kvm_s2pte_mkexec(new_pte); 1558 invalidate_icache_guest_page(pfn, PAGE_SIZE); 1559 } else if (fault_status == FSC_PERM) { 1560 /* Preserve execute if XN was already cleared */ 1561 if (stage2_is_exec(kvm, fault_ipa)) 1562 new_pte = kvm_s2pte_mkexec(new_pte); 1563 } 1564 1565 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); 1566 } 1567 1568out_unlock: 1569 spin_unlock(&kvm->mmu_lock); 1570 kvm_set_pfn_accessed(pfn); 1571 kvm_release_pfn_clean(pfn); 1572 return ret; 1573} 1574 1575/* 1576 * Resolve the access fault by making the page young again. 
1577 * Note that because the faulting entry is guaranteed not to be 1578 * cached in the TLB, we don't need to invalidate anything. 1579 * Only the HW Access Flag updates are supported for Stage 2 (no DBM), 1580 * so there is no need for atomic (pte|pmd)_mkyoung operations. 1581 */ 1582static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) 1583{ 1584 pmd_t *pmd; 1585 pte_t *pte; 1586 kvm_pfn_t pfn; 1587 bool pfn_valid = false; 1588 1589 trace_kvm_access_fault(fault_ipa); 1590 1591 spin_lock(&vcpu->kvm->mmu_lock); 1592 1593 pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); 1594 if (!pmd || pmd_none(*pmd)) /* Nothing there */ 1595 goto out; 1596 1597 if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ 1598 *pmd = pmd_mkyoung(*pmd); 1599 pfn = pmd_pfn(*pmd); 1600 pfn_valid = true; 1601 goto out; 1602 } 1603 1604 pte = pte_offset_kernel(pmd, fault_ipa); 1605 if (pte_none(*pte)) /* Nothing there either */ 1606 goto out; 1607 1608 *pte = pte_mkyoung(*pte); /* Just a page... */ 1609 pfn = pte_pfn(*pte); 1610 pfn_valid = true; 1611out: 1612 spin_unlock(&vcpu->kvm->mmu_lock); 1613 if (pfn_valid) 1614 kvm_set_pfn_accessed(pfn); 1615} 1616 1617/** 1618 * kvm_handle_guest_abort - handles all 2nd stage aborts 1619 * @vcpu: the VCPU pointer 1620 * @run: the kvm_run structure 1621 * 1622 * Any abort that gets to the host is almost guaranteed to be caused by a 1623 * missing second stage translation table entry, which can mean that either the 1624 * guest simply needs more memory and we must allocate an appropriate page or it 1625 * can mean that the guest tried to access I/O memory, which is emulated by user 1626 * space. The distinction is based on the IPA causing the fault and whether this 1627 * memory region has been registered as standard RAM by user space. 1628 */ 1629int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) 1630{ 1631 unsigned long fault_status; 1632 phys_addr_t fault_ipa; 1633 struct kvm_memory_slot *memslot; 1634 unsigned long hva; 1635 bool is_iabt, write_fault, writable; 1636 gfn_t gfn; 1637 int ret, idx; 1638 1639 fault_status = kvm_vcpu_trap_get_fault_type(vcpu); 1640 1641 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); 1642 is_iabt = kvm_vcpu_trap_is_iabt(vcpu); 1643 1644 /* Synchronous External Abort? */ 1645 if (kvm_vcpu_dabt_isextabt(vcpu)) { 1646 /* 1647 * For RAS the host kernel may handle this abort. 1648 * There is no need to pass the error into the guest. 1649 */ 1650 if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) 1651 return 1; 1652 1653 if (unlikely(!is_iabt)) { 1654 kvm_inject_vabt(vcpu); 1655 return 1; 1656 } 1657 } 1658 1659 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), 1660 kvm_vcpu_get_hfar(vcpu), fault_ipa); 1661 1662 /* Check the stage-2 fault is trans. 
fault or write fault */ 1663 if (fault_status != FSC_FAULT && fault_status != FSC_PERM && 1664 fault_status != FSC_ACCESS) { 1665 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", 1666 kvm_vcpu_trap_get_class(vcpu), 1667 (unsigned long)kvm_vcpu_trap_get_fault(vcpu), 1668 (unsigned long)kvm_vcpu_get_hsr(vcpu)); 1669 return -EFAULT; 1670 } 1671 1672 idx = srcu_read_lock(&vcpu->kvm->srcu); 1673 1674 gfn = fault_ipa >> PAGE_SHIFT; 1675 memslot = gfn_to_memslot(vcpu->kvm, gfn); 1676 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 1677 write_fault = kvm_is_write_fault(vcpu); 1678 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 1679 if (is_iabt) { 1680 /* Prefetch Abort on I/O address */ 1681 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 1682 ret = 1; 1683 goto out_unlock; 1684 } 1685 1686 /* 1687 * Check for a cache maintenance operation. Since we 1688 * ended-up here, we know it is outside of any memory 1689 * slot. But we can't find out if that is for a device, 1690 * or if the guest is just being stupid. The only thing 1691 * we know for sure is that this range cannot be cached. 1692 * 1693 * So let's assume that the guest is just being 1694 * cautious, and skip the instruction. 1695 */ 1696 if (kvm_vcpu_dabt_is_cm(vcpu)) { 1697 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 1698 ret = 1; 1699 goto out_unlock; 1700 } 1701 1702 /* 1703 * The IPA is reported as [MAX:12], so we need to 1704 * complement it with the bottom 12 bits from the 1705 * faulting VA. This is always 12 bits, irrespective 1706 * of the page size. 1707 */ 1708 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); 1709 ret = io_mem_abort(vcpu, run, fault_ipa); 1710 goto out_unlock; 1711 } 1712 1713 /* Userspace should not be able to register out-of-bounds IPAs */ 1714 VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); 1715 1716 if (fault_status == FSC_ACCESS) { 1717 handle_access_fault(vcpu, fault_ipa); 1718 ret = 1; 1719 goto out_unlock; 1720 } 1721 1722 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); 1723 if (ret == 0) 1724 ret = 1; 1725out_unlock: 1726 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1727 return ret; 1728} 1729 1730static int handle_hva_to_gpa(struct kvm *kvm, 1731 unsigned long start, 1732 unsigned long end, 1733 int (*handler)(struct kvm *kvm, 1734 gpa_t gpa, u64 size, 1735 void *data), 1736 void *data) 1737{ 1738 struct kvm_memslots *slots; 1739 struct kvm_memory_slot *memslot; 1740 int ret = 0; 1741 1742 slots = kvm_memslots(kvm); 1743 1744 /* we only care about the pages that the guest sees */ 1745 kvm_for_each_memslot(memslot, slots) { 1746 unsigned long hva_start, hva_end; 1747 gfn_t gpa; 1748 1749 hva_start = max(start, memslot->userspace_addr); 1750 hva_end = min(end, memslot->userspace_addr + 1751 (memslot->npages << PAGE_SHIFT)); 1752 if (hva_start >= hva_end) 1753 continue; 1754 1755 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; 1756 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); 1757 } 1758 1759 return ret; 1760} 1761 1762static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1763{ 1764 unmap_stage2_range(kvm, gpa, size); 1765 return 0; 1766} 1767 1768int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1769{ 1770 unsigned long end = hva + PAGE_SIZE; 1771 1772 if (!kvm->arch.pgd) 1773 return 0; 1774 1775 trace_kvm_unmap_hva(hva); 1776 handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); 1777 return 0; 1778} 1779 1780int kvm_unmap_hva_range(struct kvm *kvm, 1781 unsigned long start, 
unsigned long end) 1782{ 1783 if (!kvm->arch.pgd) 1784 return 0; 1785 1786 trace_kvm_unmap_hva_range(start, end); 1787 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); 1788 return 0; 1789} 1790 1791static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1792{ 1793 pte_t *pte = (pte_t *)data; 1794 1795 WARN_ON(size != PAGE_SIZE); 1796 /* 1797 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE 1798 * flag clear because MMU notifiers will have unmapped a huge PMD before 1799 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and 1800 * therefore stage2_set_pte() never needs to clear out a huge PMD 1801 * through this calling path. 1802 */ 1803 stage2_set_pte(kvm, NULL, gpa, pte, 0); 1804 return 0; 1805} 1806 1807 1808void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1809{ 1810 unsigned long end = hva + PAGE_SIZE; 1811 pte_t stage2_pte; 1812 1813 if (!kvm->arch.pgd) 1814 return; 1815 1816 trace_kvm_set_spte_hva(hva); 1817 stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2); 1818 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); 1819} 1820 1821static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1822{ 1823 pmd_t *pmd; 1824 pte_t *pte; 1825 1826 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); 1827 pmd = stage2_get_pmd(kvm, NULL, gpa); 1828 if (!pmd || pmd_none(*pmd)) /* Nothing there */ 1829 return 0; 1830 1831 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ 1832 return stage2_pmdp_test_and_clear_young(pmd); 1833 1834 pte = pte_offset_kernel(pmd, gpa); 1835 if (pte_none(*pte)) 1836 return 0; 1837 1838 return stage2_ptep_test_and_clear_young(pte); 1839} 1840 1841static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) 1842{ 1843 pmd_t *pmd; 1844 pte_t *pte; 1845 1846 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); 1847 pmd = stage2_get_pmd(kvm, NULL, gpa); 1848 if (!pmd || pmd_none(*pmd)) /* Nothing there */ 1849 return 0; 1850 1851 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ 1852 return pmd_young(*pmd); 1853 1854 pte = pte_offset_kernel(pmd, gpa); 1855 if (!pte_none(*pte)) /* Just a page... 
*/ 1856 return pte_young(*pte); 1857 1858 return 0; 1859} 1860 1861int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 1862{ 1863 if (!kvm->arch.pgd) 1864 return 0; 1865 trace_kvm_age_hva(start, end); 1866 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); 1867} 1868 1869int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1870{ 1871 if (!kvm->arch.pgd) 1872 return 0; 1873 trace_kvm_test_age_hva(hva); 1874 return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); 1875} 1876 1877void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) 1878{ 1879 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); 1880} 1881 1882phys_addr_t kvm_mmu_get_httbr(void) 1883{ 1884 if (__kvm_cpu_uses_extended_idmap()) 1885 return virt_to_phys(merged_hyp_pgd); 1886 else 1887 return virt_to_phys(hyp_pgd); 1888} 1889 1890phys_addr_t kvm_get_idmap_vector(void) 1891{ 1892 return hyp_idmap_vector; 1893} 1894 1895static int kvm_map_idmap_text(pgd_t *pgd) 1896{ 1897 int err; 1898 1899 /* Create the idmap in the boot page tables */ 1900 err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), 1901 hyp_idmap_start, hyp_idmap_end, 1902 __phys_to_pfn(hyp_idmap_start), 1903 PAGE_HYP_EXEC); 1904 if (err) 1905 kvm_err("Failed to idmap %lx-%lx\n", 1906 hyp_idmap_start, hyp_idmap_end); 1907 1908 return err; 1909} 1910 1911int kvm_mmu_init(void) 1912{ 1913 int err; 1914 1915 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); 1916 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); 1917 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); 1918 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE); 1919 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); 1920 1921 /* 1922 * We rely on the linker script to ensure at build time that the HYP 1923 * init code does not cross a page boundary. 1924 */ 1925 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 1926 1927 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 1928 kvm_debug("HYP VA range: %lx:%lx\n", 1929 kern_hyp_va(PAGE_OFFSET), 1930 kern_hyp_va((unsigned long)high_memory - 1)); 1931 1932 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 1933 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) && 1934 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 1935 /* 1936 * The idmap page is intersecting with the VA space, 1937 * it is not safe to continue further. 
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
	if (!hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	if (__kvm_cpu_uses_extended_idmap()) {
		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
							 hyp_pgd_order);
		if (!boot_hyp_pgd) {
			kvm_err("Hyp boot PGD not allocated\n");
			err = -ENOMEM;
			goto out;
		}

		err = kvm_map_idmap_text(boot_hyp_pgd);
		if (err)
			goto out;

		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		if (!merged_hyp_pgd) {
			kvm_err("Failed to allocate extra HYP pgd\n");
			goto out;
		}
		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
				    hyp_idmap_start);
	} else {
		err = kvm_map_idmap_text(hyp_pgd);
		if (err)
			goto out;
	}

	io_map_base = hyp_idmap_start;
	return 0;
out:
	free_hyp_pgds();
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   const struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
		kvm_mmu_wp_memory_region(kvm, mem->slot);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
			change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
		return -EFAULT;

	down_read(&current->mm->mmap_sem);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
2044 */ 2045 if (writable && !(vma->vm_flags & VM_WRITE)) { 2046 ret = -EPERM; 2047 break; 2048 } 2049 2050 /* 2051 * Take the intersection of this VMA with the memory region 2052 */ 2053 vm_start = max(hva, vma->vm_start); 2054 vm_end = min(reg_end, vma->vm_end); 2055 2056 if (vma->vm_flags & VM_PFNMAP) { 2057 gpa_t gpa = mem->guest_phys_addr + 2058 (vm_start - mem->userspace_addr); 2059 phys_addr_t pa; 2060 2061 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; 2062 pa += vm_start - vma->vm_start; 2063 2064 /* IO region dirty page logging not allowed */ 2065 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) { 2066 ret = -EINVAL; 2067 goto out; 2068 } 2069 2070 ret = kvm_phys_addr_ioremap(kvm, gpa, pa, 2071 vm_end - vm_start, 2072 writable); 2073 if (ret) 2074 break; 2075 } 2076 hva = vm_end; 2077 } while (hva < reg_end); 2078 2079 if (change == KVM_MR_FLAGS_ONLY) 2080 goto out; 2081 2082 spin_lock(&kvm->mmu_lock); 2083 if (ret) 2084 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); 2085 else 2086 stage2_flush_memslot(kvm, memslot); 2087 spin_unlock(&kvm->mmu_lock); 2088out: 2089 up_read(&current->mm->mmap_sem); 2090 return ret; 2091} 2092 2093void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 2094 struct kvm_memory_slot *dont) 2095{ 2096} 2097 2098int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, 2099 unsigned long npages) 2100{ 2101 return 0; 2102} 2103 2104void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) 2105{ 2106} 2107 2108void kvm_arch_flush_shadow_all(struct kvm *kvm) 2109{ 2110 kvm_free_stage2_pgd(kvm); 2111} 2112 2113void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 2114 struct kvm_memory_slot *slot) 2115{ 2116 gpa_t gpa = slot->base_gfn << PAGE_SHIFT; 2117 phys_addr_t size = slot->npages << PAGE_SHIFT; 2118 2119 spin_lock(&kvm->mmu_lock); 2120 unmap_stage2_range(kvm, gpa, size); 2121 spin_unlock(&kvm->mmu_lock); 2122} 2123 2124/* 2125 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). 2126 * 2127 * Main problems: 2128 * - S/W ops are local to a CPU (not broadcast) 2129 * - We have line migration behind our back (speculation) 2130 * - System caches don't support S/W at all (damn!) 2131 * 2132 * In the face of the above, the best we can do is to try and convert 2133 * S/W ops to VA ops. Because the guest is not allowed to infer the 2134 * S/W to PA mapping, it can only use S/W to nuke the whole cache, 2135 * which is a rather good thing for us. 2136 * 2137 * Also, it is only used when turning caches on/off ("The expected 2138 * usage of the cache maintenance instructions that operate by set/way 2139 * is associated with the cache maintenance instructions associated 2140 * with the powerdown and powerup of caches, if this is required by 2141 * the implementation."). 2142 * 2143 * We use the following policy: 2144 * 2145 * - If we trap a S/W operation, we enable VM trapping to detect 2146 * caches being turned on/off, and do a full clean. 2147 * 2148 * - We flush the caches on both caches being turned on and off. 2149 * 2150 * - Once the caches are enabled, we stop trapping VM ops. 2151 */ 2152void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2153{ 2154 unsigned long hcr = *vcpu_hcr(vcpu); 2155 2156 /* 2157 * If this is the first time we do a S/W operation 2158 * (i.e. HCR_TVM not set) flush the whole memory, and set the 2159 * VM trapping. 2160 * 2161 * Otherwise, rely on the VM trapping to wait for the MMU + 2162 * Caches to be turned off. 
At that point, we'll be able to 2163 * clean the caches again. 2164 */ 2165 if (!(hcr & HCR_TVM)) { 2166 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2167 vcpu_has_cache_enabled(vcpu)); 2168 stage2_flush_vm(vcpu->kvm); 2169 *vcpu_hcr(vcpu) = hcr | HCR_TVM; 2170 } 2171} 2172 2173void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) 2174{ 2175 bool now_enabled = vcpu_has_cache_enabled(vcpu); 2176 2177 /* 2178 * If switching the MMU+caches on, need to invalidate the caches. 2179 * If switching it off, need to clean the caches. 2180 * Clean + invalidate does the trick always. 2181 */ 2182 if (now_enabled != was_enabled) 2183 stage2_flush_vm(vcpu->kvm); 2184 2185 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2186 if (now_enabled) 2187 *vcpu_hcr(vcpu) &= ~HCR_TVM; 2188 2189 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2190}
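
The stage-2 code above is driven entirely by user space: a VMM registers guest memory with the KVM_SET_USER_MEMORY_REGION ioctl (which lands in kvm_arch_prepare_memory_region() above), and later guest accesses to that memory fault into kvm_handle_guest_abort() and user_mem_abort(). The snippet below is a minimal, self-contained sketch of that first step, not part of mmu.c; the slot number, the 0x10000 guest address and the 2 MiB size are arbitrary example values, and error handling is reduced to perror().

/* Illustrative user-space sketch: register one RAM memslot with KVM. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	/* Anonymous memory backing the guest RAM slot. */
	size_t size = 2 * 1024 * 1024;
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = 0,			/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0x10000,	/* guest IPA of the slot */
		.memory_size = size,
		.userspace_addr = (unsigned long)mem,
	};

	/* This call is what reaches kvm_arch_prepare_memory_region(). */
	if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		perror("KVM_SET_USER_MEMORY_REGION");
		return 1;
	}

	printf("memslot registered: IPA 0x%llx, %zu bytes\n",
	       (unsigned long long)region.guest_phys_addr, size);
	return 0;
}

The first time a vCPU touches that range, no stage-2 translation exists yet, so the access traps to the host and user_mem_abort() populates the stage-2 tables; setting KVM_MEM_LOG_DIRTY_PAGES instead makes the write-protection paths (kvm_mmu_wp_memory_region() and friends) take over.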