arch/arm/kvm/mmu.c at v4.3-rc7

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / arch / arm / kvm / mmu.c
at v4.3-rc7 1930 lines 52 kB view raw
wrap content
   1/*
   2 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   3 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License, version 2, as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17 */
  18
  19#include <linux/mman.h>
  20#include <linux/kvm_host.h>
  21#include <linux/io.h>
  22#include <linux/hugetlb.h>
  23#include <trace/events/kvm.h>
  24#include <asm/pgalloc.h>
  25#include <asm/cacheflush.h>
  26#include <asm/kvm_arm.h>
  27#include <asm/kvm_mmu.h>
  28#include <asm/kvm_mmio.h>
  29#include <asm/kvm_asm.h>
  30#include <asm/kvm_emulate.h>
  31
  32#include "trace.h"
  33
  34extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
  35
  36static pgd_t *boot_hyp_pgd;
  37static pgd_t *hyp_pgd;
  38static pgd_t *merged_hyp_pgd;
  39static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
  40
  41static unsigned long hyp_idmap_start;
  42static unsigned long hyp_idmap_end;
  43static phys_addr_t hyp_idmap_vector;
  44
  45#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
  46
  47#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
  48#define kvm_pud_huge(_x)	pud_huge(_x)
  49
  50#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
  51#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
  52
  53static bool memslot_is_logging(struct kvm_memory_slot *memslot)
  54{
  55	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
  56}
  57
  58/**
  59 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
  60 * @kvm:	pointer to kvm structure.
  61 *
  62 * Interface to HYP function to flush all VM TLB entries
  63 */
  64void kvm_flush_remote_tlbs(struct kvm *kvm)
  65{
  66	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
  67}
  68
  69static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
  70{
  71	/*
  72	 * This function also gets called when dealing with HYP page
  73	 * tables. As HYP doesn't have an associated struct kvm (and
  74	 * the HYP page tables are fairly static), we don't do
  75	 * anything there.
  76	 */
  77	if (kvm)
  78		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
  79}
  80
  81/*
  82 * D-Cache management functions. They take the page table entries by
  83 * value, as they are flushing the cache using the kernel mapping (or
  84 * kmap on 32bit).
  85 */
  86static void kvm_flush_dcache_pte(pte_t pte)
  87{
  88	__kvm_flush_dcache_pte(pte);
  89}
  90
  91static void kvm_flush_dcache_pmd(pmd_t pmd)
  92{
  93	__kvm_flush_dcache_pmd(pmd);
  94}
  95
  96static void kvm_flush_dcache_pud(pud_t pud)
  97{
  98	__kvm_flush_dcache_pud(pud);
  99}
 100
 101/**
 102 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 103 * @kvm:	pointer to kvm structure.
 104 * @addr:	IPA
 105 * @pmd:	pmd pointer for IPA
 106 *
 107 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
 108 * pages in the range dirty.
 109 */
 110static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 111{
 112	if (!kvm_pmd_huge(*pmd))
 113		return;
 114
 115	pmd_clear(pmd);
 116	kvm_tlb_flush_vmid_ipa(kvm, addr);
 117	put_page(virt_to_page(pmd));
 118}
 119
 120static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 121				  int min, int max)
 122{
 123	void *page;
 124
 125	BUG_ON(max > KVM_NR_MEM_OBJS);
 126	if (cache->nobjs >= min)
 127		return 0;
 128	while (cache->nobjs < max) {
 129		page = (void *)__get_free_page(PGALLOC_GFP);
 130		if (!page)
 131			return -ENOMEM;
 132		cache->objects[cache->nobjs++] = page;
 133	}
 134	return 0;
 135}
 136
 137static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 138{
 139	while (mc->nobjs)
 140		free_page((unsigned long)mc->objects[--mc->nobjs]);
 141}
 142
 143static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 144{
 145	void *p;
 146
 147	BUG_ON(!mc || !mc->nobjs);
 148	p = mc->objects[--mc->nobjs];
 149	return p;
 150}
 151
 152static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 153{
 154	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
 155	pgd_clear(pgd);
 156	kvm_tlb_flush_vmid_ipa(kvm, addr);
 157	pud_free(NULL, pud_table);
 158	put_page(virt_to_page(pgd));
 159}
 160
 161static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 162{
 163	pmd_t *pmd_table = pmd_offset(pud, 0);
 164	VM_BUG_ON(pud_huge(*pud));
 165	pud_clear(pud);
 166	kvm_tlb_flush_vmid_ipa(kvm, addr);
 167	pmd_free(NULL, pmd_table);
 168	put_page(virt_to_page(pud));
 169}
 170
 171static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 172{
 173	pte_t *pte_table = pte_offset_kernel(pmd, 0);
 174	VM_BUG_ON(kvm_pmd_huge(*pmd));
 175	pmd_clear(pmd);
 176	kvm_tlb_flush_vmid_ipa(kvm, addr);
 177	pte_free_kernel(NULL, pte_table);
 178	put_page(virt_to_page(pmd));
 179}
 180
 181/*
 182 * Unmapping vs dcache management:
 183 *
 184 * If a guest maps certain memory pages as uncached, all writes will
 185 * bypass the data cache and go directly to RAM.  However, the CPUs
 186 * can still speculate reads (not writes) and fill cache lines with
 187 * data.
 188 *
 189 * Those cache lines will be *clean* cache lines though, so a
 190 * clean+invalidate operation is equivalent to an invalidate
 191 * operation, because no cache lines are marked dirty.
 192 *
 193 * Those clean cache lines could be filled prior to an uncached write
 194 * by the guest, and the cache coherent IO subsystem would therefore
 195 * end up writing old data to disk.
 196 *
 197 * This is why right after unmapping a page/section and invalidating
 198 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 199 * the IO subsystem will never hit in the cache.
 200 */
 201static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
 202		       phys_addr_t addr, phys_addr_t end)
 203{
 204	phys_addr_t start_addr = addr;
 205	pte_t *pte, *start_pte;
 206
 207	start_pte = pte = pte_offset_kernel(pmd, addr);
 208	do {
 209		if (!pte_none(*pte)) {
 210			pte_t old_pte = *pte;
 211
 212			kvm_set_pte(pte, __pte(0));
 213			kvm_tlb_flush_vmid_ipa(kvm, addr);
 214
 215			/* No need to invalidate the cache for device mappings */
 216			if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
 217				kvm_flush_dcache_pte(old_pte);
 218
 219			put_page(virt_to_page(pte));
 220		}
 221	} while (pte++, addr += PAGE_SIZE, addr != end);
 222
 223	if (kvm_pte_table_empty(kvm, start_pte))
 224		clear_pmd_entry(kvm, pmd, start_addr);
 225}
 226
 227static void unmap_pmds(struct kvm *kvm, pud_t *pud,
 228		       phys_addr_t addr, phys_addr_t end)
 229{
 230	phys_addr_t next, start_addr = addr;
 231	pmd_t *pmd, *start_pmd;
 232
 233	start_pmd = pmd = pmd_offset(pud, addr);
 234	do {
 235		next = kvm_pmd_addr_end(addr, end);
 236		if (!pmd_none(*pmd)) {
 237			if (kvm_pmd_huge(*pmd)) {
 238				pmd_t old_pmd = *pmd;
 239
 240				pmd_clear(pmd);
 241				kvm_tlb_flush_vmid_ipa(kvm, addr);
 242
 243				kvm_flush_dcache_pmd(old_pmd);
 244
 245				put_page(virt_to_page(pmd));
 246			} else {
 247				unmap_ptes(kvm, pmd, addr, next);
 248			}
 249		}
 250	} while (pmd++, addr = next, addr != end);
 251
 252	if (kvm_pmd_table_empty(kvm, start_pmd))
 253		clear_pud_entry(kvm, pud, start_addr);
 254}
 255
 256static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
 257		       phys_addr_t addr, phys_addr_t end)
 258{
 259	phys_addr_t next, start_addr = addr;
 260	pud_t *pud, *start_pud;
 261
 262	start_pud = pud = pud_offset(pgd, addr);
 263	do {
 264		next = kvm_pud_addr_end(addr, end);
 265		if (!pud_none(*pud)) {
 266			if (pud_huge(*pud)) {
 267				pud_t old_pud = *pud;
 268
 269				pud_clear(pud);
 270				kvm_tlb_flush_vmid_ipa(kvm, addr);
 271
 272				kvm_flush_dcache_pud(old_pud);
 273
 274				put_page(virt_to_page(pud));
 275			} else {
 276				unmap_pmds(kvm, pud, addr, next);
 277			}
 278		}
 279	} while (pud++, addr = next, addr != end);
 280
 281	if (kvm_pud_table_empty(kvm, start_pud))
 282		clear_pgd_entry(kvm, pgd, start_addr);
 283}
 284
 285
 286static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
 287			phys_addr_t start, u64 size)
 288{
 289	pgd_t *pgd;
 290	phys_addr_t addr = start, end = start + size;
 291	phys_addr_t next;
 292
 293	pgd = pgdp + kvm_pgd_index(addr);
 294	do {
 295		next = kvm_pgd_addr_end(addr, end);
 296		if (!pgd_none(*pgd))
 297			unmap_puds(kvm, pgd, addr, next);
 298	} while (pgd++, addr = next, addr != end);
 299}
 300
 301static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
 302			      phys_addr_t addr, phys_addr_t end)
 303{
 304	pte_t *pte;
 305
 306	pte = pte_offset_kernel(pmd, addr);
 307	do {
 308		if (!pte_none(*pte) &&
 309		    (pte_val(*pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
 310			kvm_flush_dcache_pte(*pte);
 311	} while (pte++, addr += PAGE_SIZE, addr != end);
 312}
 313
 314static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 315			      phys_addr_t addr, phys_addr_t end)
 316{
 317	pmd_t *pmd;
 318	phys_addr_t next;
 319
 320	pmd = pmd_offset(pud, addr);
 321	do {
 322		next = kvm_pmd_addr_end(addr, end);
 323		if (!pmd_none(*pmd)) {
 324			if (kvm_pmd_huge(*pmd))
 325				kvm_flush_dcache_pmd(*pmd);
 326			else
 327				stage2_flush_ptes(kvm, pmd, addr, next);
 328		}
 329	} while (pmd++, addr = next, addr != end);
 330}
 331
 332static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
 333			      phys_addr_t addr, phys_addr_t end)
 334{
 335	pud_t *pud;
 336	phys_addr_t next;
 337
 338	pud = pud_offset(pgd, addr);
 339	do {
 340		next = kvm_pud_addr_end(addr, end);
 341		if (!pud_none(*pud)) {
 342			if (pud_huge(*pud))
 343				kvm_flush_dcache_pud(*pud);
 344			else
 345				stage2_flush_pmds(kvm, pud, addr, next);
 346		}
 347	} while (pud++, addr = next, addr != end);
 348}
 349
 350static void stage2_flush_memslot(struct kvm *kvm,
 351				 struct kvm_memory_slot *memslot)
 352{
 353	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 354	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
 355	phys_addr_t next;
 356	pgd_t *pgd;
 357
 358	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
 359	do {
 360		next = kvm_pgd_addr_end(addr, end);
 361		stage2_flush_puds(kvm, pgd, addr, next);
 362	} while (pgd++, addr = next, addr != end);
 363}
 364
 365/**
 366 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 367 * @kvm: The struct kvm pointer
 368 *
 369 * Go through the stage 2 page tables and invalidate any cache lines
 370 * backing memory already mapped to the VM.
 371 */
 372static void stage2_flush_vm(struct kvm *kvm)
 373{
 374	struct kvm_memslots *slots;
 375	struct kvm_memory_slot *memslot;
 376	int idx;
 377
 378	idx = srcu_read_lock(&kvm->srcu);
 379	spin_lock(&kvm->mmu_lock);
 380
 381	slots = kvm_memslots(kvm);
 382	kvm_for_each_memslot(memslot, slots)
 383		stage2_flush_memslot(kvm, memslot);
 384
 385	spin_unlock(&kvm->mmu_lock);
 386	srcu_read_unlock(&kvm->srcu, idx);
 387}
 388
 389/**
 390 * free_boot_hyp_pgd - free HYP boot page tables
 391 *
 392 * Free the HYP boot page tables. The bounce page is also freed.
 393 */
 394void free_boot_hyp_pgd(void)
 395{
 396	mutex_lock(&kvm_hyp_pgd_mutex);
 397
 398	if (boot_hyp_pgd) {
 399		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
 400		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 401		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 402		boot_hyp_pgd = NULL;
 403	}
 404
 405	if (hyp_pgd)
 406		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 407
 408	mutex_unlock(&kvm_hyp_pgd_mutex);
 409}
 410
 411/**
 412 * free_hyp_pgds - free Hyp-mode page tables
 413 *
 414 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 415 * therefore contains either mappings in the kernel memory area (above
 416 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 417 * VMALLOC_START to VMALLOC_END).
 418 *
 419 * boot_hyp_pgd should only map two pages for the init code.
 420 */
 421void free_hyp_pgds(void)
 422{
 423	unsigned long addr;
 424
 425	free_boot_hyp_pgd();
 426
 427	mutex_lock(&kvm_hyp_pgd_mutex);
 428
 429	if (hyp_pgd) {
 430		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
 431			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 432		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
 433			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 434
 435		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 436		hyp_pgd = NULL;
 437	}
 438	if (merged_hyp_pgd) {
 439		clear_page(merged_hyp_pgd);
 440		free_page((unsigned long)merged_hyp_pgd);
 441		merged_hyp_pgd = NULL;
 442	}
 443
 444	mutex_unlock(&kvm_hyp_pgd_mutex);
 445}
 446
 447static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
 448				    unsigned long end, unsigned long pfn,
 449				    pgprot_t prot)
 450{
 451	pte_t *pte;
 452	unsigned long addr;
 453
 454	addr = start;
 455	do {
 456		pte = pte_offset_kernel(pmd, addr);
 457		kvm_set_pte(pte, pfn_pte(pfn, prot));
 458		get_page(virt_to_page(pte));
 459		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
 460		pfn++;
 461	} while (addr += PAGE_SIZE, addr != end);
 462}
 463
 464static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
 465				   unsigned long end, unsigned long pfn,
 466				   pgprot_t prot)
 467{
 468	pmd_t *pmd;
 469	pte_t *pte;
 470	unsigned long addr, next;
 471
 472	addr = start;
 473	do {
 474		pmd = pmd_offset(pud, addr);
 475
 476		BUG_ON(pmd_sect(*pmd));
 477
 478		if (pmd_none(*pmd)) {
 479			pte = pte_alloc_one_kernel(NULL, addr);
 480			if (!pte) {
 481				kvm_err("Cannot allocate Hyp pte\n");
 482				return -ENOMEM;
 483			}
 484			pmd_populate_kernel(NULL, pmd, pte);
 485			get_page(virt_to_page(pmd));
 486			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
 487		}
 488
 489		next = pmd_addr_end(addr, end);
 490
 491		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
 492		pfn += (next - addr) >> PAGE_SHIFT;
 493	} while (addr = next, addr != end);
 494
 495	return 0;
 496}
 497
 498static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
 499				   unsigned long end, unsigned long pfn,
 500				   pgprot_t prot)
 501{
 502	pud_t *pud;
 503	pmd_t *pmd;
 504	unsigned long addr, next;
 505	int ret;
 506
 507	addr = start;
 508	do {
 509		pud = pud_offset(pgd, addr);
 510
 511		if (pud_none_or_clear_bad(pud)) {
 512			pmd = pmd_alloc_one(NULL, addr);
 513			if (!pmd) {
 514				kvm_err("Cannot allocate Hyp pmd\n");
 515				return -ENOMEM;
 516			}
 517			pud_populate(NULL, pud, pmd);
 518			get_page(virt_to_page(pud));
 519			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
 520		}
 521
 522		next = pud_addr_end(addr, end);
 523		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
 524		if (ret)
 525			return ret;
 526		pfn += (next - addr) >> PAGE_SHIFT;
 527	} while (addr = next, addr != end);
 528
 529	return 0;
 530}
 531
 532static int __create_hyp_mappings(pgd_t *pgdp,
 533				 unsigned long start, unsigned long end,
 534				 unsigned long pfn, pgprot_t prot)
 535{
 536	pgd_t *pgd;
 537	pud_t *pud;
 538	unsigned long addr, next;
 539	int err = 0;
 540
 541	mutex_lock(&kvm_hyp_pgd_mutex);
 542	addr = start & PAGE_MASK;
 543	end = PAGE_ALIGN(end);
 544	do {
 545		pgd = pgdp + pgd_index(addr);
 546
 547		if (pgd_none(*pgd)) {
 548			pud = pud_alloc_one(NULL, addr);
 549			if (!pud) {
 550				kvm_err("Cannot allocate Hyp pud\n");
 551				err = -ENOMEM;
 552				goto out;
 553			}
 554			pgd_populate(NULL, pgd, pud);
 555			get_page(virt_to_page(pgd));
 556			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
 557		}
 558
 559		next = pgd_addr_end(addr, end);
 560		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
 561		if (err)
 562			goto out;
 563		pfn += (next - addr) >> PAGE_SHIFT;
 564	} while (addr = next, addr != end);
 565out:
 566	mutex_unlock(&kvm_hyp_pgd_mutex);
 567	return err;
 568}
 569
 570static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
 571{
 572	if (!is_vmalloc_addr(kaddr)) {
 573		BUG_ON(!virt_addr_valid(kaddr));
 574		return __pa(kaddr);
 575	} else {
 576		return page_to_phys(vmalloc_to_page(kaddr)) +
 577		       offset_in_page(kaddr);
 578	}
 579}
 580
 581/**
 582 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 583 * @from:	The virtual kernel start address of the range
 584 * @to:		The virtual kernel end address of the range (exclusive)
 585 *
 586 * The same virtual address as the kernel virtual address is also used
 587 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 588 * physical pages.
 589 */
 590int create_hyp_mappings(void *from, void *to)
 591{
 592	phys_addr_t phys_addr;
 593	unsigned long virt_addr;
 594	unsigned long start = KERN_TO_HYP((unsigned long)from);
 595	unsigned long end = KERN_TO_HYP((unsigned long)to);
 596
 597	start = start & PAGE_MASK;
 598	end = PAGE_ALIGN(end);
 599
 600	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
 601		int err;
 602
 603		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
 604		err = __create_hyp_mappings(hyp_pgd, virt_addr,
 605					    virt_addr + PAGE_SIZE,
 606					    __phys_to_pfn(phys_addr),
 607					    PAGE_HYP);
 608		if (err)
 609			return err;
 610	}
 611
 612	return 0;
 613}
 614
 615/**
 616 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 617 * @from:	The kernel start VA of the range
 618 * @to:		The kernel end VA of the range (exclusive)
 619 * @phys_addr:	The physical start address which gets mapped
 620 *
 621 * The resulting HYP VA is the same as the kernel VA, modulo
 622 * HYP_PAGE_OFFSET.
 623 */
 624int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 625{
 626	unsigned long start = KERN_TO_HYP((unsigned long)from);
 627	unsigned long end = KERN_TO_HYP((unsigned long)to);
 628
 629	/* Check for a valid kernel IO mapping */
 630	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
 631		return -EINVAL;
 632
 633	return __create_hyp_mappings(hyp_pgd, start, end,
 634				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 635}
 636
 637/* Free the HW pgd, one page at a time */
 638static void kvm_free_hwpgd(void *hwpgd)
 639{
 640	free_pages_exact(hwpgd, kvm_get_hwpgd_size());
 641}
 642
 643/* Allocate the HW PGD, making sure that each page gets its own refcount */
 644static void *kvm_alloc_hwpgd(void)
 645{
 646	unsigned int size = kvm_get_hwpgd_size();
 647
 648	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
 649}
 650
 651/**
 652 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 653 * @kvm:	The KVM struct pointer for the VM.
 654 *
 655 * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
 656 * support either full 40-bit input addresses or limited to 32-bit input
 657 * addresses). Clears the allocated pages.
 658 *
 659 * Note we don't need locking here as this is only called when the VM is
 660 * created, which can only be done once.
 661 */
 662int kvm_alloc_stage2_pgd(struct kvm *kvm)
 663{
 664	pgd_t *pgd;
 665	void *hwpgd;
 666
 667	if (kvm->arch.pgd != NULL) {
 668		kvm_err("kvm_arch already initialized?\n");
 669		return -EINVAL;
 670	}
 671
 672	hwpgd = kvm_alloc_hwpgd();
 673	if (!hwpgd)
 674		return -ENOMEM;
 675
 676	/* When the kernel uses more levels of page tables than the
 677	 * guest, we allocate a fake PGD and pre-populate it to point
 678	 * to the next-level page table, which will be the real
 679	 * initial page table pointed to by the VTTBR.
 680	 *
 681	 * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
 682	 * the PMD and the kernel will use folded pud.
 683	 * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
 684	 * pages.
 685	 */
 686	if (KVM_PREALLOC_LEVEL > 0) {
 687		int i;
 688
 689		/*
 690		 * Allocate fake pgd for the page table manipulation macros to
 691		 * work.  This is not used by the hardware and we have no
 692		 * alignment requirement for this allocation.
 693		 */
 694		pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
 695				GFP_KERNEL | __GFP_ZERO);
 696
 697		if (!pgd) {
 698			kvm_free_hwpgd(hwpgd);
 699			return -ENOMEM;
 700		}
 701
 702		/* Plug the HW PGD into the fake one. */
 703		for (i = 0; i < PTRS_PER_S2_PGD; i++) {
 704			if (KVM_PREALLOC_LEVEL == 1)
 705				pgd_populate(NULL, pgd + i,
 706					     (pud_t *)hwpgd + i * PTRS_PER_PUD);
 707			else if (KVM_PREALLOC_LEVEL == 2)
 708				pud_populate(NULL, pud_offset(pgd, 0) + i,
 709					     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
 710		}
 711	} else {
 712		/*
 713		 * Allocate actual first-level Stage-2 page table used by the
 714		 * hardware for Stage-2 page table walks.
 715		 */
 716		pgd = (pgd_t *)hwpgd;
 717	}
 718
 719	kvm_clean_pgd(pgd);
 720	kvm->arch.pgd = pgd;
 721	return 0;
 722}
 723
 724/**
 725 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 726 * @kvm:   The VM pointer
 727 * @start: The intermediate physical base address of the range to unmap
 728 * @size:  The size of the area to unmap
 729 *
 730 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 731 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 732 * destroying the VM), otherwise another faulting VCPU may come in and mess
 733 * with things behind our backs.
 734 */
 735static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 736{
 737	unmap_range(kvm, kvm->arch.pgd, start, size);
 738}
 739
 740static void stage2_unmap_memslot(struct kvm *kvm,
 741				 struct kvm_memory_slot *memslot)
 742{
 743	hva_t hva = memslot->userspace_addr;
 744	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 745	phys_addr_t size = PAGE_SIZE * memslot->npages;
 746	hva_t reg_end = hva + size;
 747
 748	/*
 749	 * A memory region could potentially cover multiple VMAs, and any holes
 750	 * between them, so iterate over all of them to find out if we should
 751	 * unmap any of them.
 752	 *
 753	 *     +--------------------------------------------+
 754	 * +---------------+----------------+   +----------------+
 755	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
 756	 * +---------------+----------------+   +----------------+
 757	 *     |               memory region                |
 758	 *     +--------------------------------------------+
 759	 */
 760	do {
 761		struct vm_area_struct *vma = find_vma(current->mm, hva);
 762		hva_t vm_start, vm_end;
 763
 764		if (!vma || vma->vm_start >= reg_end)
 765			break;
 766
 767		/*
 768		 * Take the intersection of this VMA with the memory region
 769		 */
 770		vm_start = max(hva, vma->vm_start);
 771		vm_end = min(reg_end, vma->vm_end);
 772
 773		if (!(vma->vm_flags & VM_PFNMAP)) {
 774			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
 775			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
 776		}
 777		hva = vm_end;
 778	} while (hva < reg_end);
 779}
 780
 781/**
 782 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 783 * @kvm: The struct kvm pointer
 784 *
 785 * Go through the memregions and unmap any reguler RAM
 786 * backing memory already mapped to the VM.
 787 */
 788void stage2_unmap_vm(struct kvm *kvm)
 789{
 790	struct kvm_memslots *slots;
 791	struct kvm_memory_slot *memslot;
 792	int idx;
 793
 794	idx = srcu_read_lock(&kvm->srcu);
 795	spin_lock(&kvm->mmu_lock);
 796
 797	slots = kvm_memslots(kvm);
 798	kvm_for_each_memslot(memslot, slots)
 799		stage2_unmap_memslot(kvm, memslot);
 800
 801	spin_unlock(&kvm->mmu_lock);
 802	srcu_read_unlock(&kvm->srcu, idx);
 803}
 804
 805/**
 806 * kvm_free_stage2_pgd - free all stage-2 tables
 807 * @kvm:	The KVM struct pointer for the VM.
 808 *
 809 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 810 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 811 * and setting the struct pointer to NULL.
 812 *
 813 * Note we don't need locking here as this is only called when the VM is
 814 * destroyed, which can only be done once.
 815 */
 816void kvm_free_stage2_pgd(struct kvm *kvm)
 817{
 818	if (kvm->arch.pgd == NULL)
 819		return;
 820
 821	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
 822	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
 823	if (KVM_PREALLOC_LEVEL > 0)
 824		kfree(kvm->arch.pgd);
 825
 826	kvm->arch.pgd = NULL;
 827}
 828
 829static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 830			     phys_addr_t addr)
 831{
 832	pgd_t *pgd;
 833	pud_t *pud;
 834
 835	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
 836	if (WARN_ON(pgd_none(*pgd))) {
 837		if (!cache)
 838			return NULL;
 839		pud = mmu_memory_cache_alloc(cache);
 840		pgd_populate(NULL, pgd, pud);
 841		get_page(virt_to_page(pgd));
 842	}
 843
 844	return pud_offset(pgd, addr);
 845}
 846
 847static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 848			     phys_addr_t addr)
 849{
 850	pud_t *pud;
 851	pmd_t *pmd;
 852
 853	pud = stage2_get_pud(kvm, cache, addr);
 854	if (pud_none(*pud)) {
 855		if (!cache)
 856			return NULL;
 857		pmd = mmu_memory_cache_alloc(cache);
 858		pud_populate(NULL, pud, pmd);
 859		get_page(virt_to_page(pud));
 860	}
 861
 862	return pmd_offset(pud, addr);
 863}
 864
 865static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 866			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
 867{
 868	pmd_t *pmd, old_pmd;
 869
 870	pmd = stage2_get_pmd(kvm, cache, addr);
 871	VM_BUG_ON(!pmd);
 872
 873	/*
 874	 * Mapping in huge pages should only happen through a fault.  If a
 875	 * page is merged into a transparent huge page, the individual
 876	 * subpages of that huge page should be unmapped through MMU
 877	 * notifiers before we get here.
 878	 *
 879	 * Merging of CompoundPages is not supported; they should become
 880	 * splitting first, unmapped, merged, and mapped back in on-demand.
 881	 */
 882	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
 883
 884	old_pmd = *pmd;
 885	kvm_set_pmd(pmd, *new_pmd);
 886	if (pmd_present(old_pmd))
 887		kvm_tlb_flush_vmid_ipa(kvm, addr);
 888	else
 889		get_page(virt_to_page(pmd));
 890	return 0;
 891}
 892
 893static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 894			  phys_addr_t addr, const pte_t *new_pte,
 895			  unsigned long flags)
 896{
 897	pmd_t *pmd;
 898	pte_t *pte, old_pte;
 899	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
 900	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
 901
 902	VM_BUG_ON(logging_active && !cache);
 903
 904	/* Create stage-2 page table mapping - Levels 0 and 1 */
 905	pmd = stage2_get_pmd(kvm, cache, addr);
 906	if (!pmd) {
 907		/*
 908		 * Ignore calls from kvm_set_spte_hva for unallocated
 909		 * address ranges.
 910		 */
 911		return 0;
 912	}
 913
 914	/*
 915	 * While dirty page logging - dissolve huge PMD, then continue on to
 916	 * allocate page.
 917	 */
 918	if (logging_active)
 919		stage2_dissolve_pmd(kvm, addr, pmd);
 920
 921	/* Create stage-2 page mappings - Level 2 */
 922	if (pmd_none(*pmd)) {
 923		if (!cache)
 924			return 0; /* ignore calls from kvm_set_spte_hva */
 925		pte = mmu_memory_cache_alloc(cache);
 926		kvm_clean_pte(pte);
 927		pmd_populate_kernel(NULL, pmd, pte);
 928		get_page(virt_to_page(pmd));
 929	}
 930
 931	pte = pte_offset_kernel(pmd, addr);
 932
 933	if (iomap && pte_present(*pte))
 934		return -EFAULT;
 935
 936	/* Create 2nd stage page table mapping - Level 3 */
 937	old_pte = *pte;
 938	kvm_set_pte(pte, *new_pte);
 939	if (pte_present(old_pte))
 940		kvm_tlb_flush_vmid_ipa(kvm, addr);
 941	else
 942		get_page(virt_to_page(pte));
 943
 944	return 0;
 945}
 946
 947/**
 948 * kvm_phys_addr_ioremap - map a device range to guest IPA
 949 *
 950 * @kvm:	The KVM pointer
 951 * @guest_ipa:	The IPA at which to insert the mapping
 952 * @pa:		The physical address of the device
 953 * @size:	The size of the mapping
 954 */
 955int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 956			  phys_addr_t pa, unsigned long size, bool writable)
 957{
 958	phys_addr_t addr, end;
 959	int ret = 0;
 960	unsigned long pfn;
 961	struct kvm_mmu_memory_cache cache = { 0, };
 962
 963	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
 964	pfn = __phys_to_pfn(pa);
 965
 966	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
 967		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
 968
 969		if (writable)
 970			kvm_set_s2pte_writable(&pte);
 971
 972		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
 973						KVM_NR_MEM_OBJS);
 974		if (ret)
 975			goto out;
 976		spin_lock(&kvm->mmu_lock);
 977		ret = stage2_set_pte(kvm, &cache, addr, &pte,
 978						KVM_S2PTE_FLAG_IS_IOMAP);
 979		spin_unlock(&kvm->mmu_lock);
 980		if (ret)
 981			goto out;
 982
 983		pfn++;
 984	}
 985
 986out:
 987	mmu_free_memory_cache(&cache);
 988	return ret;
 989}
 990
 991static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
 992{
 993	pfn_t pfn = *pfnp;
 994	gfn_t gfn = *ipap >> PAGE_SHIFT;
 995
 996	if (PageTransCompound(pfn_to_page(pfn))) {
 997		unsigned long mask;
 998		/*
 999		 * The address we faulted on is backed by a transparent huge
1000		 * page.  However, because we map the compound huge page and
1001		 * not the individual tail page, we need to transfer the
1002		 * refcount to the head page.  We have to be careful that the
1003		 * THP doesn't start to split while we are adjusting the
1004		 * refcounts.
1005		 *
1006		 * We are sure this doesn't happen, because mmu_notifier_retry
1007		 * was successful and we are holding the mmu_lock, so if this
1008		 * THP is trying to split, it will be blocked in the mmu
1009		 * notifier before touching any of the pages, specifically
1010		 * before being able to call __split_huge_page_refcount().
1011		 *
1012		 * We can therefore safely transfer the refcount from PG_tail
1013		 * to PG_head and switch the pfn from a tail page to the head
1014		 * page accordingly.
1015		 */
1016		mask = PTRS_PER_PMD - 1;
1017		VM_BUG_ON((gfn & mask) != (pfn & mask));
1018		if (pfn & mask) {
1019			*ipap &= PMD_MASK;
1020			kvm_release_pfn_clean(pfn);
1021			pfn &= ~mask;
1022			kvm_get_pfn(pfn);
1023			*pfnp = pfn;
1024		}
1025
1026		return true;
1027	}
1028
1029	return false;
1030}
1031
1032static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
1033{
1034	if (kvm_vcpu_trap_is_iabt(vcpu))
1035		return false;
1036
1037	return kvm_vcpu_dabt_iswrite(vcpu);
1038}
1039
1040static bool kvm_is_device_pfn(unsigned long pfn)
1041{
1042	return !pfn_valid(pfn);
1043}
1044
1045/**
1046 * stage2_wp_ptes - write protect PMD range
1047 * @pmd:	pointer to pmd entry
1048 * @addr:	range start address
1049 * @end:	range end address
1050 */
1051static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1052{
1053	pte_t *pte;
1054
1055	pte = pte_offset_kernel(pmd, addr);
1056	do {
1057		if (!pte_none(*pte)) {
1058			if (!kvm_s2pte_readonly(pte))
1059				kvm_set_s2pte_readonly(pte);
1060		}
1061	} while (pte++, addr += PAGE_SIZE, addr != end);
1062}
1063
1064/**
1065 * stage2_wp_pmds - write protect PUD range
1066 * @pud:	pointer to pud entry
1067 * @addr:	range start address
1068 * @end:	range end address
1069 */
1070static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
1071{
1072	pmd_t *pmd;
1073	phys_addr_t next;
1074
1075	pmd = pmd_offset(pud, addr);
1076
1077	do {
1078		next = kvm_pmd_addr_end(addr, end);
1079		if (!pmd_none(*pmd)) {
1080			if (kvm_pmd_huge(*pmd)) {
1081				if (!kvm_s2pmd_readonly(pmd))
1082					kvm_set_s2pmd_readonly(pmd);
1083			} else {
1084				stage2_wp_ptes(pmd, addr, next);
1085			}
1086		}
1087	} while (pmd++, addr = next, addr != end);
1088}
1089
1090/**
1091  * stage2_wp_puds - write protect PGD range
1092  * @pgd:	pointer to pgd entry
1093  * @addr:	range start address
1094  * @end:	range end address
1095  *
1096  * Process PUD entries, for a huge PUD we cause a panic.
1097  */
1098static void  stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
1099{
1100	pud_t *pud;
1101	phys_addr_t next;
1102
1103	pud = pud_offset(pgd, addr);
1104	do {
1105		next = kvm_pud_addr_end(addr, end);
1106		if (!pud_none(*pud)) {
1107			/* TODO:PUD not supported, revisit later if supported */
1108			BUG_ON(kvm_pud_huge(*pud));
1109			stage2_wp_pmds(pud, addr, next);
1110		}
1111	} while (pud++, addr = next, addr != end);
1112}
1113
1114/**
1115 * stage2_wp_range() - write protect stage2 memory region range
1116 * @kvm:	The KVM pointer
1117 * @addr:	Start address of range
1118 * @end:	End address of range
1119 */
1120static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1121{
1122	pgd_t *pgd;
1123	phys_addr_t next;
1124
1125	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
1126	do {
1127		/*
1128		 * Release kvm_mmu_lock periodically if the memory region is
1129		 * large. Otherwise, we may see kernel panics with
1130		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1131		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1132		 * will also starve other vCPUs.
1133		 */
1134		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
1135			cond_resched_lock(&kvm->mmu_lock);
1136
1137		next = kvm_pgd_addr_end(addr, end);
1138		if (pgd_present(*pgd))
1139			stage2_wp_puds(pgd, addr, next);
1140	} while (pgd++, addr = next, addr != end);
1141}
1142
1143/**
1144 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1145 * @kvm:	The KVM pointer
1146 * @slot:	The memory slot to write protect
1147 *
1148 * Called to start logging dirty pages after memory region
1149 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1150 * all present PMD and PTEs are write protected in the memory region.
1151 * Afterwards read of dirty page log can be called.
1152 *
1153 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1154 * serializing operations for VM memory regions.
1155 */
1156void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1157{
1158	struct kvm_memslots *slots = kvm_memslots(kvm);
1159	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1160	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1161	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1162
1163	spin_lock(&kvm->mmu_lock);
1164	stage2_wp_range(kvm, start, end);
1165	spin_unlock(&kvm->mmu_lock);
1166	kvm_flush_remote_tlbs(kvm);
1167}
1168
1169/**
1170 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1171 * @kvm:	The KVM pointer
1172 * @slot:	The memory slot associated with mask
1173 * @gfn_offset:	The gfn offset in memory slot
1174 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1175 *		slot to be write protected
1176 *
1177 * Walks bits set in mask write protects the associated pte's. Caller must
1178 * acquire kvm_mmu_lock.
1179 */
1180static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1181		struct kvm_memory_slot *slot,
1182		gfn_t gfn_offset, unsigned long mask)
1183{
1184	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1185	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1186	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1187
1188	stage2_wp_range(kvm, start, end);
1189}
1190
1191/*
1192 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1193 * dirty pages.
1194 *
1195 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1196 * enable dirty logging for them.
1197 */
1198void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1199		struct kvm_memory_slot *slot,
1200		gfn_t gfn_offset, unsigned long mask)
1201{
1202	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1203}
1204
1205static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
1206				      unsigned long size, bool uncached)
1207{
1208	__coherent_cache_guest_page(vcpu, pfn, size, uncached);
1209}
1210
1211static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1212			  struct kvm_memory_slot *memslot, unsigned long hva,
1213			  unsigned long fault_status)
1214{
1215	int ret;
1216	bool write_fault, writable, hugetlb = false, force_pte = false;
1217	unsigned long mmu_seq;
1218	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1219	struct kvm *kvm = vcpu->kvm;
1220	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1221	struct vm_area_struct *vma;
1222	pfn_t pfn;
1223	pgprot_t mem_type = PAGE_S2;
1224	bool fault_ipa_uncached;
1225	bool logging_active = memslot_is_logging(memslot);
1226	unsigned long flags = 0;
1227
1228	write_fault = kvm_is_write_fault(vcpu);
1229	if (fault_status == FSC_PERM && !write_fault) {
1230		kvm_err("Unexpected L2 read permission error\n");
1231		return -EFAULT;
1232	}
1233
1234	/* Let's check if we will get back a huge page backed by hugetlbfs */
1235	down_read(&current->mm->mmap_sem);
1236	vma = find_vma_intersection(current->mm, hva, hva + 1);
1237	if (unlikely(!vma)) {
1238		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1239		up_read(&current->mm->mmap_sem);
1240		return -EFAULT;
1241	}
1242
1243	if (is_vm_hugetlb_page(vma) && !logging_active) {
1244		hugetlb = true;
1245		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
1246	} else {
1247		/*
1248		 * Pages belonging to memslots that don't have the same
1249		 * alignment for userspace and IPA cannot be mapped using
1250		 * block descriptors even if the pages belong to a THP for
1251		 * the process, because the stage-2 block descriptor will
1252		 * cover more than a single THP and we loose atomicity for
1253		 * unmapping, updates, and splits of the THP or other pages
1254		 * in the stage-2 block range.
1255		 */
1256		if ((memslot->userspace_addr & ~PMD_MASK) !=
1257		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
1258			force_pte = true;
1259	}
1260	up_read(&current->mm->mmap_sem);
1261
1262	/* We need minimum second+third level pages */
1263	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
1264				     KVM_NR_MEM_OBJS);
1265	if (ret)
1266		return ret;
1267
1268	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1269	/*
1270	 * Ensure the read of mmu_notifier_seq happens before we call
1271	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1272	 * the page we just got a reference to gets unmapped before we have a
1273	 * chance to grab the mmu_lock, which ensure that if the page gets
1274	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1275	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1276	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1277	 */
1278	smp_rmb();
1279
1280	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1281	if (is_error_pfn(pfn))
1282		return -EFAULT;
1283
1284	if (kvm_is_device_pfn(pfn)) {
1285		mem_type = PAGE_S2_DEVICE;
1286		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1287	} else if (logging_active) {
1288		/*
1289		 * Faults on pages in a memslot with logging enabled
1290		 * should not be mapped with huge pages (it introduces churn
1291		 * and performance degradation), so force a pte mapping.
1292		 */
1293		force_pte = true;
1294		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1295
1296		/*
1297		 * Only actually map the page as writable if this was a write
1298		 * fault.
1299		 */
1300		if (!write_fault)
1301			writable = false;
1302	}
1303
1304	spin_lock(&kvm->mmu_lock);
1305	if (mmu_notifier_retry(kvm, mmu_seq))
1306		goto out_unlock;
1307
1308	if (!hugetlb && !force_pte)
1309		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
1310
1311	fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
1312
1313	if (hugetlb) {
1314		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
1315		new_pmd = pmd_mkhuge(new_pmd);
1316		if (writable) {
1317			kvm_set_s2pmd_writable(&new_pmd);
1318			kvm_set_pfn_dirty(pfn);
1319		}
1320		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
1321		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1322	} else {
1323		pte_t new_pte = pfn_pte(pfn, mem_type);
1324
1325		if (writable) {
1326			kvm_set_s2pte_writable(&new_pte);
1327			kvm_set_pfn_dirty(pfn);
1328			mark_page_dirty(kvm, gfn);
1329		}
1330		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
1331		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1332	}
1333
1334out_unlock:
1335	spin_unlock(&kvm->mmu_lock);
1336	kvm_set_pfn_accessed(pfn);
1337	kvm_release_pfn_clean(pfn);
1338	return ret;
1339}
1340
1341/*
1342 * Resolve the access fault by making the page young again.
1343 * Note that because the faulting entry is guaranteed not to be
1344 * cached in the TLB, we don't need to invalidate anything.
1345 */
1346static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1347{
1348	pmd_t *pmd;
1349	pte_t *pte;
1350	pfn_t pfn;
1351	bool pfn_valid = false;
1352
1353	trace_kvm_access_fault(fault_ipa);
1354
1355	spin_lock(&vcpu->kvm->mmu_lock);
1356
1357	pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
1358	if (!pmd || pmd_none(*pmd))	/* Nothing there */
1359		goto out;
1360
1361	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
1362		*pmd = pmd_mkyoung(*pmd);
1363		pfn = pmd_pfn(*pmd);
1364		pfn_valid = true;
1365		goto out;
1366	}
1367
1368	pte = pte_offset_kernel(pmd, fault_ipa);
1369	if (pte_none(*pte))		/* Nothing there either */
1370		goto out;
1371
1372	*pte = pte_mkyoung(*pte);	/* Just a page... */
1373	pfn = pte_pfn(*pte);
1374	pfn_valid = true;
1375out:
1376	spin_unlock(&vcpu->kvm->mmu_lock);
1377	if (pfn_valid)
1378		kvm_set_pfn_accessed(pfn);
1379}
1380
1381/**
1382 * kvm_handle_guest_abort - handles all 2nd stage aborts
1383 * @vcpu:	the VCPU pointer
1384 * @run:	the kvm_run structure
1385 *
1386 * Any abort that gets to the host is almost guaranteed to be caused by a
1387 * missing second stage translation table entry, which can mean that either the
1388 * guest simply needs more memory and we must allocate an appropriate page or it
1389 * can mean that the guest tried to access I/O memory, which is emulated by user
1390 * space. The distinction is based on the IPA causing the fault and whether this
1391 * memory region has been registered as standard RAM by user space.
1392 */
1393int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1394{
1395	unsigned long fault_status;
1396	phys_addr_t fault_ipa;
1397	struct kvm_memory_slot *memslot;
1398	unsigned long hva;
1399	bool is_iabt, write_fault, writable;
1400	gfn_t gfn;
1401	int ret, idx;
1402
1403	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1404	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1405
1406	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1407			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
1408
1409	/* Check the stage-2 fault is trans. fault or write fault */
1410	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1411	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1412	    fault_status != FSC_ACCESS) {
1413		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1414			kvm_vcpu_trap_get_class(vcpu),
1415			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1416			(unsigned long)kvm_vcpu_get_hsr(vcpu));
1417		return -EFAULT;
1418	}
1419
1420	idx = srcu_read_lock(&vcpu->kvm->srcu);
1421
1422	gfn = fault_ipa >> PAGE_SHIFT;
1423	memslot = gfn_to_memslot(vcpu->kvm, gfn);
1424	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1425	write_fault = kvm_is_write_fault(vcpu);
1426	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1427		if (is_iabt) {
1428			/* Prefetch Abort on I/O address */
1429			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1430			ret = 1;
1431			goto out_unlock;
1432		}
1433
1434		/*
1435		 * The IPA is reported as [MAX:12], so we need to
1436		 * complement it with the bottom 12 bits from the
1437		 * faulting VA. This is always 12 bits, irrespective
1438		 * of the page size.
1439		 */
1440		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1441		ret = io_mem_abort(vcpu, run, fault_ipa);
1442		goto out_unlock;
1443	}
1444
1445	/* Userspace should not be able to register out-of-bounds IPAs */
1446	VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
1447
1448	if (fault_status == FSC_ACCESS) {
1449		handle_access_fault(vcpu, fault_ipa);
1450		ret = 1;
1451		goto out_unlock;
1452	}
1453
1454	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1455	if (ret == 0)
1456		ret = 1;
1457out_unlock:
1458	srcu_read_unlock(&vcpu->kvm->srcu, idx);
1459	return ret;
1460}
1461
1462static int handle_hva_to_gpa(struct kvm *kvm,
1463			     unsigned long start,
1464			     unsigned long end,
1465			     int (*handler)(struct kvm *kvm,
1466					    gpa_t gpa, void *data),
1467			     void *data)
1468{
1469	struct kvm_memslots *slots;
1470	struct kvm_memory_slot *memslot;
1471	int ret = 0;
1472
1473	slots = kvm_memslots(kvm);
1474
1475	/* we only care about the pages that the guest sees */
1476	kvm_for_each_memslot(memslot, slots) {
1477		unsigned long hva_start, hva_end;
1478		gfn_t gfn, gfn_end;
1479
1480		hva_start = max(start, memslot->userspace_addr);
1481		hva_end = min(end, memslot->userspace_addr +
1482					(memslot->npages << PAGE_SHIFT));
1483		if (hva_start >= hva_end)
1484			continue;
1485
1486		/*
1487		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1488		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1489		 */
1490		gfn = hva_to_gfn_memslot(hva_start, memslot);
1491		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1492
1493		for (; gfn < gfn_end; ++gfn) {
1494			gpa_t gpa = gfn << PAGE_SHIFT;
1495			ret |= handler(kvm, gpa, data);
1496		}
1497	}
1498
1499	return ret;
1500}
1501
1502static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1503{
1504	unmap_stage2_range(kvm, gpa, PAGE_SIZE);
1505	return 0;
1506}
1507
1508int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1509{
1510	unsigned long end = hva + PAGE_SIZE;
1511
1512	if (!kvm->arch.pgd)
1513		return 0;
1514
1515	trace_kvm_unmap_hva(hva);
1516	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
1517	return 0;
1518}
1519
1520int kvm_unmap_hva_range(struct kvm *kvm,
1521			unsigned long start, unsigned long end)
1522{
1523	if (!kvm->arch.pgd)
1524		return 0;
1525
1526	trace_kvm_unmap_hva_range(start, end);
1527	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
1528	return 0;
1529}
1530
1531static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
1532{
1533	pte_t *pte = (pte_t *)data;
1534
1535	/*
1536	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
1537	 * flag clear because MMU notifiers will have unmapped a huge PMD before
1538	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1539	 * therefore stage2_set_pte() never needs to clear out a huge PMD
1540	 * through this calling path.
1541	 */
1542	stage2_set_pte(kvm, NULL, gpa, pte, 0);
1543	return 0;
1544}
1545
1546
1547void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1548{
1549	unsigned long end = hva + PAGE_SIZE;
1550	pte_t stage2_pte;
1551
1552	if (!kvm->arch.pgd)
1553		return;
1554
1555	trace_kvm_set_spte_hva(hva);
1556	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
1557	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
1558}
1559
1560static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1561{
1562	pmd_t *pmd;
1563	pte_t *pte;
1564
1565	pmd = stage2_get_pmd(kvm, NULL, gpa);
1566	if (!pmd || pmd_none(*pmd))	/* Nothing there */
1567		return 0;
1568
1569	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
1570		if (pmd_young(*pmd)) {
1571			*pmd = pmd_mkold(*pmd);
1572			return 1;
1573		}
1574
1575		return 0;
1576	}
1577
1578	pte = pte_offset_kernel(pmd, gpa);
1579	if (pte_none(*pte))
1580		return 0;
1581
1582	if (pte_young(*pte)) {
1583		*pte = pte_mkold(*pte);	/* Just a page... */
1584		return 1;
1585	}
1586
1587	return 0;
1588}
1589
1590static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1591{
1592	pmd_t *pmd;
1593	pte_t *pte;
1594
1595	pmd = stage2_get_pmd(kvm, NULL, gpa);
1596	if (!pmd || pmd_none(*pmd))	/* Nothing there */
1597		return 0;
1598
1599	if (kvm_pmd_huge(*pmd))		/* THP, HugeTLB */
1600		return pmd_young(*pmd);
1601
1602	pte = pte_offset_kernel(pmd, gpa);
1603	if (!pte_none(*pte))		/* Just a page... */
1604		return pte_young(*pte);
1605
1606	return 0;
1607}
1608
1609int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1610{
1611	trace_kvm_age_hva(start, end);
1612	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1613}
1614
1615int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1616{
1617	trace_kvm_test_age_hva(hva);
1618	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
1619}
1620
1621void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1622{
1623	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
1624}
1625
1626phys_addr_t kvm_mmu_get_httbr(void)
1627{
1628	if (__kvm_cpu_uses_extended_idmap())
1629		return virt_to_phys(merged_hyp_pgd);
1630	else
1631		return virt_to_phys(hyp_pgd);
1632}
1633
1634phys_addr_t kvm_mmu_get_boot_httbr(void)
1635{
1636	if (__kvm_cpu_uses_extended_idmap())
1637		return virt_to_phys(merged_hyp_pgd);
1638	else
1639		return virt_to_phys(boot_hyp_pgd);
1640}
1641
1642phys_addr_t kvm_get_idmap_vector(void)
1643{
1644	return hyp_idmap_vector;
1645}
1646
1647int kvm_mmu_init(void)
1648{
1649	int err;
1650
1651	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
1652	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
1653	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
1654
1655	/*
1656	 * We rely on the linker script to ensure at build time that the HYP
1657	 * init code does not cross a page boundary.
1658	 */
1659	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1660
1661	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1662	boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1663
1664	if (!hyp_pgd || !boot_hyp_pgd) {
1665		kvm_err("Hyp mode PGD not allocated\n");
1666		err = -ENOMEM;
1667		goto out;
1668	}
1669
1670	/* Create the idmap in the boot page tables */
1671	err = 	__create_hyp_mappings(boot_hyp_pgd,
1672				      hyp_idmap_start, hyp_idmap_end,
1673				      __phys_to_pfn(hyp_idmap_start),
1674				      PAGE_HYP);
1675
1676	if (err) {
1677		kvm_err("Failed to idmap %lx-%lx\n",
1678			hyp_idmap_start, hyp_idmap_end);
1679		goto out;
1680	}
1681
1682	if (__kvm_cpu_uses_extended_idmap()) {
1683		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1684		if (!merged_hyp_pgd) {
1685			kvm_err("Failed to allocate extra HYP pgd\n");
1686			goto out;
1687		}
1688		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
1689				    hyp_idmap_start);
1690		return 0;
1691	}
1692
1693	/* Map the very same page at the trampoline VA */
1694	err = 	__create_hyp_mappings(boot_hyp_pgd,
1695				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1696				      __phys_to_pfn(hyp_idmap_start),
1697				      PAGE_HYP);
1698	if (err) {
1699		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
1700			TRAMPOLINE_VA);
1701		goto out;
1702	}
1703
1704	/* Map the same page again into the runtime page tables */
1705	err = 	__create_hyp_mappings(hyp_pgd,
1706				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1707				      __phys_to_pfn(hyp_idmap_start),
1708				      PAGE_HYP);
1709	if (err) {
1710		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
1711			TRAMPOLINE_VA);
1712		goto out;
1713	}
1714
1715	return 0;
1716out:
1717	free_hyp_pgds();
1718	return err;
1719}
1720
1721void kvm_arch_commit_memory_region(struct kvm *kvm,
1722				   const struct kvm_userspace_memory_region *mem,
1723				   const struct kvm_memory_slot *old,
1724				   const struct kvm_memory_slot *new,
1725				   enum kvm_mr_change change)
1726{
1727	/*
1728	 * At this point memslot has been committed and there is an
1729	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
1730	 * memory slot is write protected.
1731	 */
1732	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
1733		kvm_mmu_wp_memory_region(kvm, mem->slot);
1734}
1735
1736int kvm_arch_prepare_memory_region(struct kvm *kvm,
1737				   struct kvm_memory_slot *memslot,
1738				   const struct kvm_userspace_memory_region *mem,
1739				   enum kvm_mr_change change)
1740{
1741	hva_t hva = mem->userspace_addr;
1742	hva_t reg_end = hva + mem->memory_size;
1743	bool writable = !(mem->flags & KVM_MEM_READONLY);
1744	int ret = 0;
1745
1746	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1747			change != KVM_MR_FLAGS_ONLY)
1748		return 0;
1749
1750	/*
1751	 * Prevent userspace from creating a memory region outside of the IPA
1752	 * space addressable by the KVM guest IPA space.
1753	 */
1754	if (memslot->base_gfn + memslot->npages >=
1755	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
1756		return -EFAULT;
1757
1758	/*
1759	 * A memory region could potentially cover multiple VMAs, and any holes
1760	 * between them, so iterate over all of them to find out if we can map
1761	 * any of them right now.
1762	 *
1763	 *     +--------------------------------------------+
1764	 * +---------------+----------------+   +----------------+
1765	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1766	 * +---------------+----------------+   +----------------+
1767	 *     |               memory region                |
1768	 *     +--------------------------------------------+
1769	 */
1770	do {
1771		struct vm_area_struct *vma = find_vma(current->mm, hva);
1772		hva_t vm_start, vm_end;
1773
1774		if (!vma || vma->vm_start >= reg_end)
1775			break;
1776
1777		/*
1778		 * Mapping a read-only VMA is only allowed if the
1779		 * memory region is configured as read-only.
1780		 */
1781		if (writable && !(vma->vm_flags & VM_WRITE)) {
1782			ret = -EPERM;
1783			break;
1784		}
1785
1786		/*
1787		 * Take the intersection of this VMA with the memory region
1788		 */
1789		vm_start = max(hva, vma->vm_start);
1790		vm_end = min(reg_end, vma->vm_end);
1791
1792		if (vma->vm_flags & VM_PFNMAP) {
1793			gpa_t gpa = mem->guest_phys_addr +
1794				    (vm_start - mem->userspace_addr);
1795			phys_addr_t pa;
1796
1797			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
1798			pa += vm_start - vma->vm_start;
1799
1800			/* IO region dirty page logging not allowed */
1801			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
1802				return -EINVAL;
1803
1804			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
1805						    vm_end - vm_start,
1806						    writable);
1807			if (ret)
1808				break;
1809		}
1810		hva = vm_end;
1811	} while (hva < reg_end);
1812
1813	if (change == KVM_MR_FLAGS_ONLY)
1814		return ret;
1815
1816	spin_lock(&kvm->mmu_lock);
1817	if (ret)
1818		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
1819	else
1820		stage2_flush_memslot(kvm, memslot);
1821	spin_unlock(&kvm->mmu_lock);
1822	return ret;
1823}
1824
1825void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
1826			   struct kvm_memory_slot *dont)
1827{
1828}
1829
1830int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1831			    unsigned long npages)
1832{
1833	/*
1834	 * Readonly memslots are not incoherent with the caches by definition,
1835	 * but in practice, they are used mostly to emulate ROMs or NOR flashes
1836	 * that the guest may consider devices and hence map as uncached.
1837	 * To prevent incoherency issues in these cases, tag all readonly
1838	 * regions as incoherent.
1839	 */
1840	if (slot->flags & KVM_MEM_READONLY)
1841		slot->flags |= KVM_MEMSLOT_INCOHERENT;
1842	return 0;
1843}
1844
1845void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
1846{
1847}
1848
1849void kvm_arch_flush_shadow_all(struct kvm *kvm)
1850{
1851}
1852
1853void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1854				   struct kvm_memory_slot *slot)
1855{
1856	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1857	phys_addr_t size = slot->npages << PAGE_SHIFT;
1858
1859	spin_lock(&kvm->mmu_lock);
1860	unmap_stage2_range(kvm, gpa, size);
1861	spin_unlock(&kvm->mmu_lock);
1862}
1863
1864/*
1865 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
1866 *
1867 * Main problems:
1868 * - S/W ops are local to a CPU (not broadcast)
1869 * - We have line migration behind our back (speculation)
1870 * - System caches don't support S/W at all (damn!)
1871 *
1872 * In the face of the above, the best we can do is to try and convert
1873 * S/W ops to VA ops. Because the guest is not allowed to infer the
1874 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
1875 * which is a rather good thing for us.
1876 *
1877 * Also, it is only used when turning caches on/off ("The expected
1878 * usage of the cache maintenance instructions that operate by set/way
1879 * is associated with the cache maintenance instructions associated
1880 * with the powerdown and powerup of caches, if this is required by
1881 * the implementation.").
1882 *
1883 * We use the following policy:
1884 *
1885 * - If we trap a S/W operation, we enable VM trapping to detect
1886 *   caches being turned on/off, and do a full clean.
1887 *
1888 * - We flush the caches on both caches being turned on and off.
1889 *
1890 * - Once the caches are enabled, we stop trapping VM ops.
1891 */
1892void kvm_set_way_flush(struct kvm_vcpu *vcpu)
1893{
1894	unsigned long hcr = vcpu_get_hcr(vcpu);
1895
1896	/*
1897	 * If this is the first time we do a S/W operation
1898	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
1899	 * VM trapping.
1900	 *
1901	 * Otherwise, rely on the VM trapping to wait for the MMU +
1902	 * Caches to be turned off. At that point, we'll be able to
1903	 * clean the caches again.
1904	 */
1905	if (!(hcr & HCR_TVM)) {
1906		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
1907					vcpu_has_cache_enabled(vcpu));
1908		stage2_flush_vm(vcpu->kvm);
1909		vcpu_set_hcr(vcpu, hcr | HCR_TVM);
1910	}
1911}
1912
1913void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
1914{
1915	bool now_enabled = vcpu_has_cache_enabled(vcpu);
1916
1917	/*
1918	 * If switching the MMU+caches on, need to invalidate the caches.
1919	 * If switching it off, need to clean the caches.
1920	 * Clean + invalidate does the trick always.
1921	 */
1922	if (now_enabled != was_enabled)
1923		stage2_flush_vm(vcpu->kvm);
1924
1925	/* Caches are now on, stop trapping VM ops (until a S/W op) */
1926	if (now_enabled)
1927		vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
1928
1929	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
1930}
Configure Feed

Configure Feed