virt/kvm/arm/mmu.c at v4.18

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / virt / kvm / arm / mmu.c
at v4.18 2193 lines 58 kB view raw
wrap content
   1/*
   2 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   3 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License, version 2, as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17 */
  18
  19#include <linux/mman.h>
  20#include <linux/kvm_host.h>
  21#include <linux/io.h>
  22#include <linux/hugetlb.h>
  23#include <linux/sched/signal.h>
  24#include <trace/events/kvm.h>
  25#include <asm/pgalloc.h>
  26#include <asm/cacheflush.h>
  27#include <asm/kvm_arm.h>
  28#include <asm/kvm_mmu.h>
  29#include <asm/kvm_mmio.h>
  30#include <asm/kvm_asm.h>
  31#include <asm/kvm_emulate.h>
  32#include <asm/virt.h>
  33#include <asm/system_misc.h>
  34
  35#include "trace.h"
  36
  37static pgd_t *boot_hyp_pgd;
  38static pgd_t *hyp_pgd;
  39static pgd_t *merged_hyp_pgd;
  40static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
  41
  42static unsigned long hyp_idmap_start;
  43static unsigned long hyp_idmap_end;
  44static phys_addr_t hyp_idmap_vector;
  45
  46static unsigned long io_map_base;
  47
  48#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
  49#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
  50
  51#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
  52#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
  53
  54static bool memslot_is_logging(struct kvm_memory_slot *memslot)
  55{
  56	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
  57}
  58
  59/**
  60 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
  61 * @kvm:	pointer to kvm structure.
  62 *
  63 * Interface to HYP function to flush all VM TLB entries
  64 */
  65void kvm_flush_remote_tlbs(struct kvm *kvm)
  66{
  67	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
  68}
  69
  70static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
  71{
  72	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
  73}
  74
  75/*
  76 * D-Cache management functions. They take the page table entries by
  77 * value, as they are flushing the cache using the kernel mapping (or
  78 * kmap on 32bit).
  79 */
  80static void kvm_flush_dcache_pte(pte_t pte)
  81{
  82	__kvm_flush_dcache_pte(pte);
  83}
  84
  85static void kvm_flush_dcache_pmd(pmd_t pmd)
  86{
  87	__kvm_flush_dcache_pmd(pmd);
  88}
  89
  90static void kvm_flush_dcache_pud(pud_t pud)
  91{
  92	__kvm_flush_dcache_pud(pud);
  93}
  94
  95static bool kvm_is_device_pfn(unsigned long pfn)
  96{
  97	return !pfn_valid(pfn);
  98}
  99
 100/**
 101 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 102 * @kvm:	pointer to kvm structure.
 103 * @addr:	IPA
 104 * @pmd:	pmd pointer for IPA
 105 *
 106 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
 107 * pages in the range dirty.
 108 */
 109static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 110{
 111	if (!pmd_thp_or_huge(*pmd))
 112		return;
 113
 114	pmd_clear(pmd);
 115	kvm_tlb_flush_vmid_ipa(kvm, addr);
 116	put_page(virt_to_page(pmd));
 117}
 118
 119static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 120				  int min, int max)
 121{
 122	void *page;
 123
 124	BUG_ON(max > KVM_NR_MEM_OBJS);
 125	if (cache->nobjs >= min)
 126		return 0;
 127	while (cache->nobjs < max) {
 128		page = (void *)__get_free_page(PGALLOC_GFP);
 129		if (!page)
 130			return -ENOMEM;
 131		cache->objects[cache->nobjs++] = page;
 132	}
 133	return 0;
 134}
 135
 136static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 137{
 138	while (mc->nobjs)
 139		free_page((unsigned long)mc->objects[--mc->nobjs]);
 140}
 141
 142static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 143{
 144	void *p;
 145
 146	BUG_ON(!mc || !mc->nobjs);
 147	p = mc->objects[--mc->nobjs];
 148	return p;
 149}
 150
 151static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 152{
 153	pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
 154	stage2_pgd_clear(pgd);
 155	kvm_tlb_flush_vmid_ipa(kvm, addr);
 156	stage2_pud_free(pud_table);
 157	put_page(virt_to_page(pgd));
 158}
 159
 160static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 161{
 162	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
 163	VM_BUG_ON(stage2_pud_huge(*pud));
 164	stage2_pud_clear(pud);
 165	kvm_tlb_flush_vmid_ipa(kvm, addr);
 166	stage2_pmd_free(pmd_table);
 167	put_page(virt_to_page(pud));
 168}
 169
 170static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 171{
 172	pte_t *pte_table = pte_offset_kernel(pmd, 0);
 173	VM_BUG_ON(pmd_thp_or_huge(*pmd));
 174	pmd_clear(pmd);
 175	kvm_tlb_flush_vmid_ipa(kvm, addr);
 176	pte_free_kernel(NULL, pte_table);
 177	put_page(virt_to_page(pmd));
 178}
 179
 180/*
 181 * Unmapping vs dcache management:
 182 *
 183 * If a guest maps certain memory pages as uncached, all writes will
 184 * bypass the data cache and go directly to RAM.  However, the CPUs
 185 * can still speculate reads (not writes) and fill cache lines with
 186 * data.
 187 *
 188 * Those cache lines will be *clean* cache lines though, so a
 189 * clean+invalidate operation is equivalent to an invalidate
 190 * operation, because no cache lines are marked dirty.
 191 *
 192 * Those clean cache lines could be filled prior to an uncached write
 193 * by the guest, and the cache coherent IO subsystem would therefore
 194 * end up writing old data to disk.
 195 *
 196 * This is why right after unmapping a page/section and invalidating
 197 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 198 * the IO subsystem will never hit in the cache.
 199 */
 200static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 201		       phys_addr_t addr, phys_addr_t end)
 202{
 203	phys_addr_t start_addr = addr;
 204	pte_t *pte, *start_pte;
 205
 206	start_pte = pte = pte_offset_kernel(pmd, addr);
 207	do {
 208		if (!pte_none(*pte)) {
 209			pte_t old_pte = *pte;
 210
 211			kvm_set_pte(pte, __pte(0));
 212			kvm_tlb_flush_vmid_ipa(kvm, addr);
 213
 214			/* No need to invalidate the cache for device mappings */
 215			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
 216				kvm_flush_dcache_pte(old_pte);
 217
 218			put_page(virt_to_page(pte));
 219		}
 220	} while (pte++, addr += PAGE_SIZE, addr != end);
 221
 222	if (stage2_pte_table_empty(start_pte))
 223		clear_stage2_pmd_entry(kvm, pmd, start_addr);
 224}
 225
 226static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
 227		       phys_addr_t addr, phys_addr_t end)
 228{
 229	phys_addr_t next, start_addr = addr;
 230	pmd_t *pmd, *start_pmd;
 231
 232	start_pmd = pmd = stage2_pmd_offset(pud, addr);
 233	do {
 234		next = stage2_pmd_addr_end(addr, end);
 235		if (!pmd_none(*pmd)) {
 236			if (pmd_thp_or_huge(*pmd)) {
 237				pmd_t old_pmd = *pmd;
 238
 239				pmd_clear(pmd);
 240				kvm_tlb_flush_vmid_ipa(kvm, addr);
 241
 242				kvm_flush_dcache_pmd(old_pmd);
 243
 244				put_page(virt_to_page(pmd));
 245			} else {
 246				unmap_stage2_ptes(kvm, pmd, addr, next);
 247			}
 248		}
 249	} while (pmd++, addr = next, addr != end);
 250
 251	if (stage2_pmd_table_empty(start_pmd))
 252		clear_stage2_pud_entry(kvm, pud, start_addr);
 253}
 254
 255static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
 256		       phys_addr_t addr, phys_addr_t end)
 257{
 258	phys_addr_t next, start_addr = addr;
 259	pud_t *pud, *start_pud;
 260
 261	start_pud = pud = stage2_pud_offset(pgd, addr);
 262	do {
 263		next = stage2_pud_addr_end(addr, end);
 264		if (!stage2_pud_none(*pud)) {
 265			if (stage2_pud_huge(*pud)) {
 266				pud_t old_pud = *pud;
 267
 268				stage2_pud_clear(pud);
 269				kvm_tlb_flush_vmid_ipa(kvm, addr);
 270				kvm_flush_dcache_pud(old_pud);
 271				put_page(virt_to_page(pud));
 272			} else {
 273				unmap_stage2_pmds(kvm, pud, addr, next);
 274			}
 275		}
 276	} while (pud++, addr = next, addr != end);
 277
 278	if (stage2_pud_table_empty(start_pud))
 279		clear_stage2_pgd_entry(kvm, pgd, start_addr);
 280}
 281
 282/**
 283 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 284 * @kvm:   The VM pointer
 285 * @start: The intermediate physical base address of the range to unmap
 286 * @size:  The size of the area to unmap
 287 *
 288 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 289 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 290 * destroying the VM), otherwise another faulting VCPU may come in and mess
 291 * with things behind our backs.
 292 */
 293static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 294{
 295	pgd_t *pgd;
 296	phys_addr_t addr = start, end = start + size;
 297	phys_addr_t next;
 298
 299	assert_spin_locked(&kvm->mmu_lock);
 300	WARN_ON(size & ~PAGE_MASK);
 301
 302	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 303	do {
 304		/*
 305		 * Make sure the page table is still active, as another thread
 306		 * could have possibly freed the page table, while we released
 307		 * the lock.
 308		 */
 309		if (!READ_ONCE(kvm->arch.pgd))
 310			break;
 311		next = stage2_pgd_addr_end(addr, end);
 312		if (!stage2_pgd_none(*pgd))
 313			unmap_stage2_puds(kvm, pgd, addr, next);
 314		/*
 315		 * If the range is too large, release the kvm->mmu_lock
 316		 * to prevent starvation and lockup detector warnings.
 317		 */
 318		if (next != end)
 319			cond_resched_lock(&kvm->mmu_lock);
 320	} while (pgd++, addr = next, addr != end);
 321}
 322
 323static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
 324			      phys_addr_t addr, phys_addr_t end)
 325{
 326	pte_t *pte;
 327
 328	pte = pte_offset_kernel(pmd, addr);
 329	do {
 330		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
 331			kvm_flush_dcache_pte(*pte);
 332	} while (pte++, addr += PAGE_SIZE, addr != end);
 333}
 334
 335static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 336			      phys_addr_t addr, phys_addr_t end)
 337{
 338	pmd_t *pmd;
 339	phys_addr_t next;
 340
 341	pmd = stage2_pmd_offset(pud, addr);
 342	do {
 343		next = stage2_pmd_addr_end(addr, end);
 344		if (!pmd_none(*pmd)) {
 345			if (pmd_thp_or_huge(*pmd))
 346				kvm_flush_dcache_pmd(*pmd);
 347			else
 348				stage2_flush_ptes(kvm, pmd, addr, next);
 349		}
 350	} while (pmd++, addr = next, addr != end);
 351}
 352
 353static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
 354			      phys_addr_t addr, phys_addr_t end)
 355{
 356	pud_t *pud;
 357	phys_addr_t next;
 358
 359	pud = stage2_pud_offset(pgd, addr);
 360	do {
 361		next = stage2_pud_addr_end(addr, end);
 362		if (!stage2_pud_none(*pud)) {
 363			if (stage2_pud_huge(*pud))
 364				kvm_flush_dcache_pud(*pud);
 365			else
 366				stage2_flush_pmds(kvm, pud, addr, next);
 367		}
 368	} while (pud++, addr = next, addr != end);
 369}
 370
 371static void stage2_flush_memslot(struct kvm *kvm,
 372				 struct kvm_memory_slot *memslot)
 373{
 374	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 375	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
 376	phys_addr_t next;
 377	pgd_t *pgd;
 378
 379	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 380	do {
 381		next = stage2_pgd_addr_end(addr, end);
 382		stage2_flush_puds(kvm, pgd, addr, next);
 383	} while (pgd++, addr = next, addr != end);
 384}
 385
 386/**
 387 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 388 * @kvm: The struct kvm pointer
 389 *
 390 * Go through the stage 2 page tables and invalidate any cache lines
 391 * backing memory already mapped to the VM.
 392 */
 393static void stage2_flush_vm(struct kvm *kvm)
 394{
 395	struct kvm_memslots *slots;
 396	struct kvm_memory_slot *memslot;
 397	int idx;
 398
 399	idx = srcu_read_lock(&kvm->srcu);
 400	spin_lock(&kvm->mmu_lock);
 401
 402	slots = kvm_memslots(kvm);
 403	kvm_for_each_memslot(memslot, slots)
 404		stage2_flush_memslot(kvm, memslot);
 405
 406	spin_unlock(&kvm->mmu_lock);
 407	srcu_read_unlock(&kvm->srcu, idx);
 408}
 409
 410static void clear_hyp_pgd_entry(pgd_t *pgd)
 411{
 412	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
 413	pgd_clear(pgd);
 414	pud_free(NULL, pud_table);
 415	put_page(virt_to_page(pgd));
 416}
 417
 418static void clear_hyp_pud_entry(pud_t *pud)
 419{
 420	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
 421	VM_BUG_ON(pud_huge(*pud));
 422	pud_clear(pud);
 423	pmd_free(NULL, pmd_table);
 424	put_page(virt_to_page(pud));
 425}
 426
 427static void clear_hyp_pmd_entry(pmd_t *pmd)
 428{
 429	pte_t *pte_table = pte_offset_kernel(pmd, 0);
 430	VM_BUG_ON(pmd_thp_or_huge(*pmd));
 431	pmd_clear(pmd);
 432	pte_free_kernel(NULL, pte_table);
 433	put_page(virt_to_page(pmd));
 434}
 435
 436static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
 437{
 438	pte_t *pte, *start_pte;
 439
 440	start_pte = pte = pte_offset_kernel(pmd, addr);
 441	do {
 442		if (!pte_none(*pte)) {
 443			kvm_set_pte(pte, __pte(0));
 444			put_page(virt_to_page(pte));
 445		}
 446	} while (pte++, addr += PAGE_SIZE, addr != end);
 447
 448	if (hyp_pte_table_empty(start_pte))
 449		clear_hyp_pmd_entry(pmd);
 450}
 451
 452static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
 453{
 454	phys_addr_t next;
 455	pmd_t *pmd, *start_pmd;
 456
 457	start_pmd = pmd = pmd_offset(pud, addr);
 458	do {
 459		next = pmd_addr_end(addr, end);
 460		/* Hyp doesn't use huge pmds */
 461		if (!pmd_none(*pmd))
 462			unmap_hyp_ptes(pmd, addr, next);
 463	} while (pmd++, addr = next, addr != end);
 464
 465	if (hyp_pmd_table_empty(start_pmd))
 466		clear_hyp_pud_entry(pud);
 467}
 468
 469static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
 470{
 471	phys_addr_t next;
 472	pud_t *pud, *start_pud;
 473
 474	start_pud = pud = pud_offset(pgd, addr);
 475	do {
 476		next = pud_addr_end(addr, end);
 477		/* Hyp doesn't use huge puds */
 478		if (!pud_none(*pud))
 479			unmap_hyp_pmds(pud, addr, next);
 480	} while (pud++, addr = next, addr != end);
 481
 482	if (hyp_pud_table_empty(start_pud))
 483		clear_hyp_pgd_entry(pgd);
 484}
 485
 486static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
 487{
 488	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
 489}
 490
 491static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
 492			      phys_addr_t start, u64 size)
 493{
 494	pgd_t *pgd;
 495	phys_addr_t addr = start, end = start + size;
 496	phys_addr_t next;
 497
 498	/*
 499	 * We don't unmap anything from HYP, except at the hyp tear down.
 500	 * Hence, we don't have to invalidate the TLBs here.
 501	 */
 502	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
 503	do {
 504		next = pgd_addr_end(addr, end);
 505		if (!pgd_none(*pgd))
 506			unmap_hyp_puds(pgd, addr, next);
 507	} while (pgd++, addr = next, addr != end);
 508}
 509
 510static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
 511{
 512	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
 513}
 514
 515static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
 516{
 517	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
 518}
 519
 520/**
 521 * free_hyp_pgds - free Hyp-mode page tables
 522 *
 523 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 524 * therefore contains either mappings in the kernel memory area (above
 525 * PAGE_OFFSET), or device mappings in the idmap range.
 526 *
 527 * boot_hyp_pgd should only map the idmap range, and is only used in
 528 * the extended idmap case.
 529 */
 530void free_hyp_pgds(void)
 531{
 532	pgd_t *id_pgd;
 533
 534	mutex_lock(&kvm_hyp_pgd_mutex);
 535
 536	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
 537
 538	if (id_pgd) {
 539		/* In case we never called hyp_mmu_init() */
 540		if (!io_map_base)
 541			io_map_base = hyp_idmap_start;
 542		unmap_hyp_idmap_range(id_pgd, io_map_base,
 543				      hyp_idmap_start + PAGE_SIZE - io_map_base);
 544	}
 545
 546	if (boot_hyp_pgd) {
 547		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 548		boot_hyp_pgd = NULL;
 549	}
 550
 551	if (hyp_pgd) {
 552		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
 553				(uintptr_t)high_memory - PAGE_OFFSET);
 554
 555		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 556		hyp_pgd = NULL;
 557	}
 558	if (merged_hyp_pgd) {
 559		clear_page(merged_hyp_pgd);
 560		free_page((unsigned long)merged_hyp_pgd);
 561		merged_hyp_pgd = NULL;
 562	}
 563
 564	mutex_unlock(&kvm_hyp_pgd_mutex);
 565}
 566
 567static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
 568				    unsigned long end, unsigned long pfn,
 569				    pgprot_t prot)
 570{
 571	pte_t *pte;
 572	unsigned long addr;
 573
 574	addr = start;
 575	do {
 576		pte = pte_offset_kernel(pmd, addr);
 577		kvm_set_pte(pte, pfn_pte(pfn, prot));
 578		get_page(virt_to_page(pte));
 579		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
 580		pfn++;
 581	} while (addr += PAGE_SIZE, addr != end);
 582}
 583
 584static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
 585				   unsigned long end, unsigned long pfn,
 586				   pgprot_t prot)
 587{
 588	pmd_t *pmd;
 589	pte_t *pte;
 590	unsigned long addr, next;
 591
 592	addr = start;
 593	do {
 594		pmd = pmd_offset(pud, addr);
 595
 596		BUG_ON(pmd_sect(*pmd));
 597
 598		if (pmd_none(*pmd)) {
 599			pte = pte_alloc_one_kernel(NULL, addr);
 600			if (!pte) {
 601				kvm_err("Cannot allocate Hyp pte\n");
 602				return -ENOMEM;
 603			}
 604			pmd_populate_kernel(NULL, pmd, pte);
 605			get_page(virt_to_page(pmd));
 606			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
 607		}
 608
 609		next = pmd_addr_end(addr, end);
 610
 611		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
 612		pfn += (next - addr) >> PAGE_SHIFT;
 613	} while (addr = next, addr != end);
 614
 615	return 0;
 616}
 617
 618static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
 619				   unsigned long end, unsigned long pfn,
 620				   pgprot_t prot)
 621{
 622	pud_t *pud;
 623	pmd_t *pmd;
 624	unsigned long addr, next;
 625	int ret;
 626
 627	addr = start;
 628	do {
 629		pud = pud_offset(pgd, addr);
 630
 631		if (pud_none_or_clear_bad(pud)) {
 632			pmd = pmd_alloc_one(NULL, addr);
 633			if (!pmd) {
 634				kvm_err("Cannot allocate Hyp pmd\n");
 635				return -ENOMEM;
 636			}
 637			pud_populate(NULL, pud, pmd);
 638			get_page(virt_to_page(pud));
 639			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
 640		}
 641
 642		next = pud_addr_end(addr, end);
 643		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
 644		if (ret)
 645			return ret;
 646		pfn += (next - addr) >> PAGE_SHIFT;
 647	} while (addr = next, addr != end);
 648
 649	return 0;
 650}
 651
 652static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
 653				 unsigned long start, unsigned long end,
 654				 unsigned long pfn, pgprot_t prot)
 655{
 656	pgd_t *pgd;
 657	pud_t *pud;
 658	unsigned long addr, next;
 659	int err = 0;
 660
 661	mutex_lock(&kvm_hyp_pgd_mutex);
 662	addr = start & PAGE_MASK;
 663	end = PAGE_ALIGN(end);
 664	do {
 665		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
 666
 667		if (pgd_none(*pgd)) {
 668			pud = pud_alloc_one(NULL, addr);
 669			if (!pud) {
 670				kvm_err("Cannot allocate Hyp pud\n");
 671				err = -ENOMEM;
 672				goto out;
 673			}
 674			pgd_populate(NULL, pgd, pud);
 675			get_page(virt_to_page(pgd));
 676			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
 677		}
 678
 679		next = pgd_addr_end(addr, end);
 680		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
 681		if (err)
 682			goto out;
 683		pfn += (next - addr) >> PAGE_SHIFT;
 684	} while (addr = next, addr != end);
 685out:
 686	mutex_unlock(&kvm_hyp_pgd_mutex);
 687	return err;
 688}
 689
 690static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
 691{
 692	if (!is_vmalloc_addr(kaddr)) {
 693		BUG_ON(!virt_addr_valid(kaddr));
 694		return __pa(kaddr);
 695	} else {
 696		return page_to_phys(vmalloc_to_page(kaddr)) +
 697		       offset_in_page(kaddr);
 698	}
 699}
 700
 701/**
 702 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 703 * @from:	The virtual kernel start address of the range
 704 * @to:		The virtual kernel end address of the range (exclusive)
 705 * @prot:	The protection to be applied to this range
 706 *
 707 * The same virtual address as the kernel virtual address is also used
 708 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 709 * physical pages.
 710 */
 711int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 712{
 713	phys_addr_t phys_addr;
 714	unsigned long virt_addr;
 715	unsigned long start = kern_hyp_va((unsigned long)from);
 716	unsigned long end = kern_hyp_va((unsigned long)to);
 717
 718	if (is_kernel_in_hyp_mode())
 719		return 0;
 720
 721	start = start & PAGE_MASK;
 722	end = PAGE_ALIGN(end);
 723
 724	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
 725		int err;
 726
 727		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
 728		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
 729					    virt_addr, virt_addr + PAGE_SIZE,
 730					    __phys_to_pfn(phys_addr),
 731					    prot);
 732		if (err)
 733			return err;
 734	}
 735
 736	return 0;
 737}
 738
 739static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
 740					unsigned long *haddr, pgprot_t prot)
 741{
 742	pgd_t *pgd = hyp_pgd;
 743	unsigned long base;
 744	int ret = 0;
 745
 746	mutex_lock(&kvm_hyp_pgd_mutex);
 747
 748	/*
 749	 * This assumes that we we have enough space below the idmap
 750	 * page to allocate our VAs. If not, the check below will
 751	 * kick. A potential alternative would be to detect that
 752	 * overflow and switch to an allocation above the idmap.
 753	 *
 754	 * The allocated size is always a multiple of PAGE_SIZE.
 755	 */
 756	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
 757	base = io_map_base - size;
 758
 759	/*
 760	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
 761	 * allocating the new area, as it would indicate we've
 762	 * overflowed the idmap/IO address range.
 763	 */
 764	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
 765		ret = -ENOMEM;
 766	else
 767		io_map_base = base;
 768
 769	mutex_unlock(&kvm_hyp_pgd_mutex);
 770
 771	if (ret)
 772		goto out;
 773
 774	if (__kvm_cpu_uses_extended_idmap())
 775		pgd = boot_hyp_pgd;
 776
 777	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
 778				    base, base + size,
 779				    __phys_to_pfn(phys_addr), prot);
 780	if (ret)
 781		goto out;
 782
 783	*haddr = base + offset_in_page(phys_addr);
 784
 785out:
 786	return ret;
 787}
 788
 789/**
 790 * create_hyp_io_mappings - Map IO into both kernel and HYP
 791 * @phys_addr:	The physical start address which gets mapped
 792 * @size:	Size of the region being mapped
 793 * @kaddr:	Kernel VA for this mapping
 794 * @haddr:	HYP VA for this mapping
 795 */
 796int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
 797			   void __iomem **kaddr,
 798			   void __iomem **haddr)
 799{
 800	unsigned long addr;
 801	int ret;
 802
 803	*kaddr = ioremap(phys_addr, size);
 804	if (!*kaddr)
 805		return -ENOMEM;
 806
 807	if (is_kernel_in_hyp_mode()) {
 808		*haddr = *kaddr;
 809		return 0;
 810	}
 811
 812	ret = __create_hyp_private_mapping(phys_addr, size,
 813					   &addr, PAGE_HYP_DEVICE);
 814	if (ret) {
 815		iounmap(*kaddr);
 816		*kaddr = NULL;
 817		*haddr = NULL;
 818		return ret;
 819	}
 820
 821	*haddr = (void __iomem *)addr;
 822	return 0;
 823}
 824
 825/**
 826 * create_hyp_exec_mappings - Map an executable range into HYP
 827 * @phys_addr:	The physical start address which gets mapped
 828 * @size:	Size of the region being mapped
 829 * @haddr:	HYP VA for this mapping
 830 */
 831int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 832			     void **haddr)
 833{
 834	unsigned long addr;
 835	int ret;
 836
 837	BUG_ON(is_kernel_in_hyp_mode());
 838
 839	ret = __create_hyp_private_mapping(phys_addr, size,
 840					   &addr, PAGE_HYP_EXEC);
 841	if (ret) {
 842		*haddr = NULL;
 843		return ret;
 844	}
 845
 846	*haddr = (void *)addr;
 847	return 0;
 848}
 849
 850/**
 851 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 852 * @kvm:	The KVM struct pointer for the VM.
 853 *
 854 * Allocates only the stage-2 HW PGD level table(s) (can support either full
 855 * 40-bit input addresses or limited to 32-bit input addresses). Clears the
 856 * allocated pages.
 857 *
 858 * Note we don't need locking here as this is only called when the VM is
 859 * created, which can only be done once.
 860 */
 861int kvm_alloc_stage2_pgd(struct kvm *kvm)
 862{
 863	pgd_t *pgd;
 864
 865	if (kvm->arch.pgd != NULL) {
 866		kvm_err("kvm_arch already initialized?\n");
 867		return -EINVAL;
 868	}
 869
 870	/* Allocate the HW PGD, making sure that each page gets its own refcount */
 871	pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
 872	if (!pgd)
 873		return -ENOMEM;
 874
 875	kvm->arch.pgd = pgd;
 876	return 0;
 877}
 878
 879static void stage2_unmap_memslot(struct kvm *kvm,
 880				 struct kvm_memory_slot *memslot)
 881{
 882	hva_t hva = memslot->userspace_addr;
 883	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 884	phys_addr_t size = PAGE_SIZE * memslot->npages;
 885	hva_t reg_end = hva + size;
 886
 887	/*
 888	 * A memory region could potentially cover multiple VMAs, and any holes
 889	 * between them, so iterate over all of them to find out if we should
 890	 * unmap any of them.
 891	 *
 892	 *     +--------------------------------------------+
 893	 * +---------------+----------------+   +----------------+
 894	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
 895	 * +---------------+----------------+   +----------------+
 896	 *     |               memory region                |
 897	 *     +--------------------------------------------+
 898	 */
 899	do {
 900		struct vm_area_struct *vma = find_vma(current->mm, hva);
 901		hva_t vm_start, vm_end;
 902
 903		if (!vma || vma->vm_start >= reg_end)
 904			break;
 905
 906		/*
 907		 * Take the intersection of this VMA with the memory region
 908		 */
 909		vm_start = max(hva, vma->vm_start);
 910		vm_end = min(reg_end, vma->vm_end);
 911
 912		if (!(vma->vm_flags & VM_PFNMAP)) {
 913			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
 914			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
 915		}
 916		hva = vm_end;
 917	} while (hva < reg_end);
 918}
 919
 920/**
 921 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 922 * @kvm: The struct kvm pointer
 923 *
 924 * Go through the memregions and unmap any reguler RAM
 925 * backing memory already mapped to the VM.
 926 */
 927void stage2_unmap_vm(struct kvm *kvm)
 928{
 929	struct kvm_memslots *slots;
 930	struct kvm_memory_slot *memslot;
 931	int idx;
 932
 933	idx = srcu_read_lock(&kvm->srcu);
 934	down_read(&current->mm->mmap_sem);
 935	spin_lock(&kvm->mmu_lock);
 936
 937	slots = kvm_memslots(kvm);
 938	kvm_for_each_memslot(memslot, slots)
 939		stage2_unmap_memslot(kvm, memslot);
 940
 941	spin_unlock(&kvm->mmu_lock);
 942	up_read(&current->mm->mmap_sem);
 943	srcu_read_unlock(&kvm->srcu, idx);
 944}
 945
 946/**
 947 * kvm_free_stage2_pgd - free all stage-2 tables
 948 * @kvm:	The KVM struct pointer for the VM.
 949 *
 950 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 951 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 952 * and setting the struct pointer to NULL.
 953 */
 954void kvm_free_stage2_pgd(struct kvm *kvm)
 955{
 956	void *pgd = NULL;
 957
 958	spin_lock(&kvm->mmu_lock);
 959	if (kvm->arch.pgd) {
 960		unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
 961		pgd = READ_ONCE(kvm->arch.pgd);
 962		kvm->arch.pgd = NULL;
 963	}
 964	spin_unlock(&kvm->mmu_lock);
 965
 966	/* Free the HW pgd, one page at a time */
 967	if (pgd)
 968		free_pages_exact(pgd, S2_PGD_SIZE);
 969}
 970
 971static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 972			     phys_addr_t addr)
 973{
 974	pgd_t *pgd;
 975	pud_t *pud;
 976
 977	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 978	if (WARN_ON(stage2_pgd_none(*pgd))) {
 979		if (!cache)
 980			return NULL;
 981		pud = mmu_memory_cache_alloc(cache);
 982		stage2_pgd_populate(pgd, pud);
 983		get_page(virt_to_page(pgd));
 984	}
 985
 986	return stage2_pud_offset(pgd, addr);
 987}
 988
 989static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 990			     phys_addr_t addr)
 991{
 992	pud_t *pud;
 993	pmd_t *pmd;
 994
 995	pud = stage2_get_pud(kvm, cache, addr);
 996	if (!pud)
 997		return NULL;
 998
 999	if (stage2_pud_none(*pud)) {
1000		if (!cache)
1001			return NULL;
1002		pmd = mmu_memory_cache_alloc(cache);
1003		stage2_pud_populate(pud, pmd);
1004		get_page(virt_to_page(pud));
1005	}
1006
1007	return stage2_pmd_offset(pud, addr);
1008}
1009
1010static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1011			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
1012{
1013	pmd_t *pmd, old_pmd;
1014
1015	pmd = stage2_get_pmd(kvm, cache, addr);
1016	VM_BUG_ON(!pmd);
1017
1018	/*
1019	 * Mapping in huge pages should only happen through a fault.  If a
1020	 * page is merged into a transparent huge page, the individual
1021	 * subpages of that huge page should be unmapped through MMU
1022	 * notifiers before we get here.
1023	 *
1024	 * Merging of CompoundPages is not supported; they should become
1025	 * splitting first, unmapped, merged, and mapped back in on-demand.
1026	 */
1027	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
1028
1029	old_pmd = *pmd;
1030	if (pmd_present(old_pmd)) {
1031		pmd_clear(pmd);
1032		kvm_tlb_flush_vmid_ipa(kvm, addr);
1033	} else {
1034		get_page(virt_to_page(pmd));
1035	}
1036
1037	kvm_set_pmd(pmd, *new_pmd);
1038	return 0;
1039}
1040
1041static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
1042{
1043	pmd_t *pmdp;
1044	pte_t *ptep;
1045
1046	pmdp = stage2_get_pmd(kvm, NULL, addr);
1047	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1048		return false;
1049
1050	if (pmd_thp_or_huge(*pmdp))
1051		return kvm_s2pmd_exec(pmdp);
1052
1053	ptep = pte_offset_kernel(pmdp, addr);
1054	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1055		return false;
1056
1057	return kvm_s2pte_exec(ptep);
1058}
1059
1060static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1061			  phys_addr_t addr, const pte_t *new_pte,
1062			  unsigned long flags)
1063{
1064	pmd_t *pmd;
1065	pte_t *pte, old_pte;
1066	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1067	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1068
1069	VM_BUG_ON(logging_active && !cache);
1070
1071	/* Create stage-2 page table mapping - Levels 0 and 1 */
1072	pmd = stage2_get_pmd(kvm, cache, addr);
1073	if (!pmd) {
1074		/*
1075		 * Ignore calls from kvm_set_spte_hva for unallocated
1076		 * address ranges.
1077		 */
1078		return 0;
1079	}
1080
1081	/*
1082	 * While dirty page logging - dissolve huge PMD, then continue on to
1083	 * allocate page.
1084	 */
1085	if (logging_active)
1086		stage2_dissolve_pmd(kvm, addr, pmd);
1087
1088	/* Create stage-2 page mappings - Level 2 */
1089	if (pmd_none(*pmd)) {
1090		if (!cache)
1091			return 0; /* ignore calls from kvm_set_spte_hva */
1092		pte = mmu_memory_cache_alloc(cache);
1093		pmd_populate_kernel(NULL, pmd, pte);
1094		get_page(virt_to_page(pmd));
1095	}
1096
1097	pte = pte_offset_kernel(pmd, addr);
1098
1099	if (iomap && pte_present(*pte))
1100		return -EFAULT;
1101
1102	/* Create 2nd stage page table mapping - Level 3 */
1103	old_pte = *pte;
1104	if (pte_present(old_pte)) {
1105		kvm_set_pte(pte, __pte(0));
1106		kvm_tlb_flush_vmid_ipa(kvm, addr);
1107	} else {
1108		get_page(virt_to_page(pte));
1109	}
1110
1111	kvm_set_pte(pte, *new_pte);
1112	return 0;
1113}
1114
1115#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1116static int stage2_ptep_test_and_clear_young(pte_t *pte)
1117{
1118	if (pte_young(*pte)) {
1119		*pte = pte_mkold(*pte);
1120		return 1;
1121	}
1122	return 0;
1123}
1124#else
1125static int stage2_ptep_test_and_clear_young(pte_t *pte)
1126{
1127	return __ptep_test_and_clear_young(pte);
1128}
1129#endif
1130
1131static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1132{
1133	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1134}
1135
1136/**
1137 * kvm_phys_addr_ioremap - map a device range to guest IPA
1138 *
1139 * @kvm:	The KVM pointer
1140 * @guest_ipa:	The IPA at which to insert the mapping
1141 * @pa:		The physical address of the device
1142 * @size:	The size of the mapping
1143 */
1144int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1145			  phys_addr_t pa, unsigned long size, bool writable)
1146{
1147	phys_addr_t addr, end;
1148	int ret = 0;
1149	unsigned long pfn;
1150	struct kvm_mmu_memory_cache cache = { 0, };
1151
1152	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1153	pfn = __phys_to_pfn(pa);
1154
1155	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1156		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
1157
1158		if (writable)
1159			pte = kvm_s2pte_mkwrite(pte);
1160
1161		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
1162						KVM_NR_MEM_OBJS);
1163		if (ret)
1164			goto out;
1165		spin_lock(&kvm->mmu_lock);
1166		ret = stage2_set_pte(kvm, &cache, addr, &pte,
1167						KVM_S2PTE_FLAG_IS_IOMAP);
1168		spin_unlock(&kvm->mmu_lock);
1169		if (ret)
1170			goto out;
1171
1172		pfn++;
1173	}
1174
1175out:
1176	mmu_free_memory_cache(&cache);
1177	return ret;
1178}
1179
1180static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1181{
1182	kvm_pfn_t pfn = *pfnp;
1183	gfn_t gfn = *ipap >> PAGE_SHIFT;
1184
1185	if (PageTransCompoundMap(pfn_to_page(pfn))) {
1186		unsigned long mask;
1187		/*
1188		 * The address we faulted on is backed by a transparent huge
1189		 * page.  However, because we map the compound huge page and
1190		 * not the individual tail page, we need to transfer the
1191		 * refcount to the head page.  We have to be careful that the
1192		 * THP doesn't start to split while we are adjusting the
1193		 * refcounts.
1194		 *
1195		 * We are sure this doesn't happen, because mmu_notifier_retry
1196		 * was successful and we are holding the mmu_lock, so if this
1197		 * THP is trying to split, it will be blocked in the mmu
1198		 * notifier before touching any of the pages, specifically
1199		 * before being able to call __split_huge_page_refcount().
1200		 *
1201		 * We can therefore safely transfer the refcount from PG_tail
1202		 * to PG_head and switch the pfn from a tail page to the head
1203		 * page accordingly.
1204		 */
1205		mask = PTRS_PER_PMD - 1;
1206		VM_BUG_ON((gfn & mask) != (pfn & mask));
1207		if (pfn & mask) {
1208			*ipap &= PMD_MASK;
1209			kvm_release_pfn_clean(pfn);
1210			pfn &= ~mask;
1211			kvm_get_pfn(pfn);
1212			*pfnp = pfn;
1213		}
1214
1215		return true;
1216	}
1217
1218	return false;
1219}
1220
1221static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
1222{
1223	if (kvm_vcpu_trap_is_iabt(vcpu))
1224		return false;
1225
1226	return kvm_vcpu_dabt_iswrite(vcpu);
1227}
1228
1229/**
1230 * stage2_wp_ptes - write protect PMD range
1231 * @pmd:	pointer to pmd entry
1232 * @addr:	range start address
1233 * @end:	range end address
1234 */
1235static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1236{
1237	pte_t *pte;
1238
1239	pte = pte_offset_kernel(pmd, addr);
1240	do {
1241		if (!pte_none(*pte)) {
1242			if (!kvm_s2pte_readonly(pte))
1243				kvm_set_s2pte_readonly(pte);
1244		}
1245	} while (pte++, addr += PAGE_SIZE, addr != end);
1246}
1247
1248/**
1249 * stage2_wp_pmds - write protect PUD range
1250 * @pud:	pointer to pud entry
1251 * @addr:	range start address
1252 * @end:	range end address
1253 */
1254static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
1255{
1256	pmd_t *pmd;
1257	phys_addr_t next;
1258
1259	pmd = stage2_pmd_offset(pud, addr);
1260
1261	do {
1262		next = stage2_pmd_addr_end(addr, end);
1263		if (!pmd_none(*pmd)) {
1264			if (pmd_thp_or_huge(*pmd)) {
1265				if (!kvm_s2pmd_readonly(pmd))
1266					kvm_set_s2pmd_readonly(pmd);
1267			} else {
1268				stage2_wp_ptes(pmd, addr, next);
1269			}
1270		}
1271	} while (pmd++, addr = next, addr != end);
1272}
1273
1274/**
1275  * stage2_wp_puds - write protect PGD range
1276  * @pgd:	pointer to pgd entry
1277  * @addr:	range start address
1278  * @end:	range end address
1279  *
1280  * Process PUD entries, for a huge PUD we cause a panic.
1281  */
1282static void  stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
1283{
1284	pud_t *pud;
1285	phys_addr_t next;
1286
1287	pud = stage2_pud_offset(pgd, addr);
1288	do {
1289		next = stage2_pud_addr_end(addr, end);
1290		if (!stage2_pud_none(*pud)) {
1291			/* TODO:PUD not supported, revisit later if supported */
1292			BUG_ON(stage2_pud_huge(*pud));
1293			stage2_wp_pmds(pud, addr, next);
1294		}
1295	} while (pud++, addr = next, addr != end);
1296}
1297
1298/**
1299 * stage2_wp_range() - write protect stage2 memory region range
1300 * @kvm:	The KVM pointer
1301 * @addr:	Start address of range
1302 * @end:	End address of range
1303 */
1304static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1305{
1306	pgd_t *pgd;
1307	phys_addr_t next;
1308
1309	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
1310	do {
1311		/*
1312		 * Release kvm_mmu_lock periodically if the memory region is
1313		 * large. Otherwise, we may see kernel panics with
1314		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1315		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1316		 * will also starve other vCPUs. We have to also make sure
1317		 * that the page tables are not freed while we released
1318		 * the lock.
1319		 */
1320		cond_resched_lock(&kvm->mmu_lock);
1321		if (!READ_ONCE(kvm->arch.pgd))
1322			break;
1323		next = stage2_pgd_addr_end(addr, end);
1324		if (stage2_pgd_present(*pgd))
1325			stage2_wp_puds(pgd, addr, next);
1326	} while (pgd++, addr = next, addr != end);
1327}
1328
1329/**
1330 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1331 * @kvm:	The KVM pointer
1332 * @slot:	The memory slot to write protect
1333 *
1334 * Called to start logging dirty pages after memory region
1335 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1336 * all present PMD and PTEs are write protected in the memory region.
1337 * Afterwards read of dirty page log can be called.
1338 *
1339 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1340 * serializing operations for VM memory regions.
1341 */
1342void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1343{
1344	struct kvm_memslots *slots = kvm_memslots(kvm);
1345	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1346	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1347	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1348
1349	spin_lock(&kvm->mmu_lock);
1350	stage2_wp_range(kvm, start, end);
1351	spin_unlock(&kvm->mmu_lock);
1352	kvm_flush_remote_tlbs(kvm);
1353}
1354
1355/**
1356 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1357 * @kvm:	The KVM pointer
1358 * @slot:	The memory slot associated with mask
1359 * @gfn_offset:	The gfn offset in memory slot
1360 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1361 *		slot to be write protected
1362 *
1363 * Walks bits set in mask write protects the associated pte's. Caller must
1364 * acquire kvm_mmu_lock.
1365 */
1366static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1367		struct kvm_memory_slot *slot,
1368		gfn_t gfn_offset, unsigned long mask)
1369{
1370	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1371	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1372	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1373
1374	stage2_wp_range(kvm, start, end);
1375}
1376
1377/*
1378 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1379 * dirty pages.
1380 *
1381 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1382 * enable dirty logging for them.
1383 */
1384void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1385		struct kvm_memory_slot *slot,
1386		gfn_t gfn_offset, unsigned long mask)
1387{
1388	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1389}
1390
1391static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1392{
1393	__clean_dcache_guest_page(pfn, size);
1394}
1395
1396static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1397{
1398	__invalidate_icache_guest_page(pfn, size);
1399}
1400
1401static void kvm_send_hwpoison_signal(unsigned long address,
1402				     struct vm_area_struct *vma)
1403{
1404	siginfo_t info;
1405
1406	clear_siginfo(&info);
1407	info.si_signo   = SIGBUS;
1408	info.si_errno   = 0;
1409	info.si_code    = BUS_MCEERR_AR;
1410	info.si_addr    = (void __user *)address;
1411
1412	if (is_vm_hugetlb_page(vma))
1413		info.si_addr_lsb = huge_page_shift(hstate_vma(vma));
1414	else
1415		info.si_addr_lsb = PAGE_SHIFT;
1416
1417	send_sig_info(SIGBUS, &info, current);
1418}
1419
1420static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1421			  struct kvm_memory_slot *memslot, unsigned long hva,
1422			  unsigned long fault_status)
1423{
1424	int ret;
1425	bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
1426	unsigned long mmu_seq;
1427	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1428	struct kvm *kvm = vcpu->kvm;
1429	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1430	struct vm_area_struct *vma;
1431	kvm_pfn_t pfn;
1432	pgprot_t mem_type = PAGE_S2;
1433	bool logging_active = memslot_is_logging(memslot);
1434	unsigned long flags = 0;
1435
1436	write_fault = kvm_is_write_fault(vcpu);
1437	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1438	VM_BUG_ON(write_fault && exec_fault);
1439
1440	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1441		kvm_err("Unexpected L2 read permission error\n");
1442		return -EFAULT;
1443	}
1444
1445	/* Let's check if we will get back a huge page backed by hugetlbfs */
1446	down_read(&current->mm->mmap_sem);
1447	vma = find_vma_intersection(current->mm, hva, hva + 1);
1448	if (unlikely(!vma)) {
1449		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1450		up_read(&current->mm->mmap_sem);
1451		return -EFAULT;
1452	}
1453
1454	if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
1455		hugetlb = true;
1456		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
1457	} else {
1458		/*
1459		 * Pages belonging to memslots that don't have the same
1460		 * alignment for userspace and IPA cannot be mapped using
1461		 * block descriptors even if the pages belong to a THP for
1462		 * the process, because the stage-2 block descriptor will
1463		 * cover more than a single THP and we loose atomicity for
1464		 * unmapping, updates, and splits of the THP or other pages
1465		 * in the stage-2 block range.
1466		 */
1467		if ((memslot->userspace_addr & ~PMD_MASK) !=
1468		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
1469			force_pte = true;
1470	}
1471	up_read(&current->mm->mmap_sem);
1472
1473	/* We need minimum second+third level pages */
1474	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
1475				     KVM_NR_MEM_OBJS);
1476	if (ret)
1477		return ret;
1478
1479	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1480	/*
1481	 * Ensure the read of mmu_notifier_seq happens before we call
1482	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1483	 * the page we just got a reference to gets unmapped before we have a
1484	 * chance to grab the mmu_lock, which ensure that if the page gets
1485	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1486	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1487	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1488	 */
1489	smp_rmb();
1490
1491	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1492	if (pfn == KVM_PFN_ERR_HWPOISON) {
1493		kvm_send_hwpoison_signal(hva, vma);
1494		return 0;
1495	}
1496	if (is_error_noslot_pfn(pfn))
1497		return -EFAULT;
1498
1499	if (kvm_is_device_pfn(pfn)) {
1500		mem_type = PAGE_S2_DEVICE;
1501		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1502	} else if (logging_active) {
1503		/*
1504		 * Faults on pages in a memslot with logging enabled
1505		 * should not be mapped with huge pages (it introduces churn
1506		 * and performance degradation), so force a pte mapping.
1507		 */
1508		force_pte = true;
1509		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1510
1511		/*
1512		 * Only actually map the page as writable if this was a write
1513		 * fault.
1514		 */
1515		if (!write_fault)
1516			writable = false;
1517	}
1518
1519	spin_lock(&kvm->mmu_lock);
1520	if (mmu_notifier_retry(kvm, mmu_seq))
1521		goto out_unlock;
1522
1523	if (!hugetlb && !force_pte)
1524		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
1525
1526	if (hugetlb) {
1527		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
1528		new_pmd = pmd_mkhuge(new_pmd);
1529		if (writable) {
1530			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1531			kvm_set_pfn_dirty(pfn);
1532		}
1533
1534		if (fault_status != FSC_PERM)
1535			clean_dcache_guest_page(pfn, PMD_SIZE);
1536
1537		if (exec_fault) {
1538			new_pmd = kvm_s2pmd_mkexec(new_pmd);
1539			invalidate_icache_guest_page(pfn, PMD_SIZE);
1540		} else if (fault_status == FSC_PERM) {
1541			/* Preserve execute if XN was already cleared */
1542			if (stage2_is_exec(kvm, fault_ipa))
1543				new_pmd = kvm_s2pmd_mkexec(new_pmd);
1544		}
1545
1546		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1547	} else {
1548		pte_t new_pte = pfn_pte(pfn, mem_type);
1549
1550		if (writable) {
1551			new_pte = kvm_s2pte_mkwrite(new_pte);
1552			kvm_set_pfn_dirty(pfn);
1553			mark_page_dirty(kvm, gfn);
1554		}
1555
1556		if (fault_status != FSC_PERM)
1557			clean_dcache_guest_page(pfn, PAGE_SIZE);
1558
1559		if (exec_fault) {
1560			new_pte = kvm_s2pte_mkexec(new_pte);
1561			invalidate_icache_guest_page(pfn, PAGE_SIZE);
1562		} else if (fault_status == FSC_PERM) {
1563			/* Preserve execute if XN was already cleared */
1564			if (stage2_is_exec(kvm, fault_ipa))
1565				new_pte = kvm_s2pte_mkexec(new_pte);
1566		}
1567
1568		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1569	}
1570
1571out_unlock:
1572	spin_unlock(&kvm->mmu_lock);
1573	kvm_set_pfn_accessed(pfn);
1574	kvm_release_pfn_clean(pfn);
1575	return ret;
1576}
1577
1578/*
1579 * Resolve the access fault by making the page young again.
1580 * Note that because the faulting entry is guaranteed not to be
1581 * cached in the TLB, we don't need to invalidate anything.
1582 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1583 * so there is no need for atomic (pte|pmd)_mkyoung operations.
1584 */
1585static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1586{
1587	pmd_t *pmd;
1588	pte_t *pte;
1589	kvm_pfn_t pfn;
1590	bool pfn_valid = false;
1591
1592	trace_kvm_access_fault(fault_ipa);
1593
1594	spin_lock(&vcpu->kvm->mmu_lock);
1595
1596	pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
1597	if (!pmd || pmd_none(*pmd))	/* Nothing there */
1598		goto out;
1599
1600	if (pmd_thp_or_huge(*pmd)) {	/* THP, HugeTLB */
1601		*pmd = pmd_mkyoung(*pmd);
1602		pfn = pmd_pfn(*pmd);
1603		pfn_valid = true;
1604		goto out;
1605	}
1606
1607	pte = pte_offset_kernel(pmd, fault_ipa);
1608	if (pte_none(*pte))		/* Nothing there either */
1609		goto out;
1610
1611	*pte = pte_mkyoung(*pte);	/* Just a page... */
1612	pfn = pte_pfn(*pte);
1613	pfn_valid = true;
1614out:
1615	spin_unlock(&vcpu->kvm->mmu_lock);
1616	if (pfn_valid)
1617		kvm_set_pfn_accessed(pfn);
1618}
1619
1620/**
1621 * kvm_handle_guest_abort - handles all 2nd stage aborts
1622 * @vcpu:	the VCPU pointer
1623 * @run:	the kvm_run structure
1624 *
1625 * Any abort that gets to the host is almost guaranteed to be caused by a
1626 * missing second stage translation table entry, which can mean that either the
1627 * guest simply needs more memory and we must allocate an appropriate page or it
1628 * can mean that the guest tried to access I/O memory, which is emulated by user
1629 * space. The distinction is based on the IPA causing the fault and whether this
1630 * memory region has been registered as standard RAM by user space.
1631 */
1632int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1633{
1634	unsigned long fault_status;
1635	phys_addr_t fault_ipa;
1636	struct kvm_memory_slot *memslot;
1637	unsigned long hva;
1638	bool is_iabt, write_fault, writable;
1639	gfn_t gfn;
1640	int ret, idx;
1641
1642	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1643
1644	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1645	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1646
1647	/* Synchronous External Abort? */
1648	if (kvm_vcpu_dabt_isextabt(vcpu)) {
1649		/*
1650		 * For RAS the host kernel may handle this abort.
1651		 * There is no need to pass the error into the guest.
1652		 */
1653		if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
1654			return 1;
1655
1656		if (unlikely(!is_iabt)) {
1657			kvm_inject_vabt(vcpu);
1658			return 1;
1659		}
1660	}
1661
1662	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1663			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
1664
1665	/* Check the stage-2 fault is trans. fault or write fault */
1666	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1667	    fault_status != FSC_ACCESS) {
1668		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1669			kvm_vcpu_trap_get_class(vcpu),
1670			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1671			(unsigned long)kvm_vcpu_get_hsr(vcpu));
1672		return -EFAULT;
1673	}
1674
1675	idx = srcu_read_lock(&vcpu->kvm->srcu);
1676
1677	gfn = fault_ipa >> PAGE_SHIFT;
1678	memslot = gfn_to_memslot(vcpu->kvm, gfn);
1679	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1680	write_fault = kvm_is_write_fault(vcpu);
1681	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1682		if (is_iabt) {
1683			/* Prefetch Abort on I/O address */
1684			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1685			ret = 1;
1686			goto out_unlock;
1687		}
1688
1689		/*
1690		 * Check for a cache maintenance operation. Since we
1691		 * ended-up here, we know it is outside of any memory
1692		 * slot. But we can't find out if that is for a device,
1693		 * or if the guest is just being stupid. The only thing
1694		 * we know for sure is that this range cannot be cached.
1695		 *
1696		 * So let's assume that the guest is just being
1697		 * cautious, and skip the instruction.
1698		 */
1699		if (kvm_vcpu_dabt_is_cm(vcpu)) {
1700			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1701			ret = 1;
1702			goto out_unlock;
1703		}
1704
1705		/*
1706		 * The IPA is reported as [MAX:12], so we need to
1707		 * complement it with the bottom 12 bits from the
1708		 * faulting VA. This is always 12 bits, irrespective
1709		 * of the page size.
1710		 */
1711		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1712		ret = io_mem_abort(vcpu, run, fault_ipa);
1713		goto out_unlock;
1714	}
1715
1716	/* Userspace should not be able to register out-of-bounds IPAs */
1717	VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
1718
1719	if (fault_status == FSC_ACCESS) {
1720		handle_access_fault(vcpu, fault_ipa);
1721		ret = 1;
1722		goto out_unlock;
1723	}
1724
1725	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1726	if (ret == 0)
1727		ret = 1;
1728out_unlock:
1729	srcu_read_unlock(&vcpu->kvm->srcu, idx);
1730	return ret;
1731}
1732
1733static int handle_hva_to_gpa(struct kvm *kvm,
1734			     unsigned long start,
1735			     unsigned long end,
1736			     int (*handler)(struct kvm *kvm,
1737					    gpa_t gpa, u64 size,
1738					    void *data),
1739			     void *data)
1740{
1741	struct kvm_memslots *slots;
1742	struct kvm_memory_slot *memslot;
1743	int ret = 0;
1744
1745	slots = kvm_memslots(kvm);
1746
1747	/* we only care about the pages that the guest sees */
1748	kvm_for_each_memslot(memslot, slots) {
1749		unsigned long hva_start, hva_end;
1750		gfn_t gpa;
1751
1752		hva_start = max(start, memslot->userspace_addr);
1753		hva_end = min(end, memslot->userspace_addr +
1754					(memslot->npages << PAGE_SHIFT));
1755		if (hva_start >= hva_end)
1756			continue;
1757
1758		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1759		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
1760	}
1761
1762	return ret;
1763}
1764
1765static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1766{
1767	unmap_stage2_range(kvm, gpa, size);
1768	return 0;
1769}
1770
1771int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1772{
1773	unsigned long end = hva + PAGE_SIZE;
1774
1775	if (!kvm->arch.pgd)
1776		return 0;
1777
1778	trace_kvm_unmap_hva(hva);
1779	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
1780	return 0;
1781}
1782
1783int kvm_unmap_hva_range(struct kvm *kvm,
1784			unsigned long start, unsigned long end)
1785{
1786	if (!kvm->arch.pgd)
1787		return 0;
1788
1789	trace_kvm_unmap_hva_range(start, end);
1790	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
1791	return 0;
1792}
1793
1794static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1795{
1796	pte_t *pte = (pte_t *)data;
1797
1798	WARN_ON(size != PAGE_SIZE);
1799	/*
1800	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
1801	 * flag clear because MMU notifiers will have unmapped a huge PMD before
1802	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1803	 * therefore stage2_set_pte() never needs to clear out a huge PMD
1804	 * through this calling path.
1805	 */
1806	stage2_set_pte(kvm, NULL, gpa, pte, 0);
1807	return 0;
1808}
1809
1810
1811void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1812{
1813	unsigned long end = hva + PAGE_SIZE;
1814	pte_t stage2_pte;
1815
1816	if (!kvm->arch.pgd)
1817		return;
1818
1819	trace_kvm_set_spte_hva(hva);
1820	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
1821	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
1822}
1823
1824static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1825{
1826	pmd_t *pmd;
1827	pte_t *pte;
1828
1829	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
1830	pmd = stage2_get_pmd(kvm, NULL, gpa);
1831	if (!pmd || pmd_none(*pmd))	/* Nothing there */
1832		return 0;
1833
1834	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
1835		return stage2_pmdp_test_and_clear_young(pmd);
1836
1837	pte = pte_offset_kernel(pmd, gpa);
1838	if (pte_none(*pte))
1839		return 0;
1840
1841	return stage2_ptep_test_and_clear_young(pte);
1842}
1843
1844static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1845{
1846	pmd_t *pmd;
1847	pte_t *pte;
1848
1849	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
1850	pmd = stage2_get_pmd(kvm, NULL, gpa);
1851	if (!pmd || pmd_none(*pmd))	/* Nothing there */
1852		return 0;
1853
1854	if (pmd_thp_or_huge(*pmd))		/* THP, HugeTLB */
1855		return pmd_young(*pmd);
1856
1857	pte = pte_offset_kernel(pmd, gpa);
1858	if (!pte_none(*pte))		/* Just a page... */
1859		return pte_young(*pte);
1860
1861	return 0;
1862}
1863
1864int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1865{
1866	if (!kvm->arch.pgd)
1867		return 0;
1868	trace_kvm_age_hva(start, end);
1869	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1870}
1871
1872int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1873{
1874	if (!kvm->arch.pgd)
1875		return 0;
1876	trace_kvm_test_age_hva(hva);
1877	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
1878}
1879
1880void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1881{
1882	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
1883}
1884
1885phys_addr_t kvm_mmu_get_httbr(void)
1886{
1887	if (__kvm_cpu_uses_extended_idmap())
1888		return virt_to_phys(merged_hyp_pgd);
1889	else
1890		return virt_to_phys(hyp_pgd);
1891}
1892
1893phys_addr_t kvm_get_idmap_vector(void)
1894{
1895	return hyp_idmap_vector;
1896}
1897
1898static int kvm_map_idmap_text(pgd_t *pgd)
1899{
1900	int err;
1901
1902	/* Create the idmap in the boot page tables */
1903	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
1904				      hyp_idmap_start, hyp_idmap_end,
1905				      __phys_to_pfn(hyp_idmap_start),
1906				      PAGE_HYP_EXEC);
1907	if (err)
1908		kvm_err("Failed to idmap %lx-%lx\n",
1909			hyp_idmap_start, hyp_idmap_end);
1910
1911	return err;
1912}
1913
1914int kvm_mmu_init(void)
1915{
1916	int err;
1917
1918	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
1919	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1920	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
1921	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1922	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
1923
1924	/*
1925	 * We rely on the linker script to ensure at build time that the HYP
1926	 * init code does not cross a page boundary.
1927	 */
1928	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1929
1930	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1931	kvm_debug("HYP VA range: %lx:%lx\n",
1932		  kern_hyp_va(PAGE_OFFSET),
1933		  kern_hyp_va((unsigned long)high_memory - 1));
1934
1935	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
1936	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
1937	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
1938		/*
1939		 * The idmap page is intersecting with the VA space,
1940		 * it is not safe to continue further.
1941		 */
1942		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
1943		err = -EINVAL;
1944		goto out;
1945	}
1946
1947	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1948	if (!hyp_pgd) {
1949		kvm_err("Hyp mode PGD not allocated\n");
1950		err = -ENOMEM;
1951		goto out;
1952	}
1953
1954	if (__kvm_cpu_uses_extended_idmap()) {
1955		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1956							 hyp_pgd_order);
1957		if (!boot_hyp_pgd) {
1958			kvm_err("Hyp boot PGD not allocated\n");
1959			err = -ENOMEM;
1960			goto out;
1961		}
1962
1963		err = kvm_map_idmap_text(boot_hyp_pgd);
1964		if (err)
1965			goto out;
1966
1967		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1968		if (!merged_hyp_pgd) {
1969			kvm_err("Failed to allocate extra HYP pgd\n");
1970			goto out;
1971		}
1972		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
1973				    hyp_idmap_start);
1974	} else {
1975		err = kvm_map_idmap_text(hyp_pgd);
1976		if (err)
1977			goto out;
1978	}
1979
1980	io_map_base = hyp_idmap_start;
1981	return 0;
1982out:
1983	free_hyp_pgds();
1984	return err;
1985}
1986
1987void kvm_arch_commit_memory_region(struct kvm *kvm,
1988				   const struct kvm_userspace_memory_region *mem,
1989				   const struct kvm_memory_slot *old,
1990				   const struct kvm_memory_slot *new,
1991				   enum kvm_mr_change change)
1992{
1993	/*
1994	 * At this point memslot has been committed and there is an
1995	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
1996	 * memory slot is write protected.
1997	 */
1998	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
1999		kvm_mmu_wp_memory_region(kvm, mem->slot);
2000}
2001
2002int kvm_arch_prepare_memory_region(struct kvm *kvm,
2003				   struct kvm_memory_slot *memslot,
2004				   const struct kvm_userspace_memory_region *mem,
2005				   enum kvm_mr_change change)
2006{
2007	hva_t hva = mem->userspace_addr;
2008	hva_t reg_end = hva + mem->memory_size;
2009	bool writable = !(mem->flags & KVM_MEM_READONLY);
2010	int ret = 0;
2011
2012	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2013			change != KVM_MR_FLAGS_ONLY)
2014		return 0;
2015
2016	/*
2017	 * Prevent userspace from creating a memory region outside of the IPA
2018	 * space addressable by the KVM guest IPA space.
2019	 */
2020	if (memslot->base_gfn + memslot->npages >=
2021	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
2022		return -EFAULT;
2023
2024	down_read(&current->mm->mmap_sem);
2025	/*
2026	 * A memory region could potentially cover multiple VMAs, and any holes
2027	 * between them, so iterate over all of them to find out if we can map
2028	 * any of them right now.
2029	 *
2030	 *     +--------------------------------------------+
2031	 * +---------------+----------------+   +----------------+
2032	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2033	 * +---------------+----------------+   +----------------+
2034	 *     |               memory region                |
2035	 *     +--------------------------------------------+
2036	 */
2037	do {
2038		struct vm_area_struct *vma = find_vma(current->mm, hva);
2039		hva_t vm_start, vm_end;
2040
2041		if (!vma || vma->vm_start >= reg_end)
2042			break;
2043
2044		/*
2045		 * Mapping a read-only VMA is only allowed if the
2046		 * memory region is configured as read-only.
2047		 */
2048		if (writable && !(vma->vm_flags & VM_WRITE)) {
2049			ret = -EPERM;
2050			break;
2051		}
2052
2053		/*
2054		 * Take the intersection of this VMA with the memory region
2055		 */
2056		vm_start = max(hva, vma->vm_start);
2057		vm_end = min(reg_end, vma->vm_end);
2058
2059		if (vma->vm_flags & VM_PFNMAP) {
2060			gpa_t gpa = mem->guest_phys_addr +
2061				    (vm_start - mem->userspace_addr);
2062			phys_addr_t pa;
2063
2064			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2065			pa += vm_start - vma->vm_start;
2066
2067			/* IO region dirty page logging not allowed */
2068			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2069				ret = -EINVAL;
2070				goto out;
2071			}
2072
2073			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2074						    vm_end - vm_start,
2075						    writable);
2076			if (ret)
2077				break;
2078		}
2079		hva = vm_end;
2080	} while (hva < reg_end);
2081
2082	if (change == KVM_MR_FLAGS_ONLY)
2083		goto out;
2084
2085	spin_lock(&kvm->mmu_lock);
2086	if (ret)
2087		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
2088	else
2089		stage2_flush_memslot(kvm, memslot);
2090	spin_unlock(&kvm->mmu_lock);
2091out:
2092	up_read(&current->mm->mmap_sem);
2093	return ret;
2094}
2095
2096void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
2097			   struct kvm_memory_slot *dont)
2098{
2099}
2100
2101int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
2102			    unsigned long npages)
2103{
2104	return 0;
2105}
2106
2107void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
2108{
2109}
2110
2111void kvm_arch_flush_shadow_all(struct kvm *kvm)
2112{
2113	kvm_free_stage2_pgd(kvm);
2114}
2115
2116void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2117				   struct kvm_memory_slot *slot)
2118{
2119	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2120	phys_addr_t size = slot->npages << PAGE_SHIFT;
2121
2122	spin_lock(&kvm->mmu_lock);
2123	unmap_stage2_range(kvm, gpa, size);
2124	spin_unlock(&kvm->mmu_lock);
2125}
2126
2127/*
2128 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2129 *
2130 * Main problems:
2131 * - S/W ops are local to a CPU (not broadcast)
2132 * - We have line migration behind our back (speculation)
2133 * - System caches don't support S/W at all (damn!)
2134 *
2135 * In the face of the above, the best we can do is to try and convert
2136 * S/W ops to VA ops. Because the guest is not allowed to infer the
2137 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2138 * which is a rather good thing for us.
2139 *
2140 * Also, it is only used when turning caches on/off ("The expected
2141 * usage of the cache maintenance instructions that operate by set/way
2142 * is associated with the cache maintenance instructions associated
2143 * with the powerdown and powerup of caches, if this is required by
2144 * the implementation.").
2145 *
2146 * We use the following policy:
2147 *
2148 * - If we trap a S/W operation, we enable VM trapping to detect
2149 *   caches being turned on/off, and do a full clean.
2150 *
2151 * - We flush the caches on both caches being turned on and off.
2152 *
2153 * - Once the caches are enabled, we stop trapping VM ops.
2154 */
2155void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2156{
2157	unsigned long hcr = *vcpu_hcr(vcpu);
2158
2159	/*
2160	 * If this is the first time we do a S/W operation
2161	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
2162	 * VM trapping.
2163	 *
2164	 * Otherwise, rely on the VM trapping to wait for the MMU +
2165	 * Caches to be turned off. At that point, we'll be able to
2166	 * clean the caches again.
2167	 */
2168	if (!(hcr & HCR_TVM)) {
2169		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2170					vcpu_has_cache_enabled(vcpu));
2171		stage2_flush_vm(vcpu->kvm);
2172		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2173	}
2174}
2175
2176void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2177{
2178	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2179
2180	/*
2181	 * If switching the MMU+caches on, need to invalidate the caches.
2182	 * If switching it off, need to clean the caches.
2183	 * Clean + invalidate does the trick always.
2184	 */
2185	if (now_enabled != was_enabled)
2186		stage2_flush_vm(vcpu->kvm);
2187
2188	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2189	if (now_enabled)
2190		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2191
2192	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2193}
Configure Feed

Configure Feed