Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.17, 992 lines, 28 kB
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *	Yaniv Kamay  <yaniv@qumranet.com>
 *	Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

/*
 * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
 * uses for EPT without A/D paging type.
 */
extern u64 __pure __using_nonexistent_pte_bit(void)
	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");

#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS 4
	#define CMPXCHG cmpxchg
	#else
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
	#define PT_LEVEL_BITS PT32_LEVEL_BITS
	#define PT_MAX_FULL_LEVELS 2
	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define CMPXCHG cmpxchg
#elif PTTYPE == PTTYPE_EPT
	#define pt_element_t u64
	#define guest_walker guest_walkerEPT
	#define FNAME(name) ept_##name
	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
	#define PT_LEVEL_BITS PT64_LEVEL_BITS
	#define PT_GUEST_ACCESSED_MASK 0
	#define PT_GUEST_DIRTY_MASK 0
	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
	#define CMPXCHG cmpxchg64
	#define PT_MAX_FULL_LEVELS 4
#else
	#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	bool pte_writable[PT_MAX_FULL_LEVELS];
	unsigned pt_access;
	unsigned pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_GUEST_DIRTY_MASK)
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}

static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
{
	int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;

	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
		((mmu->bad_mt_xwr & (1ull << low6)) != 0);
}

static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
	return is_present_gpte(pte);
#else
	return pte & 7;
#endif
}

static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       pt_element_t __user *ptep_user, unsigned index,
			       pt_element_t orig_pte, pt_element_t new_pte)
{
	int npages;
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
	/* Check if the user is doing something meaningless. */
	if (unlikely(npages != 1))
		return -EFAULT;

	table = kmap_atomic(page);
	ret = CMPXCHG(&table[index], orig_pte, new_pte);
	kunmap_atomic(table);

	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}

static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
					 struct kvm_mmu_page *sp, u64 *spte,
					 u64 gpte)
{
	if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
		goto no_present;

	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

	/* if accessed bit is not supported prefetch non accessed gpte */
	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		ACC_USER_MASK;
#else
	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
	access &= ~(gpte >> PT64_NX_SHIFT);
#endif

	return access;
}

static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	/* dirty/accessed bits are not supported, so no need to update them */
	if (!PT_GUEST_DIRTY_MASK)
		return 0;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault &&
				!(pte & PT_GUEST_DIRTY_MASK)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
		if (ret)
			return ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		walker->ptes[level] = pte;
	}
	return 0;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gva_t addr, u32 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *uninitialized_var(ptep_user);
	gfn_t table_gfn;
	unsigned index, pt_access, pte_access, accessed_dirty;
	gpa_t pte_gpa;
	int offset;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->root_level;
	pte = mmu->get_cr3(vcpu);

#if PTTYPE == 64
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!FNAME(is_present_gpte)(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

	accessed_dirty = PT_GUEST_ACCESSED_MASK;
	pt_access = pte_access = ACC_ALL;
	++walker->level;

	do {
		gfn_t real_gfn;
		unsigned long host_addr;

		pt_access &= pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		offset = index * sizeof(pt_element_t);
		pte_gpa = gfn_to_gpa(table_gfn) + offset;
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
					      PFERR_USER_MASK|PFERR_WRITE_MASK);
		if (unlikely(real_gfn == UNMAPPED_GVA))
			goto error;
		real_gfn = gpa_to_gfn(real_gfn);

		host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
					    &walker->pte_writable[walker->level - 1]);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		if (unlikely(!FNAME(is_present_gpte)(pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
						     walker->level))) {
			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		accessed_dirty &= pte;
		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;
	} while (!is_last_gpte(mmu, walker->level, pte));

	if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
		errcode |= PFERR_PRESENT_MASK;
		goto error;
	}

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
	if (real_gpa == UNMAPPED_GVA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		FNAME(protect_clean_gpte)(&pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bits support accessed_dirty will be
		 * always clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, pte_access, pt_access);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (mmu->nx ||
			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to to tell if EPT
	 * misconfiguration requires to be injected. The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		vcpu->arch.exit_qualification &= 0x187;
		vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
	}
#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
					access);
}

#if PTTYPE != PTTYPE_EPT
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				   struct kvm_vcpu *vcpu, gva_t addr,
				   u32 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
					addr, access);
}
#endif

static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
	unsigned pte_access;
	gfn_t gfn;
	pfn_t pfn;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return false;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
	FNAME(protect_clean_gpte)(&pte_access, gpte);
	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
	if (is_error_pfn(pfn))
		return false;

	/*
	 * we call mmu_set_spte() with host_writable = true because
	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
	 */
	mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
		     gfn, pfn, true, true);

	return true;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			      u64 *spte, const void *pte)
{
	pt_element_t gpte = *(const pt_element_t *)pte;

	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PT_PAGE_TABLE_LEVEL) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = page_header(__pa(sptep));

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation, return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *gw,
			 int write_fault, int hlevel,
			 pfn_t pfn, bool map_writable, bool prefault)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned direct_access, access = gw->pt_access;
	int top_level, emulate = 0;

	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu.root_level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		drop_large_spte(vcpu, it.sptep);

		sp = NULL;
		if (!is_shadow_present_pte(*it.sptep)) {
			table_gfn = gw->table_gfn[it.level - 2];
			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
					      false, access, it.sptep);
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp)
			link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
	}

	for (;
	     shadow_walk_okay(&it) && it.level > hlevel;
	     shadow_walk_next(&it)) {
		gfn_t direct_gfn;

		clear_sp_write_flooding_count(it.sptep);
		validate_direct_spte(vcpu, it.sptep, direct_access);

		drop_large_spte(vcpu, it.sptep);

		if (is_shadow_present_pte(*it.sptep))
			continue;

		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
				      true, direct_access, it.sptep);
		link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
	}

	clear_sp_write_flooding_count(it.sptep);
	mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
		     it.level, gw->gfn, pfn, prefault, map_writable);
	FNAME(pte_prefetch)(vcpu, gw, it.sptep);

	return emulate;

out_gpte_changed:
	if (sp)
		kvm_mmu_put_page(sp, it.sptep);
	kvm_release_pfn_clean(pfn);
	return 0;
}

/*
 * To see whether the mapped gfn can write its page table in the current
 * mapping.
 *
 * It is the helper function of FNAME(page_fault). When guest uses large page
 * size to map the writable gfn which is used as current page table, we should
 * force kvm to use small page size to map it because new shadow page will be
 * created when kvm establishes shadow page table that stop kvm using large
 * page size. Do it early can avoid unnecessary #PF and emulation.
 *
 * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 * currently used as its page table.
 *
 * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
 * since the PDPT is always shadowed, that means, we can not use large page
 * size to map the gfn which is used as PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
			      struct guest_walker *walker, int user_fault,
			      bool *write_fault_to_shadow_pgtable)
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
	bool self_changed = false;

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	      (!is_write_protection(vcpu) && !user_fault)))
		return false;

	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}

	return self_changed;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *           a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
			     bool prefault)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	struct guest_walker walker;
	int r;
	pfn_t pfn;
	int level = PT_PAGE_TABLE_LEVEL;
	int force_pt_level;
	unsigned long mmu_seq;
	bool map_writable, is_self_change_mapping;

	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		r = handle_mmio_page_fault(vcpu, addr, error_code,
					   mmu_is_nested(vcpu));
		if (likely(r != RET_MMIO_PF_INVALID))
			return r;
	};

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the guest pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!prefault)
			inject_page_fault(vcpu, &walker.fault);

		return 0;
	}

	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);

	if (walker.level >= PT_DIRECTORY_LEVEL)
		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
		   || is_self_change_mapping;
	else
		force_pt_level = 1;
	if (!force_pt_level) {
		level = min(walker.level, mapping_level(vcpu, walker.gfn));
		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
			 &map_writable))
		return 0;

	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
				walker.gfn, pfn, walker.pte_access, &r))
		return r;

	/*
	 * Do not change pte_access if the pfn is a mmio page, otherwise
	 * we will cache the incorrect access into mmio spte.
	 */
	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
	     !is_write_protection(vcpu) && !user_fault &&
	      !is_noslot_pfn(pfn)) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
		goto out_unlock;

	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
	make_mmu_pages_available(vcpu);
	if (!force_pt_level)
		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
			 level, pfn, map_writable, prefault);
	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;

out_unlock:
	spin_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return 0;
}

static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu);

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
		WARN_ON(1);
		return;
	}

	spin_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry(vcpu, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = page_header(__pa(sptep));
		if (is_last_spte(*sptep, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
				kvm_flush_remote_tlbs(vcpu->kvm);

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
						  sizeof(pt_element_t)))
				break;

			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
		}

		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
			break;
	}
	spin_unlock(&vcpu->kvm->mmu_lock);
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

#if PTTYPE != PTTYPE_EPT
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
				      u32 access,
				      struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = UNMAPPED_GVA;
	int r;

	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= vaddr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}
#endif

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 *   We should flush all tlbs if spte is dropped even though guest is
 *   responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
 *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
 *   used by guest then tlbs are not flushed, so guest is allowed to access the
 *   freed pages.
 *   And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int i, nr_present = 0;
	bool host_writable;
	gpa_t first_pte_gpa;

	/* direct kvm_mmu_page can not be unsync. */
	BUG_ON(sp->role.direct);

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
					  sizeof(pt_element_t)))
			return -EINVAL;

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= FNAME(gpte_access)(vcpu, gpte);
		FNAME(protect_clean_gpte)(&pte_access, gpte);

		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
		      &nr_present))
			continue;

		if (gfn != sp->gfns[i]) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			vcpu->kvm->tlbs_dirty++;
			continue;
		}

		nr_present++;

		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

		set_spte(vcpu, &sp->spt[i], pte_access,
			 PT_PAGE_TABLE_LEVEL, gfn,
			 spte_to_pfn(sp->spt[i]), true, false,
			 host_writable);
	}

	return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
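Editor's note: as the header comment explains, this template is never compiled on its own; the KVM MMU includes it once per guest pte format so that each inclusion instantiates a separate set of FNAME() functions (for example paging64_page_fault, paging32_page_fault and ept_page_fault). Below is a minimal sketch of how such a multiple-inclusion setup looks. It assumes this file is KVM's paging template included from arch/x86/kvm/mmu.c, which the PTTYPE/FNAME machinery suggests but the excerpt itself does not state; the exact include site and the PTTYPE_EPT value may differ.

/* Sketch only: instantiate the template once per guest pte format. */
#define PTTYPE_EPT 18		/* assumed value; only needs to differ from 32 and 64 */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"	/* emits ept_walk_addr(), ept_page_fault(), ... */
#undef PTTYPE

#define PTTYPE 64
#include "paging_tmpl.h"	/* emits paging64_walk_addr(), paging64_page_fault(), ... */
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"	/* emits paging32_walk_addr(), paging32_page_fault(), ... */
#undef PTTYPE

The #undef block at the end of the template is what makes this pattern work: every per-PTTYPE macro is cleared before the next inclusion redefines it.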