Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

KVM: MMU: Don't assume struct page for x86

This patch introduces a gfn_to_pfn() function, along with corresponding helpers such as
kvm_release_pfn_dirty(). Using these new functions, the x86 MMU can be changed to no longer
assume that it can always get a struct page for any given gfn.
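
A minimal sketch of the resulting caller pattern, condensed from the nonpaging_map() hunk
below (locking and largepage handling omitted; the reference taken here is dropped later via
kvm_release_pfn_clean() or kvm_release_pfn_dirty()):

        pfn_t pfn;

        pfn = gfn_to_pfn(vcpu->kvm, gfn);       /* a pfn, not a struct page */
        if (is_error_pfn(pfn)) {                /* unmapped gfn, e.g. mmio */
                kvm_release_pfn_clean(pfn);
                return 1;
        }
        r = __direct_map(vcpu, v, write, largepage, gfn, pfn, PT32E_ROOT_LEVEL);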

We don't want to eliminate gfn_to_page() entirely, because a number of places assume
they can call gfn_to_page() and then kmap() the result. Once IO memory is supported,
gfn_to_page() will fail for IO pages, whereas gfn_to_pfn() will succeed.
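
As the virt/kvm/kvm_main.c hunk below shows, gfn_to_page() becomes a thin wrapper around
the new primitive, so a typical kmap() user keeps working unchanged (hypothetical caller,
shown only for illustration):

        struct page *page = gfn_to_page(kvm, gfn); /* == pfn_to_page(gfn_to_pfn(kvm, gfn)) */
        void *va = kmap(page);
        /* ... touch guest memory through va ... */
        kunmap(page);
        kvm_release_page_dirty(page);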

This patch does not yet avoid reference counting for reserved RAM or for IO memory;
however, it should make those follow-ups fairly straightforward.
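
For illustration only (not part of this patch), the pfn-based release helpers give a single
place to skip the refcount once pfns without a struct page are allowed, e.g. with a
pfn_valid() check; the sketch below is an assumption about how that might look, not code
from this series:

        void kvm_release_pfn_clean(pfn_t pfn)
        {
                if (!pfn_valid(pfn))    /* hypothetical: IO / reserved, no struct page */
                        return;
                put_page(pfn_to_page(pfn));
        }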

Since we're only introducing new common symbols, I don't expect this to break the
non-x86 architectures, but I haven't tested those. I have tested Intel, AMD, NPT,
and hugetlbfs with Windows and Linux guests.

[avi: fix overflow when shifting left pfns by adding casts]

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Authored by Anthony Liguori, committed by Avi Kivity
35149e21 fdae862f

6 files changed: +134 -69

arch/x86/kvm/mmu.c (+43 -46)
···
         return is_shadow_present_pte(pte);
 }
 
-static struct page *spte_to_page(u64 pte)
+static pfn_t spte_to_pfn(u64 pte)
 {
-        hfn_t hfn = (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-
-        return pfn_to_page(hfn);
+        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
 static gfn_t pse36_gfn_delta(u32 gpte)
···
         struct kvm_rmap_desc *desc;
         struct kvm_rmap_desc *prev_desc;
         struct kvm_mmu_page *sp;
-        struct page *page;
+        pfn_t pfn;
         unsigned long *rmapp;
         int i;
 
         if (!is_rmap_pte(*spte))
                 return;
         sp = page_header(__pa(spte));
-        page = spte_to_page(*spte);
+        pfn = spte_to_pfn(*spte);
         if (*spte & PT_ACCESSED_MASK)
-                mark_page_accessed(page);
+                kvm_set_pfn_accessed(pfn);
         if (is_writeble_pte(*spte))
-                kvm_release_page_dirty(page);
+                kvm_release_pfn_dirty(pfn);
         else
-                kvm_release_page_clean(page);
+                kvm_release_pfn_clean(pfn);
         rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
         if (!*rmapp) {
                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
···
                 spte = rmap_next(kvm, rmapp, spte);
         }
         if (write_protected) {
-                struct page *page;
+                pfn_t pfn;
 
                 spte = rmap_next(kvm, rmapp, NULL);
-                page = spte_to_page(*spte);
-                SetPageDirty(page);
+                pfn = spte_to_pfn(*spte);
+                kvm_set_pfn_dirty(pfn);
         }
 
         /* check for huge page mappings */
···
                          unsigned pt_access, unsigned pte_access,
                          int user_fault, int write_fault, int dirty,
                          int *ptwrite, int largepage, gfn_t gfn,
-                         struct page *page, bool speculative)
+                         pfn_t pfn, bool speculative)
 {
         u64 spte;
         int was_rmapped = 0;
···
 
                         child = page_header(pte & PT64_BASE_ADDR_MASK);
                         mmu_page_remove_parent_pte(child, shadow_pte);
-                } else if (page != spte_to_page(*shadow_pte)) {
+                } else if (pfn != spte_to_pfn(*shadow_pte)) {
                         pgprintk("hfn old %lx new %lx\n",
-                                 page_to_pfn(spte_to_page(*shadow_pte)),
-                                 page_to_pfn(page));
+                                 spte_to_pfn(*shadow_pte), pfn);
                         rmap_remove(vcpu->kvm, shadow_pte);
                 } else {
                         if (largepage)
···
         if (largepage)
                 spte |= PT_PAGE_SIZE_MASK;
 
-        spte |= page_to_phys(page);
+        spte |= (u64)pfn << PAGE_SHIFT;
 
         if ((pte_access & ACC_WRITE_MASK)
             || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
···
         if (!was_rmapped) {
                 rmap_add(vcpu, shadow_pte, gfn, largepage);
                 if (!is_rmap_pte(*shadow_pte))
-                        kvm_release_page_clean(page);
+                        kvm_release_pfn_clean(pfn);
         } else {
                 if (was_writeble)
-                        kvm_release_page_dirty(page);
+                        kvm_release_pfn_dirty(pfn);
                 else
-                        kvm_release_page_clean(page);
+                        kvm_release_pfn_clean(pfn);
         }
         if (!ptwrite || !*ptwrite)
                 vcpu->arch.last_pte_updated = shadow_pte;
···
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-                        int largepage, gfn_t gfn, struct page *page,
+                        int largepage, gfn_t gfn, pfn_t pfn,
                         int level)
 {
         hpa_t table_addr = vcpu->arch.mmu.root_hpa;
···
 
                 if (level == 1) {
                         mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                     0, write, 1, &pt_write, 0, gfn, page, false);
+                                     0, write, 1, &pt_write, 0, gfn, pfn, false);
                         return pt_write;
                 }
 
                 if (largepage && level == 2) {
                         mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                     0, write, 1, &pt_write, 1, gfn, page, false);
+                                     0, write, 1, &pt_write, 1, gfn, pfn, false);
                         return pt_write;
                 }
···
                                              1, ACC_ALL, &table[index]);
                         if (!new_table) {
                                 pgprintk("nonpaging_map: ENOMEM\n");
-                                kvm_release_page_clean(page);
+                                kvm_release_pfn_clean(pfn);
                                 return -ENOMEM;
                         }
 
···
 {
         int r;
         int largepage = 0;
-
-        struct page *page;
+        pfn_t pfn;
 
         down_read(&current->mm->mmap_sem);
         if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
···
                 largepage = 1;
         }
 
-        page = gfn_to_page(vcpu->kvm, gfn);
+        pfn = gfn_to_pfn(vcpu->kvm, gfn);
         up_read(&current->mm->mmap_sem);
 
         /* mmio */
-        if (is_error_page(page)) {
-                kvm_release_page_clean(page);
+        if (is_error_pfn(pfn)) {
+                kvm_release_pfn_clean(pfn);
                 return 1;
         }
 
         spin_lock(&vcpu->kvm->mmu_lock);
         kvm_mmu_free_some_pages(vcpu);
-        r = __direct_map(vcpu, v, write, largepage, gfn, page,
+        r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
                          PT32E_ROOT_LEVEL);
         spin_unlock(&vcpu->kvm->mmu_lock);
 
···
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                                 u32 error_code)
 {
-        struct page *page;
+        pfn_t pfn;
         int r;
         int largepage = 0;
         gfn_t gfn = gpa >> PAGE_SHIFT;
···
                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                 largepage = 1;
         }
-        page = gfn_to_page(vcpu->kvm, gfn);
+        pfn = gfn_to_pfn(vcpu->kvm, gfn);
         up_read(&current->mm->mmap_sem);
-        if (is_error_page(page)) {
-                kvm_release_page_clean(page);
+        if (is_error_pfn(pfn)) {
+                kvm_release_pfn_clean(pfn);
                 return 1;
         }
         spin_lock(&vcpu->kvm->mmu_lock);
         kvm_mmu_free_some_pages(vcpu);
         r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-                         largepage, gfn, page, TDP_ROOT_LEVEL);
+                         largepage, gfn, pfn, TDP_ROOT_LEVEL);
         spin_unlock(&vcpu->kvm->mmu_lock);
 
         return r;
···
 
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+        vcpu->arch.update_pte.pfn = bad_pfn;
+
         if (tdp_enabled)
                 return init_kvm_tdp_mmu(vcpu);
         else
···
         gfn_t gfn;
         int r;
         u64 gpte = 0;
-        struct page *page;
+        pfn_t pfn;
 
         vcpu->arch.update_pte.largepage = 0;
 
···
                 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                 vcpu->arch.update_pte.largepage = 1;
         }
-        page = gfn_to_page(vcpu->kvm, gfn);
+        pfn = gfn_to_pfn(vcpu->kvm, gfn);
         up_read(&current->mm->mmap_sem);
 
-        if (is_error_page(page)) {
-                kvm_release_page_clean(page);
+        if (is_error_pfn(pfn)) {
+                kvm_release_pfn_clean(pfn);
                 return;
         }
         vcpu->arch.update_pte.gfn = gfn;
-        vcpu->arch.update_pte.page = page;
+        vcpu->arch.update_pte.pfn = pfn;
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
···
         }
         kvm_mmu_audit(vcpu, "post pte write");
         spin_unlock(&vcpu->kvm->mmu_lock);
-        if (vcpu->arch.update_pte.page) {
-                kvm_release_page_clean(vcpu->arch.update_pte.page);
-                vcpu->arch.update_pte.page = NULL;
+        if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
+                kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
+                vcpu->arch.update_pte.pfn = bad_pfn;
         }
 }
···
                         audit_mappings_page(vcpu, ent, va, level - 1);
                 } else {
                         gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
-                        struct page *page = gpa_to_page(vcpu, gpa);
-                        hpa_t hpa = page_to_phys(page);
+                        hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
 
                         if (is_shadow_present_pte(ent)
                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
···
                             && !is_error_hpa(hpa))
                                 printk(KERN_ERR "audit: (%s) notrap shadow,"
                                        " valid guest gva %lx\n", audit_msg, va);
-                        kvm_release_page_clean(page);
+                        kvm_release_pfn_clean(pfn);
 
                 }
arch/x86/kvm/paging_tmpl.h (+13 -13)
···
 {
         pt_element_t gpte;
         unsigned pte_access;
-        struct page *npage;
+        pfn_t pfn;
         int largepage = vcpu->arch.update_pte.largepage;
 
         gpte = *(const pt_element_t *)pte;
···
         pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
         if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
                 return;
-        npage = vcpu->arch.update_pte.page;
-        if (!npage)
+        pfn = vcpu->arch.update_pte.pfn;
+        if (is_error_pfn(pfn))
                 return;
-        get_page(npage);
+        kvm_get_pfn(pfn);
         mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
                      gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
-                     npage, true);
+                     pfn, true);
 }
 
 /*
···
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                          struct guest_walker *walker,
                          int user_fault, int write_fault, int largepage,
-                         int *ptwrite, struct page *page)
+                         int *ptwrite, pfn_t pfn)
 {
         hpa_t shadow_addr;
         int level;
···
                                           walker->pte_gpa[level - 2],
                                           &curr_pte, sizeof(curr_pte));
                         if (r || curr_pte != walker->ptes[level - 2]) {
-                                kvm_release_page_clean(page);
+                                kvm_release_pfn_clean(pfn);
                                 return NULL;
                         }
                 }
···
         mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
                      user_fault, write_fault,
                      walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-                     ptwrite, largepage, walker->gfn, page, false);
+                     ptwrite, largepage, walker->gfn, pfn, false);
 
         return shadow_ent;
 }
···
         u64 *shadow_pte;
         int write_pt = 0;
         int r;
-        struct page *page;
+        pfn_t pfn;
         int largepage = 0;
 
         pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
···
                         largepage = 1;
                 }
         }
-        page = gfn_to_page(vcpu->kvm, walker.gfn);
+        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
         up_read(&current->mm->mmap_sem);
 
         /* mmio */
-        if (is_error_page(page)) {
+        if (is_error_pfn(pfn)) {
                 pgprintk("gfn %x is mmio\n", walker.gfn);
-                kvm_release_page_clean(page);
+                kvm_release_pfn_clean(pfn);
                 return 1;
         }
 
         spin_lock(&vcpu->kvm->mmu_lock);
         kvm_mmu_free_some_pages(vcpu);
         shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                                  largepage, &write_pt, page);
+                                  largepage, &write_pt, pfn);
 
         pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
                  shadow_pte, *shadow_pte, write_pt);
include/asm-x86/kvm_host.h (+2 -2)
···
         u64 *last_pte_updated;
 
         struct {
-                gfn_t gfn;          /* presumed gfn during guest pte update */
-                struct page *page;  /* page corresponding to that gfn */
+                gfn_t gfn;          /* presumed gfn during guest pte update */
+                pfn_t pfn;          /* pfn corresponding to that gfn */
                 int largepage;
         } update_pte;
 
include/linux/kvm_host.h (+12 -0)
···
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
 
 extern struct page *bad_page;
+extern pfn_t bad_pfn;
 
 int is_error_page(struct page *page);
+int is_error_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
                           struct kvm_userspace_memory_region *mem,
···
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
+void kvm_set_page_dirty(struct page *page);
+void kvm_set_page_accessed(struct page *page);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+void kvm_release_pfn_dirty(pfn_t);
+void kvm_release_pfn_clean(pfn_t pfn);
+void kvm_set_pfn_dirty(pfn_t pfn);
+void kvm_set_pfn_accessed(pfn_t pfn);
+void kvm_get_pfn(pfn_t pfn);
+
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
                         int len);
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
include/linux/kvm_types.h (+2 -0)
···
 typedef u64 hpa_t;
 typedef unsigned long hfn_t;
 
+typedef hfn_t pfn_t;
+
 struct kvm_pio_request {
         unsigned long count;
         int cur_count;
virt/kvm/kvm_main.c (+62 -8)
···
 #include <linux/kvm_para.h>
 #include <linux/pagemap.h>
 #include <linux/mman.h>
+#include <linux/swap.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
···
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
+int is_error_pfn(pfn_t pfn)
+{
+        return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_error_pfn);
+
 static inline unsigned long bad_hva(void)
 {
         return PAGE_OFFSET;
···
 /*
  * Requires current->mm->mmap_sem to be held
  */
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
         struct page *page[1];
         unsigned long addr;
···
         addr = gfn_to_hva(kvm, gfn);
         if (kvm_is_error_hva(addr)) {
                 get_page(bad_page);
-                return bad_page;
+                return page_to_pfn(bad_page);
         }
 
         npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
···
 
         if (npages != 1) {
                 get_page(bad_page);
-                return bad_page;
+                return page_to_pfn(bad_page);
         }
 
-        return page[0];
+        return page_to_pfn(page[0]);
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_pfn);
+
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+        return pfn_to_page(gfn_to_pfn(kvm, gfn));
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
 {
-        put_page(page);
+        kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
+void kvm_release_pfn_clean(pfn_t pfn)
+{
+        put_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
+
 void kvm_release_page_dirty(struct page *page)
 {
-        if (!PageReserved(page))
-                SetPageDirty(page);
-        put_page(page);
+        kvm_release_pfn_dirty(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+
+void kvm_release_pfn_dirty(pfn_t pfn)
+{
+        kvm_set_pfn_dirty(pfn);
+        kvm_release_pfn_clean(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+
+void kvm_set_page_dirty(struct page *page)
+{
+        kvm_set_pfn_dirty(page_to_pfn(page));
+}
+EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
+
+void kvm_set_pfn_dirty(pfn_t pfn)
+{
+        struct page *page = pfn_to_page(pfn);
+        if (!PageReserved(page))
+                SetPageDirty(page);
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
+
+void kvm_set_pfn_accessed(pfn_t pfn)
+{
+        mark_page_accessed(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
+
+void kvm_get_pfn(pfn_t pfn)
+{
+        get_page(pfn_to_page(pfn));
+}
+EXPORT_SYMBOL_GPL(kvm_get_pfn);
 
 static int next_segment(unsigned long len, int offset)
 {
···
 };
 
 struct page *bad_page;
+pfn_t bad_pfn;
 
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
···
                 r = -ENOMEM;
                 goto out;
         }
+
+        bad_pfn = page_to_pfn(bad_page);
 
         r = kvm_arch_hardware_setup();
         if (r < 0)