Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'kvm-guest-sev-migration' into kvm-master

Add guest api and guest kernel support for SEV live migration.

Introduces a new hypercall to notify the host of changes to the page
encryption status. If the page is encrypted then it must be migrated
through the SEV firmware or a helper VM sharing the key. If the page
is not encrypted then it can be migrated normally by userspace. This
new hypercall is invoked using paravirt_ops.

Conflicts: sev_active() replaced by cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT).

+209 -16
+12
arch/x86/include/asm/kvm_para.h
··· 83 83 return ret; 84 84 } 85 85 86 + static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1, 87 + unsigned long p2, unsigned long p3) 88 + { 89 + long ret; 90 + 91 + asm volatile("vmmcall" 92 + : "=a"(ret) 93 + : "a"(nr), "b"(p1), "c"(p2), "d"(p3) 94 + : "memory"); 95 + return ret; 96 + } 97 + 86 98 #ifdef CONFIG_KVM_GUEST 87 99 void kvmclock_init(void); 88 100 void kvmclock_disable(void);
+4
arch/x86/include/asm/mem_encrypt.h
··· 44 44 45 45 int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); 46 46 int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); 47 + void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, 48 + bool enc); 47 49 48 50 void __init mem_encrypt_free_decrypted_mem(void); 49 51 ··· 80 78 early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } 81 79 static inline int __init 82 80 early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } 81 + static inline void __init 82 + early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} 83 83 84 84 static inline void mem_encrypt_free_decrypted_mem(void) { } 85 85
+6
arch/x86/include/asm/paravirt.h
··· 97 97 PVOP_VCALL1(mmu.exit_mmap, mm); 98 98 } 99 99 100 + static inline void notify_page_enc_status_changed(unsigned long pfn, 101 + int npages, bool enc) 102 + { 103 + PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); 104 + } 105 + 100 106 #ifdef CONFIG_PARAVIRT_XXL 101 107 static inline void load_sp0(unsigned long sp0) 102 108 {
+1
arch/x86/include/asm/paravirt_types.h
··· 168 168 169 169 /* Hook for intercepting the destruction of an mm_struct. */ 170 170 void (*exit_mmap)(struct mm_struct *mm); 171 + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); 171 172 172 173 #ifdef CONFIG_PARAVIRT_XXL 173 174 struct paravirt_callee_save read_cr2;
+1
arch/x86/include/asm/set_memory.h
··· 83 83 int set_direct_map_invalid_noflush(struct page *page); 84 84 int set_direct_map_default_noflush(struct page *page); 85 85 bool kernel_page_present(struct page *page); 86 + void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc); 86 87 87 88 extern int kernel_set_to_readonly; 88 89
+107
arch/x86/kernel/kvm.c
··· 28 28 #include <linux/swait.h> 29 29 #include <linux/syscore_ops.h> 30 30 #include <linux/cc_platform.h> 31 + #include <linux/efi.h> 31 32 #include <asm/timer.h> 32 33 #include <asm/cpu.h> 33 34 #include <asm/traps.h> ··· 42 41 #include <asm/ptrace.h> 43 42 #include <asm/reboot.h> 44 43 #include <asm/svm.h> 44 + #include <asm/e820/api.h> 45 45 46 46 DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); 47 47 ··· 436 434 kvm_disable_steal_time(); 437 435 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 438 436 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 437 + if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) 438 + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); 439 439 kvm_pv_disable_apf(); 440 440 if (!shutdown) 441 441 apf_task_wake_all(); ··· 551 547 local_mask = new_mask; 552 548 __send_ipi_mask(local_mask, vector); 553 549 } 550 + 551 + static int __init setup_efi_kvm_sev_migration(void) 552 + { 553 + efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; 554 + efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; 555 + efi_status_t status; 556 + unsigned long size; 557 + bool enabled; 558 + 559 + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || 560 + !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) 561 + return 0; 562 + 563 + if (!efi_enabled(EFI_BOOT)) 564 + return 0; 565 + 566 + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { 567 + pr_info("%s : EFI runtime services are not enabled\n", __func__); 568 + return 0; 569 + } 570 + 571 + size = sizeof(enabled); 572 + 573 + /* Get variable contents into buffer */ 574 + status = efi.get_variable(efi_sev_live_migration_enabled, 575 + &efi_variable_guid, NULL, &size, &enabled); 576 + 577 + if (status == EFI_NOT_FOUND) { 578 + pr_info("%s : EFI live migration variable not found\n", __func__); 579 + return 0; 580 + } 581 + 582 + if (status != EFI_SUCCESS) { 583 + pr_info("%s : EFI variable retrieval failed\n", __func__); 584 + return 0; 585 + } 586 + 587 + if (enabled == 0) { 588 + pr_info("%s: live migration disabled in EFI\n", __func__); 589 + return 0; 590 + } 591 + 592 + pr_info("%s : live migration enabled in EFI\n", __func__); 593 + wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); 594 + 595 + return 1; 596 + } 597 + 598 + late_initcall(setup_efi_kvm_sev_migration); 554 599 555 600 /* 556 601 * Set the IPI entry points ··· 859 806 return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); 860 807 } 861 808 809 + static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) 810 + { 811 + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, 812 + KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); 813 + } 814 + 862 815 static void __init kvm_init_platform(void) 863 816 { 817 + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && 818 + kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { 819 + unsigned long nr_pages; 820 + int i; 821 + 822 + pv_ops.mmu.notify_page_enc_status_changed = 823 + kvm_sev_hc_page_enc_status; 824 + 825 + /* 826 + * Reset the host's shared pages list related to kernel 827 + * specific page encryption status settings before we load a 828 + * new kernel by kexec. Reset the page encryption status 829 + * during early boot instead of just before kexec to avoid SMP 830 + * races during kvm_pv_guest_cpu_reboot(). 831 + * NOTE: We cannot reset the complete shared pages list 832 + * here as we need to retain the UEFI/OVMF firmware 833 + * specific settings. 834 + */ 835 + 836 + for (i = 0; i < e820_table->nr_entries; i++) { 837 + struct e820_entry *entry = &e820_table->entries[i]; 838 + 839 + if (entry->type != E820_TYPE_RAM) 840 + continue; 841 + 842 + nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); 843 + 844 + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, 845 + nr_pages, 846 + KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); 847 + } 848 + 849 + /* 850 + * Ensure that _bss_decrypted section is marked as decrypted in the 851 + * shared pages list. 852 + */ 853 + nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, 854 + PAGE_SIZE); 855 + early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, 856 + nr_pages, 0); 857 + 858 + /* 859 + * If not booted using EFI, enable Live migration support. 860 + */ 861 + if (!efi_enabled(EFI_BOOT)) 862 + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 863 + KVM_MIGRATION_READY); 864 + } 864 865 kvmclock_init(); 865 866 x86_platform.apic_post_init = kvm_apic_init; 866 867 }
+1
arch/x86/kernel/paravirt.c
··· 337 337 (void (*)(struct mmu_gather *, void *))tlb_remove_page, 338 338 339 339 .mmu.exit_mmap = paravirt_nop, 340 + .mmu.notify_page_enc_status_changed = paravirt_nop, 340 341 341 342 #ifdef CONFIG_PARAVIRT_XXL 342 343 .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
+70 -16
arch/x86/mm/mem_encrypt.c
··· 229 229 swiotlb_adjust_size(size); 230 230 } 231 231 232 + static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) 233 + { 234 + unsigned long pfn = 0; 235 + pgprot_t prot; 236 + 237 + switch (level) { 238 + case PG_LEVEL_4K: 239 + pfn = pte_pfn(*kpte); 240 + prot = pte_pgprot(*kpte); 241 + break; 242 + case PG_LEVEL_2M: 243 + pfn = pmd_pfn(*(pmd_t *)kpte); 244 + prot = pmd_pgprot(*(pmd_t *)kpte); 245 + break; 246 + case PG_LEVEL_1G: 247 + pfn = pud_pfn(*(pud_t *)kpte); 248 + prot = pud_pgprot(*(pud_t *)kpte); 249 + break; 250 + default: 251 + WARN_ONCE(1, "Invalid level for kpte\n"); 252 + return 0; 253 + } 254 + 255 + if (ret_prot) 256 + *ret_prot = prot; 257 + 258 + return pfn; 259 + } 260 + 261 + void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc) 262 + { 263 + #ifdef CONFIG_PARAVIRT 264 + unsigned long sz = npages << PAGE_SHIFT; 265 + unsigned long vaddr_end = vaddr + sz; 266 + 267 + while (vaddr < vaddr_end) { 268 + int psize, pmask, level; 269 + unsigned long pfn; 270 + pte_t *kpte; 271 + 272 + kpte = lookup_address(vaddr, &level); 273 + if (!kpte || pte_none(*kpte)) { 274 + WARN_ONCE(1, "kpte lookup for vaddr\n"); 275 + return; 276 + } 277 + 278 + pfn = pg_level_to_pfn(level, kpte, NULL); 279 + if (!pfn) 280 + continue; 281 + 282 + psize = page_level_size(level); 283 + pmask = page_level_mask(level); 284 + 285 + notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); 286 + 287 + vaddr = (vaddr & pmask) + psize; 288 + } 289 + #endif 290 + } 291 + 232 292 static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) 233 293 { 234 294 pgprot_t old_prot, new_prot; 235 295 unsigned long pfn, pa, size; 236 296 pte_t new_pte; 237 297 238 - switch (level) { 239 - case PG_LEVEL_4K: 240 - pfn = pte_pfn(*kpte); 241 - old_prot = pte_pgprot(*kpte); 242 - break; 243 - case PG_LEVEL_2M: 244 - pfn = pmd_pfn(*(pmd_t *)kpte); 245 - old_prot = pmd_pgprot(*(pmd_t *)kpte); 246 - break; 247 - case PG_LEVEL_1G: 248 - pfn = pud_pfn(*(pud_t *)kpte); 249 - old_prot = pud_pgprot(*(pud_t *)kpte); 250 - break; 251 - default: 298 + pfn = pg_level_to_pfn(level, kpte, &old_prot); 299 + if (!pfn) 252 300 return; 253 - } 254 301 255 302 new_prot = old_prot; 256 303 if (enc) ··· 333 286 static int __init early_set_memory_enc_dec(unsigned long vaddr, 334 287 unsigned long size, bool enc) 335 288 { 336 - unsigned long vaddr_end, vaddr_next; 289 + unsigned long vaddr_end, vaddr_next, start; 337 290 unsigned long psize, pmask; 338 291 int split_page_size_mask; 339 292 int level, ret; 340 293 pte_t *kpte; 341 294 295 + start = vaddr; 342 296 vaddr_next = vaddr; 343 297 vaddr_end = vaddr + size; 344 298 ··· 394 346 395 347 ret = 0; 396 348 349 + notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); 397 350 out: 398 351 __flush_tlb_all(); 399 352 return ret; ··· 408 359 int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) 409 360 { 410 361 return early_set_memory_enc_dec(vaddr, size, true); 362 + } 363 + 364 + void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) 365 + { 366 + notify_range_enc_status_changed(vaddr, npages, enc); 411 367 } 412 368 413 369 /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
+6
arch/x86/mm/pat/set_memory.c
··· 2023 2023 */ 2024 2024 cpa_flush(&cpa, 0); 2025 2025 2026 + /* 2027 + * Notify hypervisor that a given memory range is mapped encrypted 2028 + * or decrypted. 2029 + */ 2030 + notify_range_enc_status_changed(addr, numpages, enc); 2031 + 2026 2032 return ret; 2027 2033 } 2028 2034
+1
include/linux/efi.h
··· 362 362 363 363 /* OEM GUIDs */ 364 364 #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) 365 + #define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) 365 366 366 367 typedef struct { 367 368 efi_guid_t guid;