x86/sev: Do not touch VMSA pages during SNP guest memory kdump

When kdump is running makedumpfile to generate the vmcore and dump SNP
guest memory, it touches the VMSA page of the vCPU executing kdump.

This results in unrecoverable #NPF/RMP faults, as the VMSA page is
marked busy/in-use while the vCPU is running, and subsequently causes a
guest softlockup/hang.

Additionally, other APs may be halted in guest mode with their VMSA
pages marked busy, and touching those VMSA pages during the guest memory
dump will also cause a #NPF.

Issue AP_DESTROY GHCB calls on other APs to ensure they are kicked out
of guest mode and then clear the VMSA bit on their VMSA pages.
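
For reference, the AP Creation NAE event carries its parameters in
sw_exit_info_1. A minimal sketch of the encoding (field layout per the
GHCB specification; the actual implementation is the new
vmgexit_ap_control() helper in the diff below):

  /* sw_exit_code = SVM_VMGEXIT_AP_CREATION */
  /* sw_exit_info_1: bits 63:32 = APIC ID, bits 31:16 = VMPL,
   *                 bits 15:0  = request (SVM_VMGEXIT_AP_CREATE = 1,
   *                                       SVM_VMGEXIT_AP_DESTROY = 2)
   */
  u64 info1 = ((u64)apic_id << 32) |
	      ((u64)snp_vmpl << 16) |
	      SVM_VMGEXIT_AP_DESTROY;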

If the vCPU running kdump is an AP, mark its VMSA page as offline to
ensure that makedumpfile excludes that page while dumping guest memory.
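
Being marked PG_offline is what causes makedumpfile to filter the page
out of the vmcore. Condensed, this is what the new shutdown_all_aps()
does for the vCPU handling kdump (see the full hunk below):

  struct page *p = pfn_to_online_page(__pa(vmsa) >> PAGE_SHIFT);

  if (p)
	  /* makedumpfile treats PG_offline pages as excluded */
	  __SetPageOffline(p);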

Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec")
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Tested-by: Srikanth Aithal <sraithal@amd.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/20250428214151.155464-1-Ashish.Kalra@amd.com

---
 arch/x86/coco/sev/core.c | 244 ++++++++++++++++++++++++++++--------------
 1 file changed, 158 insertions(+), 86 deletions(-)

--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
···
 	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id)
+{
+	bool create = event != SVM_VMGEXIT_AP_DESTROY;
+	struct ghcb_state state;
+	unsigned long flags;
+	struct ghcb *ghcb;
+	int ret = 0;
+
+	local_irq_save(flags);
+
+	ghcb = __sev_get_ghcb(&state);
+
+	vc_ghcb_invalidate(ghcb);
+
+	if (create)
+		ghcb_set_rax(ghcb, vmsa->sev_features);
+
+	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+	ghcb_set_sw_exit_info_1(ghcb,
+				((u64)apic_id << 32) |
+				((u64)snp_vmpl << 16) |
+				event);
+	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+	sev_es_wr_ghcb_msr(__pa(ghcb));
+	VMGEXIT();
+
+	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
+		pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY"));
+		ret = -EINVAL;
+	}
+
+	__sev_put_ghcb(&state);
+
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
+{
+	int ret;
+
+	if (snp_vmpl) {
+		struct svsm_call call = {};
+		unsigned long flags;
+
+		local_irq_save(flags);
+
+		call.caa = this_cpu_read(svsm_caa);
+		call.rcx = __pa(va);
+
+		if (make_vmsa) {
+			/* Protocol 0, Call ID 2 */
+			call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
+			call.rdx = __pa(caa);
+			call.r8 = apic_id;
+		} else {
+			/* Protocol 0, Call ID 3 */
+			call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
+		}
+
+		ret = svsm_perform_call_protocol(&call);
+
+		local_irq_restore(flags);
+	} else {
+		/*
+		 * If the kernel runs at VMPL0, it can change the VMSA
+		 * bit for a page using the RMPADJUST instruction.
+		 * However, for the instruction to succeed it must
+		 * target the permissions of a lesser privileged (higher
+		 * numbered) VMPL level, so use VMPL1.
+		 */
+		u64 attrs = 1;
+
+		if (make_vmsa)
+			attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+		ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+	}
+
+	return ret;
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
+{
+	int err;
+
+	err = snp_set_vmsa(vmsa, NULL, apic_id, false);
+	if (err)
+		pr_err("clear VMSA page failed (%u), leaking page\n", err);
+	else
+		free_page((unsigned long)vmsa);
+}
+
 static void set_pte_enc(pte_t *kpte, int level, void *va)
 {
 	struct pte_enc_desc d = {
···
 		pr_warn("Failed to stop shared<->private conversions\n");
 }
 
+/*
+ * Shutdown all APs except the one handling kexec/kdump and clearing
+ * the VMSA tag on AP's VMSA pages as they are not being used as
+ * VMSA page anymore.
+ */
+static void shutdown_all_aps(void)
+{
+	struct sev_es_save_area *vmsa;
+	int apic_id, this_cpu, cpu;
+
+	this_cpu = get_cpu();
+
+	/*
+	 * APs are already in HLT loop when enc_kexec_finish() callback
+	 * is invoked.
+	 */
+	for_each_present_cpu(cpu) {
+		vmsa = per_cpu(sev_vmsa, cpu);
+
+		/*
+		 * The BSP or offlined APs do not have guest allocated VMSA
+		 * and there is no need to clear the VMSA tag for this page.
+		 */
+		if (!vmsa)
+			continue;
+
+		/*
+		 * Cannot clear the VMSA tag for the currently running vCPU.
+		 */
+		if (this_cpu == cpu) {
+			unsigned long pa;
+			struct page *p;
+
+			pa = __pa(vmsa);
+			/*
+			 * Mark the VMSA page of the running vCPU as offline
+			 * so that is excluded and not touched by makedumpfile
+			 * while generating vmcore during kdump.
+			 */
+			p = pfn_to_online_page(pa >> PAGE_SHIFT);
+			if (p)
+				__SetPageOffline(p);
+			continue;
+		}
+
+		apic_id = cpuid_to_apicid[cpu];
+
+		/*
+		 * Issue AP destroy to ensure AP gets kicked out of guest mode
+		 * to allow using RMPADJUST to remove the VMSA tag on it's
+		 * VMSA page.
+		 */
+		vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id);
+		snp_cleanup_vmsa(vmsa, apic_id);
+	}
+
+	put_cpu();
+}
+
 void snp_kexec_finish(void)
 {
 	struct sev_es_runtime_data *data;
···
 
 	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
 		return;
+
+	shutdown_all_aps();
 
 	unshare_all_memory();
 
···
 		set_pte_enc(pte, level, (void *)ghcb);
 		snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
 	}
-}
-
-static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
-{
-	int ret;
-
-	if (snp_vmpl) {
-		struct svsm_call call = {};
-		unsigned long flags;
-
-		local_irq_save(flags);
-
-		call.caa = this_cpu_read(svsm_caa);
-		call.rcx = __pa(va);
-
-		if (make_vmsa) {
-			/* Protocol 0, Call ID 2 */
-			call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
-			call.rdx = __pa(caa);
-			call.r8 = apic_id;
-		} else {
-			/* Protocol 0, Call ID 3 */
-			call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
-		}
-
-		ret = svsm_perform_call_protocol(&call);
-
-		local_irq_restore(flags);
-	} else {
-		/*
-		 * If the kernel runs at VMPL0, it can change the VMSA
-		 * bit for a page using the RMPADJUST instruction.
-		 * However, for the instruction to succeed it must
-		 * target the permissions of a lesser privileged (higher
-		 * numbered) VMPL level, so use VMPL1.
-		 */
-		u64 attrs = 1;
-
-		if (make_vmsa)
-			attrs |= RMPADJUST_VMSA_PAGE_BIT;
-
-		ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
-	}
-
-	return ret;
 }
 
 #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
···
 	return page_address(p + 1);
 }
 
-static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
-{
-	int err;
-
-	err = snp_set_vmsa(vmsa, NULL, apic_id, false);
-	if (err)
-		pr_err("clear VMSA page failed (%u), leaking page\n", err);
-	else
-		free_page((unsigned long)vmsa);
-}
-
 static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
 {
 	struct sev_es_save_area *cur_vmsa, *vmsa;
-	struct ghcb_state state;
 	struct svsm_ca *caa;
-	unsigned long flags;
-	struct ghcb *ghcb;
 	u8 sipi_vector;
 	int cpu, ret;
 	u64 cr4;
···
 	}
 
 	/* Issue VMGEXIT AP Creation NAE event */
-	local_irq_save(flags);
-
-	ghcb = __sev_get_ghcb(&state);
-
-	vc_ghcb_invalidate(ghcb);
-	ghcb_set_rax(ghcb, vmsa->sev_features);
-	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
-	ghcb_set_sw_exit_info_1(ghcb,
-				((u64)apic_id << 32) |
-				((u64)snp_vmpl << 16) |
-				SVM_VMGEXIT_AP_CREATE);
-	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
-
-	sev_es_wr_ghcb_msr(__pa(ghcb));
-	VMGEXIT();
-
-	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
-	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
-		pr_err("SNP AP Creation error\n");
-		ret = -EINVAL;
-	}
-
-	__sev_put_ghcb(&state);
-
-	local_irq_restore(flags);
-
-	/* Perform cleanup if there was an error */
+	ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id);
 	if (ret) {
 		snp_cleanup_vmsa(vmsa, apic_id);
 		vmsa = NULL;