Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/hyperv: fix root partition faults when writing to VP assist page MSR

For the root partition, the VP assist pages are pre-determined by the
hypervisor. The root kernel is not allowed to move them to different
locations. The current implementation nevertheless tries to write to
the VP assist page MSR from the root partition, which results in the
stack trace below.

[ 2.778197] unchecked MSR access error: WRMSR to 0x40000073 (tried to write 0x0000000145ac5001) at rIP: 0xffffffff810c1084 (native_write_msr+0x4/0x30)
[ 2.784867] Call Trace:
[ 2.791507] hv_cpu_init+0xf1/0x1c0
[ 2.798144] ? hyperv_report_panic+0xd0/0xd0
[ 2.804806] cpuhp_invoke_callback+0x11a/0x440
[ 2.811465] ? hv_resume+0x90/0x90
[ 2.818137] cpuhp_issue_call+0x126/0x130
[ 2.824782] __cpuhp_setup_state_cpuslocked+0x102/0x2b0
[ 2.831427] ? hyperv_report_panic+0xd0/0xd0
[ 2.838075] ? hyperv_report_panic+0xd0/0xd0
[ 2.844723] ? hv_resume+0x90/0x90
[ 2.851375] __cpuhp_setup_state+0x3d/0x90
[ 2.858030] hyperv_init+0x14e/0x410
[ 2.864689] ? enable_IR_x2apic+0x190/0x1a0
[ 2.871349] apic_intr_mode_init+0x8b/0x100
[ 2.878017] x86_late_time_init+0x20/0x30
[ 2.884675] start_kernel+0x459/0x4fb
[ 2.891329] secondary_startup_64_no_verify+0xb0/0xbb

Since the hypervisor already provides the VP assist pages for the root
partition, we need to memremap the hypervisor-provided memory for the root
kernel to use. The mapping is created in hv_cpu_init during bringup and is
removed in hv_cpu_die during teardown.

Signed-off-by: Praveen Kumar <kumarpraveen@linux.microsoft.com>
Reviewed-by: Sunil Muthuswamy <sunilmut@microsoft.com>
Link: https://lore.kernel.org/r/20210731120519.17154-1-kumarpraveen@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>

authored by

Praveen Kumar and committed by
Wei Liu
e5d9b714 63fb60c2

+53 -20
+44 -20
arch/x86/hyperv/hv_init.c
··· 44 44 45 45 static int hv_cpu_init(unsigned int cpu) 46 46 { 47 + union hv_vp_assist_msr_contents msr = { 0 }; 47 48 struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; 48 49 int ret; 49 50 ··· 55 54 if (!hv_vp_assist_page) 56 55 return 0; 57 56 58 - /* 59 - * The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section 60 - * 5.2.1 "GPA Overlay Pages"). Here it must be zeroed out to make sure 61 - * we always write the EOI MSR in hv_apic_eoi_write() *after* the 62 - * EOI optimization is disabled in hv_cpu_die(), otherwise a CPU may 63 - * not be stopped in the case of CPU offlining and the VM will hang. 64 - */ 65 57 if (!*hvp) { 66 - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); 67 - } 68 - 69 - if (*hvp) { 70 - u64 val; 71 - 72 - val = vmalloc_to_pfn(*hvp); 73 - val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) | 74 - HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; 75 - 76 - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val); 58 + if (hv_root_partition) { 59 + /* 60 + * For root partition we get the hypervisor provided VP assist 61 + * page, instead of allocating a new page. 62 + */ 63 + rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); 64 + *hvp = memremap(msr.pfn << 65 + HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, 66 + PAGE_SIZE, MEMREMAP_WB); 67 + } else { 68 + /* 69 + * The VP assist page is an "overlay" page (see Hyper-V TLFS's 70 + * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed 71 + * out to make sure we always write the EOI MSR in 72 + * hv_apic_eoi_write() *after* the EOI optimization is disabled 73 + * in hv_cpu_die(), otherwise a CPU may not be stopped in the 74 + * case of CPU offlining and the VM will hang. 
75 + */ 76 + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); 77 + if (*hvp) 78 + msr.pfn = vmalloc_to_pfn(*hvp); 79 + } 80 + WARN_ON(!(*hvp)); 81 + if (*hvp) { 82 + msr.enable = 1; 83 + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); 84 + } 77 85 } 78 86 79 87 return 0; ··· 180 170 181 171 hv_common_cpu_die(cpu); 182 172 183 - if (hv_vp_assist_page && hv_vp_assist_page[cpu]) 184 - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); 173 + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { 174 + union hv_vp_assist_msr_contents msr = { 0 }; 175 + if (hv_root_partition) { 176 + /* 177 + * For root partition the VP assist page is mapped to 178 + * hypervisor provided page, and thus we unmap the 179 + * page here and nullify it, so that in future we have 180 + * correct page address mapped in hv_cpu_init. 181 + */ 182 + memunmap(hv_vp_assist_page[cpu]); 183 + hv_vp_assist_page[cpu] = NULL; 184 + rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); 185 + msr.enable = 0; 186 + } 187 + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); 188 + } 185 189 186 190 if (hv_reenlightenment_cb == NULL) 187 191 return 0;
+9
arch/x86/include/asm/hyperv-tlfs.h
··· 288 288 } __packed; 289 289 }; 290 290 291 + union hv_vp_assist_msr_contents { 292 + u64 as_uint64; 293 + struct { 294 + u64 enable:1; 295 + u64 reserved:11; 296 + u64 pfn:52; 297 + } __packed; 298 + }; 299 + 291 300 struct hv_reenlightenment_control { 292 301 __u64 vector:8; 293 302 __u64 reserved1:8;