Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/hyperv: Add trampoline asm code to transition from hypervisor

Introduce a small asm stub to transition from the hypervisor to Linux
after devirtualization. Devirtualization means disabling the hypervisor on
the fly, so that once it is done the code is running on the physical
processor instead of a virtual one, and the hypervisor is gone. This can be
done only by the root VM.

At a high level, during a panic of either the hypervisor or the root,
the NMI handler asks the hypervisor to devirtualize. The arguments of
that request include an entry point for returning to Linux. This asm
stub implements that entry point.

The stub is entered in protected mode. It uses a temporary GDT and page
table to enable long mode and reach the kernel entry point, which then
restores the full kernel context so that execution can continue on to kexec.

Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>

authored by

Mukesh Rathor and committed by
Wei Liu
b0574ba7 e0a975ec

+101
+101
arch/x86/hyperv/hv_trampoline.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * X86 specific Hyper-V kdump/crash related code. 4 + * 5 + * Copyright (C) 2025, Microsoft, Inc. 6 + * 7 + */ 8 + #include <linux/linkage.h> 9 + #include <asm/alternative.h> 10 + #include <asm/msr.h> 11 + #include <asm/processor-flags.h> 12 + #include <asm/nospec-branch.h> 13 + 14 + /* 15 + * void noreturn hv_crash_asm32(arg1) 16 + * arg1 == edi == 32bit PA of struct hv_crash_tramp_data 17 + * 18 + * The hypervisor jumps here upon devirtualization in protected mode. This 19 + * code gets copied to a page in the low 4G, i.e. 32bit space, so it can run 20 + * in protected mode. Hence we cannot use any compile/link time offsets or 21 + * addresses; every memory operand below is formed from the address in edi. It restores long mode via a temporary gdt and page tables and 22 + * eventually jumps to the kernel code entry stored at HV_CRASHDATA_OFFS_C_entry. 23 + * 24 + * PreCondition (i.e. the hypervisor callback ABI): 25 + * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled 26 + * o CR4 is set to 0x0 27 + * o IA32_EFER is set to 0x901 (SCE and NXE are set). NOTE(review): 0x901 also has bit 8 set, which is EFER.LME -- confirm against the hypervisor ABI whether the value or the flag list is authoritative; the code below sets LME either way. 28 + * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX. 29 + * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF 30 + * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF 31 + * o LDTR is initialized as invalid (limit of 0) 32 + * o MSR PAT is power on default. 33 + * o Other state/registers are cleared. All TLBs flushed. 
34 + */ 35 + /* Byte offsets added to the address in edi to reach each field. NOTE(review): presumably these mirror the layout of struct hv_crash_tramp_data -- confirm against the C definition. */ 36 + #define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0: trampoline cr3, read as 32 bits */ 37 + #define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8: kernel cr3, read as 64 bits */ 38 + #define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18: memory operand of lgdtl */ 39 + #define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40: memory operand of the far jump */ 40 + #define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48: kernel entry address, read as 64 bits */ 41 + 42 + .text 43 + .code32 44 + /* 32bit entry point: enable PAE, load the trampoline cr3, set EFER.LME, turn paging on, load the temp gdt and far-jump to 64bit code. */ 45 + SYM_CODE_START(hv_crash_asm32) 46 + UNWIND_HINT_UNDEFINED 47 + ENDBR 48 + movl $X86_CR4_PAE, %ecx /* cr4 is written outright, not OR-ed: PAE becomes the only bit set */ 49 + movl %ecx, %cr4 50 + /* Point cr3 at the temporary trampoline page table */ 51 + movl %edi, %ebx 52 + add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx 53 + movl %cs:(%ebx), %eax /* 32bit load of the trampoline cr3 */ 54 + movl %eax, %cr3 55 + 56 + /* Set EFER.LME for long mode now; edx from rdmsr is written back unchanged */ 57 + movl $MSR_EFER, %ecx 58 + rdmsr 59 + btsl $_EFER_LME, %eax 60 + wrmsr 61 + 62 + /* Turn paging on using the temp 32bit trampoline page table */ 63 + movl %cr0, %eax 64 + orl $(X86_CR0_PG), %eax 65 + movl %eax, %cr0 66 + 67 + /* Since the kernel cr3 could be above 4G, we need to be in long mode 68 + * before we can load all 64 bits of the kernel cr3. We use a temp gdt for 69 + * that, with CS.L=1 and CS.D=0. */ 70 + mov %edi, %eax 71 + add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax 72 + lgdtl %cs:(%eax) 73 + 74 + /* Not done yet: reload CS with an indirect far jump through the target stored in the crash data, to switch to CS.L=1. NOTE(review): the target is presumably hv_crash_asm64 in the copied page -- confirm in the C setup code. */ 75 + mov %edi, %eax 76 + add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax 77 + ljmp %cs:*(%eax) 78 + SYM_CODE_END(hv_crash_asm32) 79 + 80 + /* We now run in full 64bit IA-32e long mode, CS.L=1 and CS.D=0 */ 81 + .code64 82 + .balign 8 83 + SYM_CODE_START(hv_crash_asm64) 84 + UNWIND_HINT_UNDEFINED 85 + ENDBR 86 + /* Restore the kernel page tables so we can jump to kernel code */ 87 + mov %edi, %eax /* edi still holds the PA passed in: nothing above writes it */ 88 + add $HV_CRASHDATA_OFFS_KERNCR3, %eax 89 + movq %cs:(%eax), %rbx /* full 64bit load of the kernel cr3 */ 90 + movq %rbx, %cr3 91 + /* Load the 64bit kernel entry address from the crash data and jump to it */ 92 + mov %edi, %eax 93 + add $HV_CRASHDATA_OFFS_C_entry, %eax 94 + movq %cs:(%eax), %rbx 95 + ANNOTATE_RETPOLINE_SAFE 96 + jmp *%rbx 97 + /* Not reached if the jump above transfers control. NOTE(review): presumably a trap against straight-line speculation past the indirect jmp -- confirm. */ 98 + int $3 99 + /* Global label at the end of the stub. NOTE(review): presumably used together with hv_crash_asm32 to size the copy mentioned in the header comment -- confirm in the C caller. */ 100 + SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL) 101 + SYM_CODE_END(hv_crash_asm64)