Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'hyperv-next-signed-20250324' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv updates from Wei Liu:

- Add support for running as the root partition in Hyper-V (Microsoft
Hypervisor) by exposing /dev/mshv (Nuno and various people)

- Add support for CPU offlining in Hyper-V (Hamza Mahfooz)

- Misc fixes and cleanups (Roman Kisel, Tianyu Lan, Wei Liu, Michael
Kelley, Thorsten Blum)

* tag 'hyperv-next-signed-20250324' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (24 commits)
x86/hyperv: fix an indentation issue in mshyperv.h
x86/hyperv: Add comments about hv_vpset and var size hypercall input args
Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs
hyperv: Add definitions for root partition driver to hv headers
x86: hyperv: Add mshv_handler() irq handler and setup function
Drivers: hv: Introduce per-cpu event ring tail
Drivers: hv: Export some functions for use by root partition module
acpi: numa: Export node_to_pxm()
hyperv: Introduce hv_recommend_using_aeoi()
arm64/hyperv: Add some missing functions to arm64
x86/mshyperv: Add support for extended Hyper-V features
hyperv: Log hypercall status codes as strings
x86/hyperv: Fix check of return value from snp_set_vmsa()
x86/hyperv: Add VTL mode callback for restarting the system
x86/hyperv: Add VTL mode emergency restart callback
hyperv: Remove unused union and structs
hyperv: Add CONFIG_MSHV_ROOT to gate root partition support
hyperv: Change hv_root_partition into a function
hyperv: Convert hypercall statuses to linux error codes
drivers/hv: add CPU offlining support
...

+6514 -228
+2
Documentation/userspace-api/ioctl/ioctl-number.rst
··· 370 370 0xB7 all uapi/linux/remoteproc_cdev.h <mailto:linux-remoteproc@vger.kernel.org> 371 371 0xB7 all uapi/linux/nsfs.h <mailto:Andrei Vagin <avagin@openvz.org>> 372 372 0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver 373 + 0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver 374 + <mailto:linux-hyperv@vger.kernel.org> 373 375 0xC0 00-0F linux/usb/iowarrior.h 374 376 0xCA 00-0F uapi/misc/cxl.h 375 377 0xCA 10-2F uapi/misc/ocxl.h
+17
arch/arm64/hyperv/hv_core.c
··· 54 54 EXPORT_SYMBOL_GPL(hv_do_fast_hypercall8); 55 55 56 56 /* 57 + * hv_do_fast_hypercall16 -- Invoke the specified hypercall 58 + * with arguments in registers instead of physical memory. 59 + * Avoids the overhead of virt_to_phys for simple hypercalls. 60 + */ 61 + u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) 62 + { 63 + struct arm_smccc_res res; 64 + u64 control; 65 + 66 + control = (u64)code | HV_HYPERCALL_FAST_BIT; 67 + 68 + arm_smccc_1_1_hvc(HV_FUNC_ID, control, input1, input2, &res); 69 + return res.a0; 70 + } 71 + EXPORT_SYMBOL_GPL(hv_do_fast_hypercall16); 72 + 73 + /* 57 74 * Set a single VP register to a 64-bit value. 58 75 */ 59 76 void hv_set_vpreg(u32 msr, u64 value)
+6
arch/arm64/hyperv/mshyperv.c
··· 26 26 27 27 return 0; 28 28 } 29 + EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); 29 30 30 31 static int __init hyperv_init(void) 31 32 { ··· 62 61 ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, 63 62 ms_hyperv.misc_features); 64 63 64 + hv_identify_partition_type(); 65 + 65 66 ret = hv_common_init(); 66 67 if (ret) 67 68 return ret; ··· 74 71 hv_common_free(); 75 72 return ret; 76 73 } 74 + 75 + if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) 76 + hv_get_partition_id(); 77 77 78 78 ms_hyperv_late_init(); 79 79
+13
arch/arm64/include/asm/mshyperv.h
··· 40 40 return hv_get_vpreg(reg); 41 41 } 42 42 43 + /* 44 + * Nested is not supported on arm64 45 + */ 46 + static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) 47 + { 48 + hv_set_msr(reg, value); 49 + } 50 + 51 + static inline u64 hv_get_non_nested_msr(unsigned int reg) 52 + { 53 + return hv_get_msr(reg); 54 + } 55 + 43 56 /* SMCCC hypercall parameters */ 44 57 #define HV_SMCCC_FUNC_NUMBER 1 45 58 #define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \
+1 -1
arch/x86/hyperv/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o 3 - obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o 3 + obj-$(CONFIG_X86_64) += hv_apic.o 4 4 obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o 5 5 6 6 ifdef CONFIG_X86_64
+5
arch/x86/hyperv/hv_apic.c
··· 145 145 ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; 146 146 } 147 147 148 + /* 149 + * For this hypercall, Hyper-V treats the valid_bank_mask field 150 + * of ipi_arg->vp_set as part of the fixed size input header. 151 + * So the variable input header size is equal to nr_bank. 152 + */ 148 153 status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, 149 154 ipi_arg, NULL); 150 155
+6 -29
arch/x86/hyperv/hv_init.c
··· 34 34 #include <clocksource/hyperv_timer.h> 35 35 #include <linux/highmem.h> 36 36 37 - u64 hv_current_partition_id = ~0ull; 38 - EXPORT_SYMBOL_GPL(hv_current_partition_id); 39 - 40 37 void *hv_hypercall_pg; 41 38 EXPORT_SYMBOL_GPL(hv_hypercall_pg); 42 39 ··· 90 93 return 0; 91 94 92 95 hvp = &hv_vp_assist_page[cpu]; 93 - if (hv_root_partition) { 96 + if (hv_root_partition()) { 94 97 /* 95 98 * For root partition we get the hypervisor provided VP assist 96 99 * page, instead of allocating a new page. ··· 242 245 243 246 if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { 244 247 union hv_vp_assist_msr_contents msr = { 0 }; 245 - if (hv_root_partition) { 248 + if (hv_root_partition()) { 246 249 /* 247 250 * For root partition the VP assist page is mapped to 248 251 * hypervisor provided page, and thus we unmap the ··· 317 320 union hv_x64_msr_hypercall_contents hypercall_msr; 318 321 int ret; 319 322 320 - if (hv_root_partition) 323 + if (hv_root_partition()) 321 324 return -EPERM; 322 325 323 326 /* ··· 388 391 */ 389 392 if (old_setup_percpu_clockev) 390 393 old_setup_percpu_clockev(); 391 - } 392 - 393 - static void __init hv_get_partition_id(void) 394 - { 395 - struct hv_get_partition_id *output_page; 396 - u64 status; 397 - unsigned long flags; 398 - 399 - local_irq_save(flags); 400 - output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); 401 - status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); 402 - if (!hv_result_success(status)) { 403 - /* No point in proceeding if this failed */ 404 - pr_err("Failed to get partition ID: %lld\n", status); 405 - BUG(); 406 - } 407 - hv_current_partition_id = output_page->partition_id; 408 - local_irq_restore(flags); 409 394 } 410 395 411 396 #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) ··· 518 539 rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 519 540 hypercall_msr.enable = 1; 520 541 521 - if (hv_root_partition) { 542 + if (hv_root_partition()) { 522 543 struct page *pg; 523 544 void *src; 524 545 ··· 
584 605 585 606 register_syscore_ops(&hv_syscore_ops); 586 607 587 - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) 608 + if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) 588 609 hv_get_partition_id(); 589 - 590 - BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); 591 610 592 611 #ifdef CONFIG_PCI_MSI 593 612 /* 594 613 * If we're running as root, we want to create our own PCI MSI domain. 595 614 * We can't set this in hv_pci_init because that would be too late. 596 615 */ 597 - if (hv_root_partition) 616 + if (hv_root_partition()) 598 617 x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; 599 618 #endif 600 619
+12 -15
arch/x86/hyperv/hv_proc.c drivers/hv/hv_proc.c
··· 6 6 #include <linux/slab.h> 7 7 #include <linux/cpuhotplug.h> 8 8 #include <linux/minmax.h> 9 - #include <asm/hypervisor.h> 10 9 #include <asm/mshyperv.h> 11 - #include <asm/apic.h> 12 - 13 - #include <asm/trace/hyperv.h> 14 10 15 11 /* 16 12 * See struct hv_deposit_memory. The first u64 is partition ID, the rest ··· 87 91 page_count, 0, input_page, NULL); 88 92 local_irq_restore(flags); 89 93 if (!hv_result_success(status)) { 90 - pr_err("Failed to deposit pages: %lld\n", status); 91 - ret = hv_result(status); 94 + hv_status_err(status, "\n"); 95 + ret = hv_result_to_errno(status); 92 96 goto err_free_allocations; 93 97 } 94 98 ··· 107 111 kfree(counts); 108 112 return ret; 109 113 } 114 + EXPORT_SYMBOL_GPL(hv_call_deposit_pages); 110 115 111 116 int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) 112 117 { ··· 115 118 struct hv_output_add_logical_processor *output; 116 119 u64 status; 117 120 unsigned long flags; 118 - int ret = HV_STATUS_SUCCESS; 121 + int ret = 0; 119 122 120 123 /* 121 124 * When adding a logical processor, the hypervisor may return ··· 138 141 139 142 if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 140 143 if (!hv_result_success(status)) { 141 - pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, 142 - lp_index, apic_id, status); 143 - ret = hv_result(status); 144 + hv_status_err(status, "cpu %u apic ID: %u\n", 145 + lp_index, apic_id); 146 + ret = hv_result_to_errno(status); 144 147 } 145 148 break; 146 149 } ··· 155 158 struct hv_create_vp *input; 156 159 u64 status; 157 160 unsigned long irq_flags; 158 - int ret = HV_STATUS_SUCCESS; 161 + int ret = 0; 159 162 160 163 /* Root VPs don't seem to need pages deposited */ 161 164 if (partition_id != hv_current_partition_id) { ··· 180 183 181 184 if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 182 185 if (!hv_result_success(status)) { 183 - pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, 184 - vp_index, flags, status); 185 - ret = hv_result(status); 186 + 
hv_status_err(status, "vcpu: %u, lp: %u\n", 187 + vp_index, flags); 188 + ret = hv_result_to_errno(status); 186 189 } 187 190 break; 188 191 } ··· 192 195 193 196 return ret; 194 197 } 195 - 198 + EXPORT_SYMBOL_GPL(hv_call_create_vp);
+34
arch/x86/hyperv/hv_vtl.c
··· 12 12 #include <asm/i8259.h> 13 13 #include <asm/mshyperv.h> 14 14 #include <asm/realmode.h> 15 + #include <asm/reboot.h> 15 16 #include <../kernel/smpboot.h> 16 17 17 18 extern struct boot_params boot_params; ··· 21 20 static bool __init hv_vtl_msi_ext_dest_id(void) 22 21 { 23 22 return true; 23 + } 24 + 25 + /* 26 + * The `native_machine_emergency_restart` function from `reboot.c` writes 27 + * to the physical address 0x472 to indicate the type of reboot for the 28 + * firmware. We cannot have that in VSM as the memory composition might 29 + * be more generic, and such write effectively corrupts the memory thus 30 + * making diagnostics harder at the very least. 31 + */ 32 + static void __noreturn hv_vtl_emergency_restart(void) 33 + { 34 + /* 35 + * Cause a triple fault and the immediate reset. Here the code does not run 36 + * on the top of any firmware, whereby cannot reach out to its services. 37 + * The inifinite loop is for the improbable case that the triple fault does 38 + * not work and have to preserve the state intact for debugging. 39 + */ 40 + for (;;) { 41 + idt_invalidate(); 42 + __asm__ __volatile__("int3"); 43 + } 44 + } 45 + 46 + /* 47 + * The only way to restart in the VTL mode is to triple fault as the kernel runs 48 + * as firmware. 49 + */ 50 + static void __noreturn hv_vtl_restart(char __maybe_unused *cmd) 51 + { 52 + hv_vtl_emergency_restart(); 24 53 } 25 54 26 55 void __init hv_vtl_init_platform(void) ··· 267 236 268 237 int __init hv_vtl_early_init(void) 269 238 { 239 + machine_ops.emergency_restart = hv_vtl_emergency_restart; 240 + machine_ops.restart = hv_vtl_restart; 241 + 270 242 /* 271 243 * `boot_cpu_has` returns the runtime feature support, 272 244 * and here is the earliest it can be used.
+3 -3
arch/x86/hyperv/irqdomain.c
··· 64 64 local_irq_restore(flags); 65 65 66 66 if (!hv_result_success(status)) 67 - pr_err("%s: hypercall failed, status %lld\n", __func__, status); 67 + hv_status_err(status, "\n"); 68 68 69 69 return hv_result(status); 70 70 } ··· 224 224 kfree(stored_entry); 225 225 226 226 if (status != HV_STATUS_SUCCESS) { 227 - pr_debug("%s: failed to unmap, status %lld", __func__, status); 227 + hv_status_debug(status, "failed to unmap\n"); 228 228 return; 229 229 } 230 230 } ··· 273 273 status = hv_unmap_msi_interrupt(dev, &old_entry); 274 274 275 275 if (status != HV_STATUS_SUCCESS) 276 - pr_err("%s: hypercall failed, status %lld\n", __func__, status); 276 + hv_status_err(status, "\n"); 277 277 } 278 278 279 279 static void hv_msi_free_irq(struct irq_domain *domain,
+1 -1
arch/x86/hyperv/ivm.c
··· 338 338 vmsa->sev_features = sev_status >> 2; 339 339 340 340 ret = snp_set_vmsa(vmsa, true); 341 - if (!ret) { 341 + if (ret) { 342 342 pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); 343 343 free_page((u64)vmsa); 344 344 return ret;
+4
arch/x86/hyperv/mmu.c
··· 205 205 /* 206 206 * We can flush not more than max_gvas with one hypercall. Flush the 207 207 * whole address space if we were asked to do more. 208 + * 209 + * For these hypercalls, Hyper-V treats the valid_bank_mask field 210 + * of flush->hv_vp_set as part of the fixed size input header. 211 + * So the variable input header size is equal to nr_bank. 208 212 */ 209 213 max_gvas = 210 214 (PAGE_SIZE - sizeof(*flush) - nr_bank *
+1 -7
arch/x86/include/asm/mshyperv.h
··· 43 43 44 44 extern void *hv_hypercall_pg; 45 45 46 - extern u64 hv_current_partition_id; 47 - 48 46 extern union hv_ghcb * __percpu *hv_ghcb_pg; 49 47 50 48 bool hv_isolation_type_snp(void); ··· 55 57 */ 56 58 #define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL 57 59 #define HV_AP_SEGMENT_LIMIT 0xffffffff 58 - 59 - int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); 60 - int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); 61 - int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); 62 60 63 61 /* 64 62 * If the hypercall involves no input or output parameters, the hypervisor ··· 154 160 : "cc", "edi", "esi"); 155 161 } 156 162 #endif 157 - return hv_status; 163 + return hv_status; 158 164 } 159 165 160 166 static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+16 -24
arch/x86/kernel/cpu/mshyperv.c
··· 33 33 #include <asm/numa.h> 34 34 #include <asm/svm.h> 35 35 36 - /* Is Linux running as the root partition? */ 37 - bool hv_root_partition; 38 36 /* Is Linux running on nested Microsoft Hypervisor */ 39 37 bool hv_nested; 40 38 struct ms_hyperv_info ms_hyperv; ··· 107 109 } 108 110 EXPORT_SYMBOL_GPL(hv_set_msr); 109 111 112 + static void (*mshv_handler)(void); 110 113 static void (*vmbus_handler)(void); 111 114 static void (*hv_stimer0_handler)(void); 112 115 static void (*hv_kexec_handler)(void); ··· 118 119 struct pt_regs *old_regs = set_irq_regs(regs); 119 120 120 121 inc_irq_stat(irq_hv_callback_count); 122 + if (mshv_handler) 123 + mshv_handler(); 124 + 121 125 if (vmbus_handler) 122 126 vmbus_handler(); 123 127 ··· 128 126 apic_eoi(); 129 127 130 128 set_irq_regs(old_regs); 129 + } 130 + 131 + void hv_setup_mshv_handler(void (*handler)(void)) 132 + { 133 + mshv_handler = handler; 131 134 } 132 135 133 136 void hv_setup_vmbus_handler(void (*handler)(void)) ··· 429 422 430 423 return 0; 431 424 } 425 + EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); 432 426 433 427 static void __init ms_hyperv_init_platform(void) 434 428 { ··· 444 436 */ 445 437 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); 446 438 ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); 439 + ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES); 447 440 ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); 448 441 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); 449 442 450 443 hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); 451 444 452 - pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", 453 - ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, 445 + pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n", 446 + ms_hyperv.features, ms_hyperv.priv_high, 447 + ms_hyperv.ext_features, ms_hyperv.hints, 454 448 ms_hyperv.misc_features); 455 449 456 450 ms_hyperv.max_vp_index = 
cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); ··· 461 451 pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", 462 452 ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); 463 453 464 - /* 465 - * Check CPU management privilege. 466 - * 467 - * To mirror what Windows does we should extract CPU management 468 - * features and use the ReservedIdentityBit to detect if Linux is the 469 - * root partition. But that requires negotiating CPU management 470 - * interface (a process to be finalized). For now, use the privilege 471 - * flag as the indicator for running as root. 472 - * 473 - * Hyper-V should never specify running as root and as a Confidential 474 - * VM. But to protect against a compromised/malicious Hyper-V trying 475 - * to exploit root behavior to expose Confidential VM memory, ignore 476 - * the root partition setting if also a Confidential VM. 477 - */ 478 - if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && 479 - !(ms_hyperv.priv_high & HV_ISOLATION)) { 480 - hv_root_partition = true; 481 - pr_info("Hyper-V: running as root partition\n"); 482 - } 454 + hv_identify_partition_type(); 483 455 484 456 if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { 485 457 hv_nested = true; ··· 610 618 611 619 # ifdef CONFIG_SMP 612 620 smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; 613 - if (hv_root_partition || 621 + if (hv_root_partition() || 614 622 (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) 615 623 smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; 616 624 # endif
+1
drivers/acpi/numa/srat.c
··· 51 51 return PXM_INVAL; 52 52 return node_to_pxm_map[node]; 53 53 } 54 + EXPORT_SYMBOL_GPL(node_to_pxm); 54 55 55 56 static void __acpi_map_pxm_to_node(int pxm, int node) 56 57 {
+2 -2
drivers/clocksource/hyperv_timer.c
··· 582 582 * mapped. 583 583 */ 584 584 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 585 - if (hv_root_partition) 585 + if (hv_root_partition()) 586 586 tsc_pfn = tsc_msr.pfn; 587 587 else 588 588 tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); ··· 627 627 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 628 628 return; 629 629 630 - if (!hv_root_partition) { 630 + if (!hv_root_partition()) { 631 631 WARN(1, "%s: attempt to remap TSC page in guest partition\n", 632 632 __func__); 633 633 return;
+17
drivers/hv/Kconfig
··· 55 55 help 56 56 Select this option to enable Hyper-V Balloon driver. 57 57 58 + config MSHV_ROOT 59 + tristate "Microsoft Hyper-V root partition support" 60 + depends on HYPERV && (X86_64 || ARM64) 61 + depends on !HYPERV_VTL_MODE 62 + # The hypervisor interface operates on 4k pages. Enforcing it here 63 + # simplifies many assumptions in the root partition code. 64 + # e.g. When withdrawing memory, the hypervisor gives back 4k pages in 65 + # no particular order, making it impossible to reassemble larger pages 66 + depends on PAGE_SIZE_4KB 67 + select EVENTFD 68 + default n 69 + help 70 + Select this option to enable support for booting and running as root 71 + partition on Microsoft Hyper-V. 72 + 73 + If unsure, say N. 74 + 58 75 endmenu
+4
drivers/hv/Makefile
··· 2 2 obj-$(CONFIG_HYPERV) += hv_vmbus.o 3 3 obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o 4 4 obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o 5 + obj-$(CONFIG_MSHV_ROOT) += mshv_root.o 5 6 6 7 CFLAGS_hv_trace.o = -I$(src) 7 8 CFLAGS_hv_balloon.o = -I$(src) ··· 12 11 channel_mgmt.o ring_buffer.o hv_trace.o 13 12 hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o 14 13 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o 14 + mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ 15 + mshv_root_hv_call.o mshv_portid_table.o 15 16 16 17 # Code that must be built-in 17 18 obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o 19 + obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
+57 -37
drivers/hv/hv.c
··· 144 144 * Synic message and event pages are allocated by paravisor. 145 145 * Skip these pages allocation here. 146 146 */ 147 - if (!ms_hyperv.paravisor_present && !hv_root_partition) { 147 + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { 148 148 hv_cpu->synic_message_page = 149 149 (void *)get_zeroed_page(GFP_ATOMIC); 150 150 if (!hv_cpu->synic_message_page) { ··· 272 272 simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); 273 273 simp.simp_enabled = 1; 274 274 275 - if (ms_hyperv.paravisor_present || hv_root_partition) { 275 + if (ms_hyperv.paravisor_present || hv_root_partition()) { 276 276 /* Mask out vTOM bit. ioremap_cache() maps decrypted */ 277 277 u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & 278 278 ~ms_hyperv.shared_gpa_boundary; ··· 291 291 siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); 292 292 siefp.siefp_enabled = 1; 293 293 294 - if (ms_hyperv.paravisor_present || hv_root_partition) { 294 + if (ms_hyperv.paravisor_present || hv_root_partition()) { 295 295 /* Mask out vTOM bit. ioremap_cache() maps decrypted */ 296 296 u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & 297 297 ~ms_hyperv.shared_gpa_boundary; ··· 313 313 314 314 shared_sint.vector = vmbus_interrupt; 315 315 shared_sint.masked = false; 316 - 317 - /* 318 - * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), 319 - * it doesn't provide a recommendation flag and AEOI must be disabled. 320 - */ 321 - #ifdef HV_DEPRECATING_AEOI_RECOMMENDED 322 - shared_sint.auto_eoi = 323 - !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); 324 - #else 325 - shared_sint.auto_eoi = 0; 326 - #endif 316 + shared_sint.auto_eoi = hv_recommend_using_aeoi(); 327 317 hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); 328 318 329 319 /* Enable the global synic bit */ ··· 357 367 * addresses. 
358 368 */ 359 369 simp.simp_enabled = 0; 360 - if (ms_hyperv.paravisor_present || hv_root_partition) { 370 + if (ms_hyperv.paravisor_present || hv_root_partition()) { 361 371 iounmap(hv_cpu->synic_message_page); 362 372 hv_cpu->synic_message_page = NULL; 363 373 } else { ··· 369 379 siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); 370 380 siefp.siefp_enabled = 0; 371 381 372 - if (ms_hyperv.paravisor_present || hv_root_partition) { 382 + if (ms_hyperv.paravisor_present || hv_root_partition()) { 373 383 iounmap(hv_cpu->synic_event_page); 374 384 hv_cpu->synic_event_page = NULL; 375 385 } else { ··· 423 433 return pending; 424 434 } 425 435 436 + static int hv_pick_new_cpu(struct vmbus_channel *channel) 437 + { 438 + int ret = -EBUSY; 439 + int start; 440 + int cpu; 441 + 442 + lockdep_assert_cpus_held(); 443 + lockdep_assert_held(&vmbus_connection.channel_mutex); 444 + 445 + /* 446 + * We can't assume that the relevant interrupts will be sent before 447 + * the cpu is offlined on older versions of hyperv. 448 + */ 449 + if (vmbus_proto_version < VERSION_WIN10_V5_3) 450 + return -EBUSY; 451 + 452 + start = get_random_u32_below(nr_cpu_ids); 453 + 454 + for_each_cpu_wrap(cpu, cpu_online_mask, start) { 455 + if (channel->target_cpu == cpu || 456 + channel->target_cpu == VMBUS_CONNECT_CPU) 457 + continue; 458 + 459 + ret = vmbus_channel_set_cpu(channel, cpu); 460 + if (!ret) 461 + break; 462 + } 463 + 464 + if (ret) 465 + ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU); 466 + 467 + return ret; 468 + } 469 + 426 470 /* 427 471 * hv_synic_cleanup - Cleanup routine for hv_synic_init(). 428 472 */ 429 473 int hv_synic_cleanup(unsigned int cpu) 430 474 { 431 475 struct vmbus_channel *channel, *sc; 432 - bool channel_found = false; 476 + int ret = 0; 433 477 434 478 if (vmbus_connection.conn_state != CONNECTED) 435 479 goto always_cleanup; ··· 480 456 481 457 /* 482 458 * Search for channels which are bound to the CPU we're about to 483 - * cleanup. 
In case we find one and vmbus is still connected, we 484 - * fail; this will effectively prevent CPU offlining. 485 - * 486 - * TODO: Re-bind the channels to different CPUs. 459 + * cleanup. 487 460 */ 488 461 mutex_lock(&vmbus_connection.channel_mutex); 489 462 list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { 490 463 if (channel->target_cpu == cpu) { 491 - channel_found = true; 492 - break; 464 + ret = hv_pick_new_cpu(channel); 465 + if (ret) { 466 + mutex_unlock(&vmbus_connection.channel_mutex); 467 + return ret; 468 + } 493 469 } 494 470 list_for_each_entry(sc, &channel->sc_list, sc_list) { 495 471 if (sc->target_cpu == cpu) { 496 - channel_found = true; 497 - break; 472 + ret = hv_pick_new_cpu(sc); 473 + if (ret) { 474 + mutex_unlock(&vmbus_connection.channel_mutex); 475 + return ret; 476 + } 498 477 } 499 478 } 500 - if (channel_found) 501 - break; 502 479 } 503 480 mutex_unlock(&vmbus_connection.channel_mutex); 504 481 505 - if (channel_found) 506 - return -EBUSY; 507 - 508 482 /* 509 - * channel_found == false means that any channels that were previously 510 - * assigned to the CPU have been reassigned elsewhere with a call of 511 - * vmbus_send_modifychannel(). Scan the event flags page looking for 512 - * bits that are set and waiting with a timeout for vmbus_chan_sched() 513 - * to process such bits. If bits are still set after this operation 514 - * and VMBus is connected, fail the CPU offlining operation. 483 + * Scan the event flags page looking for bits that are set and waiting 484 + * with a timeout for vmbus_chan_sched() to process such bits. If bits 485 + * are still set after this operation and VMBus is connected, fail the 486 + * CPU offlining operation. 515 487 */ 516 488 if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending()) 517 489 return -EBUSY; ··· 517 497 518 498 hv_synic_disable_regs(cpu); 519 499 520 - return 0; 500 + return ret; 521 501 }
+188 -10
drivers/hv/hv_common.c
··· 31 31 #include <hyperv/hvhdk.h> 32 32 #include <asm/mshyperv.h> 33 33 34 + u64 hv_current_partition_id = HV_PARTITION_ID_SELF; 35 + EXPORT_SYMBOL_GPL(hv_current_partition_id); 36 + 37 + enum hv_partition_type hv_curr_partition_type; 38 + EXPORT_SYMBOL_GPL(hv_curr_partition_type); 39 + 34 40 /* 35 - * hv_root_partition, ms_hyperv and hv_nested are defined here with other 41 + * ms_hyperv and hv_nested are defined here with other 36 42 * Hyper-V specific globals so they are shared across all architectures and are 37 43 * built only when CONFIG_HYPERV is defined. But on x86, 38 44 * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not ··· 46 40 * here, allowing for an overriding definition in the module containing 47 41 * ms_hyperv_init_platform(). 48 42 */ 49 - bool __weak hv_root_partition; 50 - EXPORT_SYMBOL_GPL(hv_root_partition); 51 - 52 43 bool __weak hv_nested; 53 44 EXPORT_SYMBOL_GPL(hv_nested); 54 45 ··· 69 66 static struct ctl_table_header *hv_ctl_table_hdr; 70 67 71 68 /* 69 + * Per-cpu array holding the tail pointer for the SynIC event ring buffer 70 + * for each SINT. 71 + * 72 + * We cannot maintain this in mshv driver because the tail pointer should 73 + * persist even if the mshv driver is unloaded. 74 + */ 75 + u8 * __percpu *hv_synic_eventring_tail; 76 + EXPORT_SYMBOL_GPL(hv_synic_eventring_tail); 77 + 78 + /* 72 79 * Hyper-V specific initialization and shutdown code that is 73 80 * common across all architectures. Called from architecture 74 81 * specific initialization functions. 
··· 100 87 101 88 free_percpu(hyperv_pcpu_input_arg); 102 89 hyperv_pcpu_input_arg = NULL; 90 + 91 + free_percpu(hv_synic_eventring_tail); 92 + hv_synic_eventring_tail = NULL; 103 93 } 104 94 105 95 /* ··· 296 280 297 281 static inline bool hv_output_page_exists(void) 298 282 { 299 - return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); 283 + return hv_root_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); 284 + } 285 + 286 + void __init hv_get_partition_id(void) 287 + { 288 + struct hv_output_get_partition_id *output; 289 + unsigned long flags; 290 + u64 status, pt_id; 291 + 292 + local_irq_save(flags); 293 + output = *this_cpu_ptr(hyperv_pcpu_input_arg); 294 + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, &output); 295 + pt_id = output->partition_id; 296 + local_irq_restore(flags); 297 + 298 + if (hv_result_success(status)) 299 + hv_current_partition_id = pt_id; 300 + else 301 + pr_err("Hyper-V: failed to get partition ID: %#x\n", 302 + hv_result(status)); 300 303 } 301 304 302 305 int __init hv_common_init(void) ··· 383 348 if (hv_output_page_exists()) { 384 349 hyperv_pcpu_output_arg = alloc_percpu(void *); 385 350 BUG_ON(!hyperv_pcpu_output_arg); 351 + } 352 + 353 + if (hv_root_partition()) { 354 + hv_synic_eventring_tail = alloc_percpu(u8 *); 355 + BUG_ON(!hv_synic_eventring_tail); 386 356 } 387 357 388 358 hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index), ··· 478 438 int hv_common_cpu_init(unsigned int cpu) 479 439 { 480 440 void **inputarg, **outputarg; 441 + u8 **synic_eventring_tail; 481 442 u64 msr_vp_index; 482 443 gfp_t flags; 483 444 const int pgcount = hv_output_page_exists() ? 2 : 1; 484 445 void *mem; 485 - int ret; 446 + int ret = 0; 486 447 487 448 /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ 488 449 flags = irqs_disabled() ? 
GFP_ATOMIC : GFP_KERNEL; ··· 491 450 inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); 492 451 493 452 /* 494 - * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already 495 - * allocated if this CPU was previously online and then taken offline 453 + * The per-cpu memory is already allocated if this CPU was previously 454 + * online and then taken offline 496 455 */ 497 456 if (!*inputarg) { 498 457 mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); ··· 539 498 if (msr_vp_index > hv_max_vp_index) 540 499 hv_max_vp_index = msr_vp_index; 541 500 542 - return 0; 501 + if (hv_root_partition()) { 502 + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); 503 + *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, 504 + sizeof(u8), flags); 505 + /* No need to unwind any of the above on failure here */ 506 + if (unlikely(!*synic_eventring_tail)) 507 + ret = -ENOMEM; 508 + } 509 + 510 + return ret; 543 511 } 544 512 545 513 int hv_common_cpu_die(unsigned int cpu) 546 514 { 515 + u8 **synic_eventring_tail; 547 516 /* 548 517 * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory 549 518 * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg ··· 565 514 * If a previously offlined CPU is brought back online again, the 566 515 * originally allocated memory is reused in hv_common_cpu_init(). 
567 516 */ 517 + 518 + synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); 519 + kfree(*synic_eventring_tail); 520 + *synic_eventring_tail = NULL; 568 521 569 522 return 0; 570 523 } ··· 627 572 628 573 bool hv_is_hibernation_supported(void) 629 574 { 630 - return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); 575 + return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4); 631 576 } 632 577 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); 633 578 ··· 680 625 } 681 626 EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); 682 627 628 + void __weak hv_setup_mshv_handler(void (*handler)(void)) 629 + { 630 + } 631 + EXPORT_SYMBOL_GPL(hv_setup_mshv_handler); 632 + 683 633 void __weak hv_setup_kexec_handler(void (*handler)(void)) 684 634 { 685 635 } ··· 721 661 return HV_STATUS_INVALID_PARAMETER; 722 662 } 723 663 EXPORT_SYMBOL_GPL(hv_tdx_hypercall); 664 + 665 + void hv_identify_partition_type(void) 666 + { 667 + /* Assume guest role */ 668 + hv_curr_partition_type = HV_PARTITION_TYPE_GUEST; 669 + /* 670 + * Check partition creation and cpu management privileges 671 + * 672 + * Hyper-V should never specify running as root and as a Confidential 673 + * VM. But to protect against a compromised/malicious Hyper-V trying 674 + * to exploit root behavior to expose Confidential VM memory, ignore 675 + * the root partition setting if also a Confidential VM. 
676 + */ 677 + if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) && 678 + (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && 679 + !(ms_hyperv.priv_high & HV_ISOLATION)) { 680 + pr_info("Hyper-V: running as root partition\n"); 681 + if (IS_ENABLED(CONFIG_MSHV_ROOT)) 682 + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; 683 + else 684 + pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); 685 + } 686 + } 687 + 688 + struct hv_status_info { 689 + char *string; 690 + int errno; 691 + u16 code; 692 + }; 693 + 694 + /* 695 + * Note on the errno mappings: 696 + * A failed hypercall is usually only recoverable (or loggable) near 697 + * the call site where the HV_STATUS_* code is known. So the errno 698 + * it gets converted to is not too useful further up the stack. 699 + * Provide a few mappings that could be useful, and revert to -EIO 700 + * as a fallback. 701 + */ 702 + static const struct hv_status_info hv_status_infos[] = { 703 + #define _STATUS_INFO(status, errno) { #status, (errno), (status) } 704 + _STATUS_INFO(HV_STATUS_SUCCESS, 0), 705 + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL), 706 + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL), 707 + _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO), 708 + _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL), 709 + _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO), 710 + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO), 711 + _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO), 712 + _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO), 713 + _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), 714 + _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), 715 + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), 716 + _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), 717 + _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), 718 + _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL), 719 + _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL), 720 + _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO), 721 + 
_STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO), 722 + _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO), 723 + _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO), 724 + _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO), 725 + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL), 726 + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL), 727 + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO), 728 + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO), 729 + _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO), 730 + _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO), 731 + _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO), 732 + _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO), 733 + #undef _STATUS_INFO 734 + }; 735 + 736 + static inline const struct hv_status_info *find_hv_status_info(u64 hv_status) 737 + { 738 + int i; 739 + u16 code = hv_result(hv_status); 740 + 741 + for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) { 742 + const struct hv_status_info *info = &hv_status_infos[i]; 743 + 744 + if (info->code == code) 745 + return info; 746 + } 747 + 748 + return NULL; 749 + } 750 + 751 + /* Convert a hypercall result into a linux-friendly error code. */ 752 + int hv_result_to_errno(u64 status) 753 + { 754 + const struct hv_status_info *info; 755 + 756 + /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ 757 + if (unlikely(status == U64_MAX)) 758 + return -EOPNOTSUPP; 759 + 760 + info = find_hv_status_info(status); 761 + if (info) 762 + return info->errno; 763 + 764 + return -EIO; 765 + } 766 + EXPORT_SYMBOL_GPL(hv_result_to_errno); 767 + 768 + const char *hv_result_to_string(u64 status) 769 + { 770 + const struct hv_status_info *info; 771 + 772 + if (unlikely(status == U64_MAX)) 773 + return "Hypercall page missing!"; 774 + 775 + info = find_hv_status_info(status); 776 + if (info) 777 + return info->string; 778 + 779 + return "Unknown"; 780 + } 781 + EXPORT_SYMBOL_GPL(hv_result_to_string);
+30
drivers/hv/mshv.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2023, Microsoft Corporation.
 *
 * Shared declarations for the Hyper-V /dev/mshv root-partition driver:
 * batched VP-register hypercall wrappers and pre-guest-mode work.
 */

#ifndef _MSHV_H_
#define _MSHV_H_

#include <linux/stddef.h>
#include <linux/string.h>
#include <hyperv/hvhdk.h>

/* True (non-NULL) if any byte of STRUCT.MEMBER is nonzero. */
#define mshv_field_nonzero(STRUCT, MEMBER) \
	memchr_inv(&((STRUCT).MEMBER), \
		   0, sizeof_field(typeof(STRUCT), MEMBER))

/* Read @count registers of VP @vp_index in @partition_id into @registers. */
int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
			     union hv_input_vtl input_vtl,
			     struct hv_register_assoc *registers);

/* Write @count registers of VP @vp_index in @partition_id from @registers. */
int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
			     union hv_input_vtl input_vtl,
			     struct hv_register_assoc *registers);

/* Query a single partition property; result stored in *@property_value. */
int hv_call_get_partition_property(u64 partition_id, u64 property_code,
				   u64 *property_value);

/* Handle pending thread flags before entering guest mode; 0 or -errno. */
int mshv_do_pre_guest_mode_work(ulong th_flags);

#endif /* _MSHV_H_ */
+161
drivers/hv/mshv_common.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * This file contains functions that will be called from one or more modules.
 * If any of these modules are configured to build, this file is built and just
 * statically linked in.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/mshyperv.h>
#include <linux/resume_user_mode.h>

#include "mshv.h"

/* Max registers per rep hypercall, limited by the per-cpu hypercall pages. */
#define HV_GET_REGISTER_BATCH_SIZE	\
	(HV_HYP_PAGE_SIZE / sizeof(union hv_register_value))
#define HV_SET_REGISTER_BATCH_SIZE	\
	((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \
		/ sizeof(struct hv_register_assoc))

/*
 * Read @count VP registers via HVCALL_GET_VP_REGISTERS, batching through the
 * per-cpu hypercall input/output pages. Interrupts are disabled for the whole
 * sequence because the per-cpu pages must not be reused concurrently.
 * Returns 0 or a negative errno derived from the hypervisor status.
 */
int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
			     union hv_input_vtl input_vtl,
			     struct hv_register_assoc *registers)
{
	struct hv_input_get_vp_registers *input_page;
	union hv_register_value *output_page;
	u16 completed = 0;
	unsigned long remaining = count;
	int rep_count, i;
	u64 status = HV_STATUS_SUCCESS;
	unsigned long flags;

	local_irq_save(flags);

	input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);

	input_page->partition_id = partition_id;
	input_page->vp_index = vp_index;
	input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
	input_page->rsvd_z8 = 0;
	input_page->rsvd_z16 = 0;

	while (remaining) {
		rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE);
		for (i = 0; i < rep_count; ++i)
			input_page->names[i] = registers[i].name;

		status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count,
					     0, input_page, output_page);
		if (!hv_result_success(status))
			break;

		/* The hypervisor may complete fewer reps than requested. */
		completed = hv_repcomp(status);
		for (i = 0; i < completed; ++i)
			registers[i].value = output_page[i];

		registers += completed;
		remaining -= completed;
	}
	local_irq_restore(flags);

	return hv_result_to_errno(status);
}
EXPORT_SYMBOL_GPL(hv_call_get_vp_registers);

/*
 * Write @count VP registers via HVCALL_SET_VP_REGISTERS, batching through the
 * per-cpu hypercall input page with interrupts disabled (same reasoning as
 * hv_call_get_vp_registers()). Returns 0 or a negative errno.
 */
int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
			     union hv_input_vtl input_vtl,
			     struct hv_register_assoc *registers)
{
	struct hv_input_set_vp_registers *input_page;
	u16 completed = 0;
	unsigned long remaining = count;
	int rep_count;
	u64 status = HV_STATUS_SUCCESS;
	unsigned long flags;

	local_irq_save(flags);
	input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);

	input_page->partition_id = partition_id;
	input_page->vp_index = vp_index;
	input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
	input_page->rsvd_z8 = 0;
	input_page->rsvd_z16 = 0;

	while (remaining) {
		rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE);
		memcpy(input_page->elements, registers,
		       sizeof(struct hv_register_assoc) * rep_count);

		status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count,
					     0, input_page, NULL);
		if (!hv_result_success(status))
			break;

		completed = hv_repcomp(status);
		registers += completed;
		remaining -= completed;
	}

	local_irq_restore(flags);

	return hv_result_to_errno(status);
}
EXPORT_SYMBOL_GPL(hv_call_set_vp_registers);

/*
 * Fetch one partition property via HVCALL_GET_PARTITION_PROPERTY.
 * *@property_value is written only on success. Returns 0 or -errno.
 */
int hv_call_get_partition_property(u64 partition_id,
				   u64 property_code,
				   u64 *property_value)
{
	u64 status;
	unsigned long flags;
	struct hv_input_get_partition_property *input;
	struct hv_output_get_partition_property *output;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
	memset(input, 0, sizeof(*input));
	input->partition_id = partition_id;
	input->property_code = property_code;
	status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output);

	if (!hv_result_success(status)) {
		local_irq_restore(flags);
		return hv_result_to_errno(status);
	}
	*property_value = output->property_value;

	local_irq_restore(flags);

	return 0;
}
EXPORT_SYMBOL_GPL(hv_call_get_partition_property);

/*
 * Handle any pre-processing before going into the guest mode on this cpu, most
 * notably call schedule(). Must be invoked with both preemption and
 * interrupts enabled.
 *
 * Returns: 0 on success, -errno on error.
 */
int mshv_do_pre_guest_mode_work(ulong th_flags)
{
	if (th_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
		return -EINTR;

	if (th_flags & _TIF_NEED_RESCHED)
		schedule();

	if (th_flags & _TIF_NOTIFY_RESUME)
		resume_user_mode_work(NULL);

	return 0;
}
EXPORT_SYMBOL_GPL(mshv_do_pre_guest_mode_work);
+833
drivers/hv/mshv_eventfd.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * eventfd support for mshv
 *
 * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic
 * framework code is taken from the kvm implementation.
 *
 * All credits to kvm developers.
 */

#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/eventfd.h>

#if IS_ENABLED(CONFIG_X86_64)
#include <asm/apic.h>
#endif
#include <asm/mshyperv.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

/* Workqueue on which deactivated irqfds are torn down asynchronously. */
static struct workqueue_struct *irqfd_cleanup_wq;

void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
				    struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
	mutex_unlock(&partition->pt_irq_lock);
}

void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
				      struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_del_init_rcu(&mian->link);
	mutex_unlock(&partition->pt_irq_lock);
	/* Wait for in-flight RCU readers before the caller frees @mian. */
	synchronize_rcu();
}

/* Invoke every ack notifier registered for @gsi; true if any matched. */
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
{
	struct mshv_irq_ack_notifier *mian;
	bool acked = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
				 link) {
		if (mian->irq_ack_gsi == gsi) {
			mian->irq_acked(mian);
			acked = true;
		}
	}
	rcu_read_unlock();

	return acked;
}

#if IS_ENABLED(CONFIG_ARM64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return false;
}
#elif IS_ENABLED(CONFIG_X86_64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return type == HV_X64_INTERRUPT_TYPE_EXTINT;
}
#endif

static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
{
	struct mshv_irqfd_resampler *resampler;
	struct mshv_partition *partition;
	struct mshv_irqfd *irqfd;
	int idx;

	resampler = container_of(mian, struct mshv_irqfd_resampler,
				 rsmplr_notifier);
	partition = resampler->rsmplr_partn;

	idx = srcu_read_lock(&partition->pt_irq_srcu);

	hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
				 irqfd_resampler_hnode) {
		if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
			hv_call_clear_virtual_interrupt(partition->pt_id);

		/* Let userspace know the level-triggered line was acked. */
		eventfd_signal(irqfd->irqfd_resamplefd);
	}

	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

#if IS_ENABLED(CONFIG_X86_64)
static bool
mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
			    u32 vector)
{
	int i;

	for (i = 0; i < iv.vector_count; i++) {
		if (iv.vector[i] == vector)
			return true;
	}

	return false;
}

/*
 * Publish @vector in the VP's shared interrupt-vector array via one cmpxchg.
 * 0 on success (or already present), -ENOSPC if full, -EAGAIN on CAS race.
 */
static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
{
	union hv_vp_register_page_interrupt_vectors iv, new_iv;

	iv = vp->vp_register_page->interrupt_vectors;
	new_iv = iv;

	if (mshv_vp_irq_vector_injected(iv, vector))
		return 0;

	if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
		return -ENOSPC;

	new_iv.vector[new_iv.vector_count++] = vector;

	if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
		    iv.as_uint64, new_iv.as_uint64) != iv.as_uint64)
		return -EAGAIN;

	return 0;
}

static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
{
	int ret;

	/* Retry CAS races, but give up once this task needs to reschedule. */
	do {
		ret = mshv_vp_irq_try_set_vector(vp, vector);
	} while (ret == -EAGAIN && !need_resched());

	return ret;
}

/*
 * Try to raise irq for guest via shared vector array. hyp does the actual
 * inject of the interrupt.
 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	struct mshv_vp *vp;

	if (!(ms_hyperv.ext_features &
	      HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
		return -EOPNOTSUPP;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return -EOPNOTSUPP;

	/* Fast path only handles physical destination mode. */
	if (irq->lapic_control.logical_dest_mode)
		return -EOPNOTSUPP;

	vp = partition->pt_vp_array[irq->lapic_apic_id];

	if (!vp->vp_register_page)
		return -EOPNOTSUPP;

	if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
		return -EINVAL;

	if (vp->run.flags.root_sched_dispatched &&
	    vp->vp_register_page->interrupt_vectors.as_uint64)
		return -EBUSY;

	wake_up(&vp->run.vp_suspend_queue);

	return 0;
}
#else /* CONFIG_X86_64 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	return -EOPNOTSUPP;
}
#endif

/* Slow path: ask the hypervisor to assert the interrupt directly. */
static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	unsigned int seq;
	int idx;

	WARN_ON(irqfd->irqfd_resampler &&
		!irq->lapic_control.level_triggered);

	idx = srcu_read_lock(&partition->pt_irq_srcu);
	if (irqfd->irqfd_girq_ent.guest_irq_num) {
		if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
			srcu_read_unlock(&partition->pt_irq_srcu, idx);
			return;
		}

		/* Wait out any concurrent routing update before asserting. */
		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
	}

	hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
					 irq->lapic_vector, irq->lapic_apic_id,
					 irq->lapic_control);
	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
{
	struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
	struct mshv_partition *pt = rp->rsmplr_partn;

	mutex_lock(&pt->irqfds_resampler_lock);

	hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
	synchronize_srcu(&pt->pt_irq_srcu);

	/* Last irqfd on this gsi: drop the shared resampler too. */
	if (hlist_empty(&rp->rsmplr_irqfd_list)) {
		hlist_del(&rp->rsmplr_hnode);
		mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
		kfree(rp);
	}

	mutex_unlock(&pt->irqfds_resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void mshv_irqfd_shutdown(struct work_struct *work)
{
	struct mshv_irqfd *irqfd =
		container_of(work, struct mshv_irqfd, irqfd_shutdown);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait);

	if (irqfd->irqfd_resampler) {
		mshv_irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->irqfd_resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
	eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
	kfree(irqfd);
}

/* assumes partition->pt_irqfds_lock is held */
static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
{
	return !hlist_unhashed(&irqfd->irqfd_hnode);
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes partition->pt_irqfds_lock is held
 */
static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
{
	if (!mshv_irqfd_is_active(irqfd))
		return;

	hlist_del(&irqfd->irqfd_hnode);

	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
			     int sync, void *key)
{
	struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
						irqfd_wait);
	unsigned long flags = (unsigned long)key;
	int idx;
	unsigned int seq;
	struct mshv_partition *pt = irqfd->irqfd_partn;
	int ret = 0;

	if (flags & POLLIN) {
		u64 cnt;

		eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
		idx = srcu_read_lock(&pt->pt_irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));

		/* An event has been signaled, raise an interrupt */
		ret = mshv_try_assert_irq_fast(irqfd);
		if (ret)
			mshv_assert_irq_slow(irqfd);

		srcu_read_unlock(&pt->pt_irq_srcu, idx);

		ret = 1;
	}

	if (flags & POLLHUP) {
		/* The eventfd is closing, detach from the partition */
		unsigned long flags;

		spin_lock_irqsave(&pt->pt_irqfds_lock, flags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the pt_irqfds_lock since the item is
		 * deactivated from the mshv side before it is unhooked from
		 * the wait-queue. If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (mshv_irqfd_is_active(irqfd))
			mshv_irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
	}

	return ret;
}

/* Must be called under pt_irqfds_lock */
static void mshv_irqfd_update(struct mshv_partition *pt,
			      struct mshv_irqfd *irqfd)
{
	write_seqcount_begin(&irqfd->irqfd_irqe_sc);
	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
						    irqfd->irqfd_irqnum);
	mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
	write_seqcount_end(&irqfd->irqfd_irqe_sc);
}

void mshv_irqfd_routing_update(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;

	spin_lock_irq(&pt->pt_irqfds_lock);
	hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_update(pt, irqfd);
	spin_unlock_irq(&pt->pt_irqfds_lock);
}

static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
				  poll_table *polltbl)
{
	struct mshv_irqfd *irqfd =
		container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);

	irqfd->irqfd_wqh = wqh;
	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}

static int mshv_irqfd_assign(struct mshv_partition *pt,
			     struct mshv_user_irqfd *args)
{
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	struct mshv_irqfd *irqfd, *tmp;
	unsigned int events;
	struct fd f;
	int ret;
	int idx;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
	if (!irqfd)
		return -ENOMEM;

	irqfd->irqfd_partn = pt;
	irqfd->irqfd_irqnum = args->gsi;
	INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);

	f = fdget(args->fd);
	if (!fd_file(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->irqfd_eventfd_ctx = eventfd;

	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
		struct mshv_irqfd_resampler *rp;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->irqfd_resamplefd = resamplefd;

		mutex_lock(&pt->irqfds_resampler_lock);

		/* Reuse an existing resampler for this gsi if there is one. */
		hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
				     rsmplr_hnode) {
			if (rp->rsmplr_notifier.irq_ack_gsi ==
							 irqfd->irqfd_irqnum) {
				irqfd->irqfd_resampler = rp;
				break;
			}
		}

		if (!irqfd->irqfd_resampler) {
			rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT);
			if (!rp) {
				ret = -ENOMEM;
				mutex_unlock(&pt->irqfds_resampler_lock);
				goto fail;
			}

			rp->rsmplr_partn = pt;
			INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
			rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
			rp->rsmplr_notifier.irq_acked =
						      mshv_irqfd_resampler_ack;

			hlist_add_head(&rp->rsmplr_hnode,
				       &pt->irqfds_resampler_list);
			mshv_register_irq_ack_notifier(pt,
						       &rp->rsmplr_notifier);
			irqfd->irqfd_resampler = rp;
		}

		hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
				   &irqfd->irqfd_resampler->rsmplr_irqfd_list);

		mutex_unlock(&pt->irqfds_resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);

	spin_lock_irq(&pt->pt_irqfds_lock);
	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
		/*
		 * Resample Fd must be for level triggered interrupt
		 * Otherwise return with failure
		 */
		spin_unlock_irq(&pt->pt_irqfds_lock);
		ret = -EINVAL;
		goto fail;
	}
	ret = 0;
	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&pt->pt_irqfds_lock);
		goto fail;
	}

	idx = srcu_read_lock(&pt->pt_irq_srcu);
	mshv_irqfd_update(pt, irqfd);
	hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);

	if (events & POLLIN)
		mshv_assert_irq_slow(irqfd);

	srcu_read_unlock(&pt->pt_irq_srcu, idx);
	/*
	 * do not drop the file until the irqfd is fully initialized, otherwise
	 * we might race against the POLLHUP
	 */
	fdput(f);

	return 0;

fail:
	if (irqfd->irqfd_resampler)
		mshv_irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	fdput(f);

out:
	kfree(irqfd);
	return ret;
}

/*
 * shutdown any irqfd's that match fd+gsi
 */
static int mshv_irqfd_deassign(struct mshv_partition *pt,
			       struct mshv_user_irqfd *args)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
				  irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx == eventfd &&
		    irqfd->irqfd_irqnum == args->gsi)

			mshv_irqfd_deactivate(irqfd);
	}

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int mshv_set_unset_irqfd(struct mshv_partition *pt,
			 struct mshv_user_irqfd *args)
{
	if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
		return -EINVAL;

	if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
		return mshv_irqfd_deassign(pt, args);

	return mshv_irqfd_assign(pt, args);
}

/*
 * This function is called as the mshv VM fd is being released.
 * Shutdown all irqfds that still remain open
 */
static void mshv_irqfd_release(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;

	spin_lock_irq(&pt->pt_irqfds_lock);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_deactivate(irqfd);

	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a mshv_partition* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

int mshv_irqfd_wq_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void mshv_irqfd_wq_cleanup(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a MMIO memory write to an eventfd signal.
 *
 * userspace can register a MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
{
	if (p->iovntfd_doorbell_id > 0)
		mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
	eventfd_ctx_put(p->iovntfd_eventfd);
	kfree(p);
}

/* MMIO writes trigger an event if the addr/val match */
static void ioeventfd_mmio_write(int doorbell_id, void *data)
{
	struct mshv_partition *partition = (struct mshv_partition *)data;
	struct mshv_ioeventfd *p;

	rcu_read_lock();
	hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
		if (p->iovntfd_doorbell_id == doorbell_id) {
			eventfd_signal(p->iovntfd_eventfd);
			break;
		}

	rcu_read_unlock();
}

static bool ioeventfd_check_collision(struct mshv_partition *pt,
				      struct mshv_ioeventfd *p)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *_p;

	hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
		if (_p->iovntfd_addr == p->iovntfd_addr &&
		    _p->iovntfd_length == p->iovntfd_length &&
		    (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
		     _p->iovntfd_datamatch == p->iovntfd_datamatch))
			return true;

	return false;
}

static int mshv_assign_ioeventfd(struct mshv_partition *pt,
				 struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	u64 doorbell_flags = 0;
	int ret;

	/* This mutex is currently protecting ioeventfd.items list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	/* must be natural-word sized */
	switch (args->len) {
	case 0:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
		break;
	case 1:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
		break;
	case 2:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
		break;
	case 4:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
		break;
	case 8:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
		return -EINVAL;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	p->iovntfd_addr = args->addr;
	p->iovntfd_length = args->len;
	p->iovntfd_eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
		p->iovntfd_datamatch = args->datamatch;
	} else {
		p->iovntfd_wildcard = true;
		doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
	}

	if (ioeventfd_check_collision(pt, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
				     (void *)pt, p->iovntfd_addr,
				     p->iovntfd_datamatch, doorbell_flags);
	if (ret < 0)
		goto unlock_fail;

	p->iovntfd_doorbell_id = ret;

	hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);

	return 0;

unlock_fail:
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
				   struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct hlist_node *n;
	int ret = -ENOENT;

	/* This mutex is currently protecting ioeventfd.items list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
		bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));

		if (p->iovntfd_eventfd != eventfd ||
		    p->iovntfd_addr != args->addr ||
		    p->iovntfd_length != args->len ||
		    p->iovntfd_wildcard != wildcard)
			continue;

		if (!p->iovntfd_wildcard &&
		    p->iovntfd_datamatch != args->datamatch)
			continue;

		hlist_del_rcu(&p->iovntfd_hnode);
		synchronize_rcu();
		ioeventfd_release(p, pt->pt_id);
		ret = 0;
		break;
	}

	eventfd_ctx_put(eventfd);

	return ret;
}

int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
			     struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
	    mshv_field_nonzero(*args, rsvd))
		return -EINVAL;

	/* PIO not yet implemented */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
		return mshv_deassign_ioeventfd(pt, args);

	return mshv_assign_ioeventfd(pt, args);
}

void mshv_eventfd_init(struct mshv_partition *pt)
{
	spin_lock_init(&pt->pt_irqfds_lock);
	INIT_HLIST_HEAD(&pt->pt_irqfds_list);

	INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
	mutex_init(&pt->irqfds_resampler_lock);

	INIT_HLIST_HEAD(&pt->ioeventfds_list);
}

void mshv_eventfd_release(struct mshv_partition *pt)
{
	struct hlist_head items;
	struct hlist_node *n;
	struct mshv_ioeventfd *p;

	/* Detach the whole ioeventfd list, then free entries after a grace period. */
	hlist_move_list(&pt->ioeventfds_list, &items);
	synchronize_rcu();

	hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
		hlist_del(&p->iovntfd_hnode);
		ioeventfd_release(p, pt->pt_id);
	}

	mshv_irqfd_release(pt);
}
+71
drivers/hv/mshv_eventfd.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * irqfd: Allows an fd to be used to inject an interrupt to the guest.
 * ioeventfd: Allow an fd to be used to receive a signal from the guest.
 * All credit goes to kvm developers.
 */

#ifndef __LINUX_MSHV_EVENTFD_H
#define __LINUX_MSHV_EVENTFD_H

#include <linux/poll.h>

#include "mshv.h"
#include "mshv_root.h"

/* struct to contain list of irqfds sharing an irq. Updates are protected by
 * partition.irqfds.resampler_lock
 */
struct mshv_irqfd_resampler {
	struct mshv_partition *rsmplr_partn;	/* owning partition */
	struct hlist_head rsmplr_irqfd_list;	/* irqfds sharing this irq */
	struct mshv_irq_ack_notifier rsmplr_notifier;
	struct hlist_node rsmplr_hnode;		/* entry in partition's resampler list */
};

/* One irqfd: an eventfd whose signal injects an interrupt into the guest */
struct mshv_irqfd {
	struct mshv_partition *irqfd_partn;	/* partition to inject into */
	struct eventfd_ctx *irqfd_eventfd_ctx;	/* the signaling eventfd */
	struct mshv_guest_irq_ent irqfd_girq_ent;	/* routing entry snapshot */
	seqcount_spinlock_t irqfd_irqe_sc;	/* protects the irq entry update */
	u32 irqfd_irqnum;
	struct mshv_lapic_irq irqfd_lapic_irq;	/* decoded lapic destination */
	struct hlist_node irqfd_hnode;		/* entry in partition's irqfd list */
	poll_table irqfd_polltbl;		/* for waiting on the eventfd */
	wait_queue_head_t *irqfd_wqh;
	wait_queue_entry_t irqfd_wait;
	struct work_struct irqfd_shutdown;	/* deferred teardown work */
	struct mshv_irqfd_resampler *irqfd_resampler;	/* set for level-triggered */
	struct eventfd_ctx *irqfd_resamplefd;	/* notified on irq ack */
	struct hlist_node irqfd_resampler_hnode;
};

void mshv_eventfd_init(struct mshv_partition *partition);
void mshv_eventfd_release(struct mshv_partition *partition);

void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
				    struct mshv_irq_ack_notifier *mian);
void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
				      struct mshv_irq_ack_notifier *mian);
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi);

int mshv_set_unset_irqfd(struct mshv_partition *partition,
			 struct mshv_user_irqfd *args);

int mshv_irqfd_wq_init(void);
void mshv_irqfd_wq_cleanup(void);

/* One ioeventfd: a guest write to a registered address signals the eventfd */
struct mshv_ioeventfd {
	struct hlist_node iovntfd_hnode;	/* entry in partition's ioeventfd list */
	u64 iovntfd_addr;			/* guest physical address */
	int iovntfd_length;			/* access length to match */
	struct eventfd_ctx *iovntfd_eventfd;	/* eventfd to signal */
	u64 iovntfd_datamatch;			/* value to match (unless wildcard) */
	int iovntfd_doorbell_id;		/* hypervisor doorbell port id */
	bool iovntfd_wildcard;			/* true: any written value matches */
};

int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
			     struct mshv_user_ioeventfd *args);

#endif /* __LINUX_MSHV_EVENTFD_H */
+124
drivers/hv/mshv_irq.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2023, Microsoft Corporation.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/mshyperv.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

/* called from the ioctl code, user wants to update the guest irq table */
int mshv_update_routing_table(struct mshv_partition *partition,
			      const struct mshv_user_irq_entry *ue,
			      unsigned int numents)
{
	struct mshv_girq_routing_table *new = NULL, *old;
	u32 i, nr_rt_entries = 0;
	int r = 0;

	/* numents == 0 means "clear the table": publish a NULL table */
	if (numents == 0)
		goto swap_routes;

	/* Validate every entry before allocating; table is sized by max gsi */
	for (i = 0; i < numents; i++) {
		if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS)
			return -EINVAL;

		if (ue[i].address_hi)
			return -EINVAL;

		nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
	}
	nr_rt_entries += 1;

	new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries),
		      GFP_KERNEL_ACCOUNT);
	if (!new)
		return -ENOMEM;

	new->num_rt_entries = nr_rt_entries;
	for (i = 0; i < numents; i++) {
		struct mshv_guest_irq_ent *girq;

		girq = &new->mshv_girq_info_tbl[ue[i].gsi];

		/*
		 * Allow only one to one mapping between GSI and MSI routing.
		 */
		if (girq->guest_irq_num != 0) {
			r = -EINVAL;
			goto out;
		}

		girq->guest_irq_num = ue[i].gsi;
		girq->girq_addr_lo = ue[i].address_lo;
		girq->girq_addr_hi = ue[i].address_hi;
		girq->girq_irq_data = ue[i].data;
		girq->girq_entry_valid = true;
	}

swap_routes:
	/*
	 * Publish the new table under pt_irq_lock, then wait out SRCU
	 * readers (mshv_ret_girq_entry()) before freeing the old table.
	 */
	mutex_lock(&partition->pt_irq_lock);
	old = rcu_dereference_protected(partition->pt_girq_tbl, 1);
	rcu_assign_pointer(partition->pt_girq_tbl, new);
	mshv_irqfd_routing_update(partition);
	mutex_unlock(&partition->pt_irq_lock);

	synchronize_srcu_expedited(&partition->pt_irq_srcu);
	/* On success, free the *old* table via the shared kfree below */
	new = old;

out:
	kfree(new);

	return r;
}

/* vm is going away, kfree the irq routing table */
void mshv_free_routing_table(struct mshv_partition *partition)
{
	struct mshv_girq_routing_table *rt =
				rcu_access_pointer(partition->pt_girq_tbl);

	/* No grace-period wait here: callers run after all readers are gone */
	kfree(rt);
}

/*
 * Return (by value) the routing entry for @irqnum, or a zeroed entry with
 * only guest_irq_num set if the table is absent or too small. Callers must
 * hold pt_irq_srcu or pt_irq_lock (checked by srcu_dereference_check()).
 */
struct mshv_guest_irq_ent
mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum)
{
	struct mshv_guest_irq_ent entry = { 0 };
	struct mshv_girq_routing_table *girq_tbl;

	girq_tbl = srcu_dereference_check(partition->pt_girq_tbl,
					  &partition->pt_irq_srcu,
					  lockdep_is_held(&partition->pt_irq_lock));
	if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) {
		/*
		 * Premature register_irqfd, setting valid_entry = 0
		 * would ignore this entry anyway
		 */
		entry.guest_irq_num = irqnum;
		return entry;
	}

	return girq_tbl->mshv_girq_info_tbl[irqnum];
}

/*
 * Decode a routing entry's MSI address/data into a lapic irq descriptor.
 * An invalid/NULL entry yields an all-zero (ignored) result.
 * NOTE(review): the bit layout below matches the x86 MSI address/data
 * format (vector, delivery mode, trigger, dest mode) — confirm for
 * non-x86 targets.
 */
void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
			 struct mshv_lapic_irq *lirq)
{
	memset(lirq, 0, sizeof(*lirq));
	if (!ent || !ent->girq_entry_valid)
		return;

	lirq->lapic_vector = ent->girq_irq_data & 0xFF;
	lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
	lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
	lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
	lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
}
+83
drivers/hv/mshv_portid_table.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/types.h> 3 + #include <linux/mm.h> 4 + #include <linux/slab.h> 5 + #include <linux/idr.h> 6 + #include <asm/mshyperv.h> 7 + 8 + #include "mshv.h" 9 + #include "mshv_root.h" 10 + 11 + /* 12 + * Ports and connections are hypervisor struct used for inter-partition 13 + * communication. Port represents the source and connection represents 14 + * the destination. Partitions are responsible for managing the port and 15 + * connection ids. 16 + * 17 + */ 18 + 19 + #define PORTID_MIN 1 20 + #define PORTID_MAX INT_MAX 21 + 22 + static DEFINE_IDR(port_table_idr); 23 + 24 + void 25 + mshv_port_table_fini(void) 26 + { 27 + struct port_table_info *port_info; 28 + unsigned long i, tmp; 29 + 30 + idr_lock(&port_table_idr); 31 + if (!idr_is_empty(&port_table_idr)) { 32 + idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) { 33 + port_info = idr_remove(&port_table_idr, i); 34 + kfree_rcu(port_info, portbl_rcu); 35 + } 36 + } 37 + idr_unlock(&port_table_idr); 38 + } 39 + 40 + int 41 + mshv_portid_alloc(struct port_table_info *info) 42 + { 43 + int ret = 0; 44 + 45 + idr_lock(&port_table_idr); 46 + ret = idr_alloc(&port_table_idr, info, PORTID_MIN, 47 + PORTID_MAX, GFP_KERNEL); 48 + idr_unlock(&port_table_idr); 49 + 50 + return ret; 51 + } 52 + 53 + void 54 + mshv_portid_free(int port_id) 55 + { 56 + struct port_table_info *info; 57 + 58 + idr_lock(&port_table_idr); 59 + info = idr_remove(&port_table_idr, port_id); 60 + WARN_ON(!info); 61 + idr_unlock(&port_table_idr); 62 + 63 + synchronize_rcu(); 64 + kfree(info); 65 + } 66 + 67 + int 68 + mshv_portid_lookup(int port_id, struct port_table_info *info) 69 + { 70 + struct port_table_info *_info; 71 + int ret = -ENOENT; 72 + 73 + rcu_read_lock(); 74 + _info = idr_find(&port_table_idr, port_id); 75 + rcu_read_unlock(); 76 + 77 + if (_info) { 78 + *info = *_info; 79 + ret = 0; 80 + } 81 + 82 + return ret; 83 + }
+311
drivers/hv/mshv_root.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2023, Microsoft Corporation.
 */

#ifndef _MSHV_ROOT_H_
#define _MSHV_ROOT_H_

#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/semaphore.h>
#include <linux/sched.h>
#include <linux/srcu.h>
#include <linux/wait.h>
#include <linux/hashtable.h>
#include <linux/dev_printk.h>
#include <linux/build_bug.h>
#include <uapi/linux/mshv.h>

/*
 * Hypervisor must be between these version numbers (inclusive)
 * to guarantee compatibility
 */
#define MSHV_HV_MIN_VERSION		(27744)
#define MSHV_HV_MAX_VERSION		(27751)

static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE);

#define MSHV_MAX_VPS			256

#define MSHV_PARTITIONS_HASH_BITS	9

#define MSHV_PIN_PAGES_BATCH_SIZE	(0x10000000ULL / HV_HYP_PAGE_SIZE)

/* One virtual processor of a guest partition */
struct mshv_vp {
	u32 vp_index;
	struct mshv_partition *vp_partition;	/* owning partition */
	struct mutex vp_mutex;
	struct hv_vp_register_page *vp_register_page;
	struct hv_message *vp_intercept_msg_page;
	void *vp_ghcb_page;			/* only for SNP-isolated guests */
	struct hv_stats_page *vp_stats_pages[2];
	/* run-state shared between ioctl path and the hypervisor ISR */
	struct {
		atomic64_t vp_signaled_count;
		struct {
			u64 intercept_suspend: 1;
			u64 root_sched_blocked: 1; /* root scheduler only */
			u64 root_sched_dispatched: 1; /* root scheduler only */
			u64 reserved: 61;
		} flags;
		unsigned int kicked_by_hv;
		wait_queue_head_t vp_suspend_queue;
	} run;
};

/* dev_printk-style helpers tagged with partition id and vp index */
#define vp_fmt(fmt) "p%lluvp%u: " fmt
#define vp_devprintk(level, v, fmt, ...) \
do { \
	const struct mshv_vp *__vp = (v); \
	const struct mshv_partition *__pt = __vp->vp_partition; \
	dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \
		    __vp->vp_index, ##__VA_ARGS__); \
} while (0)
#define vp_emerg(v, fmt, ...)	vp_devprintk(emerg, v, fmt, ##__VA_ARGS__)
#define vp_crit(v, fmt, ...)	vp_devprintk(crit, v, fmt, ##__VA_ARGS__)
#define vp_alert(v, fmt, ...)	vp_devprintk(alert, v, fmt, ##__VA_ARGS__)
#define vp_err(v, fmt, ...)	vp_devprintk(err, v, fmt, ##__VA_ARGS__)
#define vp_warn(v, fmt, ...)	vp_devprintk(warn, v, fmt, ##__VA_ARGS__)
#define vp_notice(v, fmt, ...)	vp_devprintk(notice, v, fmt, ##__VA_ARGS__)
#define vp_info(v, fmt, ...)	vp_devprintk(info, v, fmt, ##__VA_ARGS__)
#define vp_dbg(v, fmt, ...)	vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)

/* One contiguous guest-physical memory range mapped into a partition */
struct mshv_mem_region {
	struct hlist_node hnode;
	u64 nr_pages;
	u64 start_gfn;		/* first guest frame number */
	u64 start_uaddr;	/* backing userspace address */
	u32 hv_map_flags;
	struct {
		u64 large_pages: 1; /* 2MiB */
		u64 range_pinned: 1;
		u64 reserved: 62;
	} flags;
	struct mshv_partition *partition;
	struct page *pages[];	/* flexible array of pinned page structs */
};

struct mshv_irq_ack_notifier {
	struct hlist_node link;
	unsigned int irq_ack_gsi;
	void (*irq_acked)(struct mshv_irq_ack_notifier *mian);
};

/* Root-partition-side representation of one guest partition */
struct mshv_partition {
	struct device *pt_module_dev;

	struct hlist_node pt_hnode;	/* entry in mshv_root.pt_htable */
	u64 pt_id;			/* hypervisor partition id */
	refcount_t pt_ref_count;
	struct mutex pt_mutex;
	struct hlist_head pt_mem_regions; // not ordered

	u32 pt_vp_count;
	struct mshv_vp *pt_vp_array[MSHV_MAX_VPS];

	struct mutex pt_irq_lock;	/* serializes routing-table updates */
	struct srcu_struct pt_irq_srcu;	/* protects pt_girq_tbl readers */
	struct hlist_head irq_ack_notifier_list;

	struct hlist_head pt_devices;

	/*
	 * MSHV does not support more than one async hypercall in flight
	 * for a single partition. Thus, it is okay to define per partition
	 * async hypercall status.
	 */
	struct completion async_hypercall;
	u64 async_hypercall_status;

	spinlock_t pt_irqfds_lock;
	struct hlist_head pt_irqfds_list;
	struct mutex irqfds_resampler_lock;
	struct hlist_head irqfds_resampler_list;

	struct hlist_head ioeventfds_list;

	struct mshv_girq_routing_table __rcu *pt_girq_tbl;
	u64 isolation_type;
	bool import_completed;
	bool pt_initialized;
};

/* dev_printk-style helpers tagged with partition id */
#define pt_fmt(fmt) "p%llu: " fmt
#define pt_devprintk(level, p, fmt, ...) \
do { \
	const struct mshv_partition *__pt = (p); \
	dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \
		    ##__VA_ARGS__); \
} while (0)
#define pt_emerg(p, fmt, ...)	pt_devprintk(emerg, p, fmt, ##__VA_ARGS__)
#define pt_crit(p, fmt, ...)	pt_devprintk(crit, p, fmt, ##__VA_ARGS__)
#define pt_alert(p, fmt, ...)	pt_devprintk(alert, p, fmt, ##__VA_ARGS__)
#define pt_err(p, fmt, ...)	pt_devprintk(err, p, fmt, ##__VA_ARGS__)
#define pt_warn(p, fmt, ...)	pt_devprintk(warn, p, fmt, ##__VA_ARGS__)
#define pt_notice(p, fmt, ...)	pt_devprintk(notice, p, fmt, ##__VA_ARGS__)
#define pt_info(p, fmt, ...)	pt_devprintk(info, p, fmt, ##__VA_ARGS__)
#define pt_dbg(p, fmt, ...)	pt_devprintk(dbg, p, fmt, ##__VA_ARGS__)

struct mshv_lapic_irq {
	u32 lapic_vector;
	u64 lapic_apic_id;
	union hv_interrupt_control lapic_control;
};

#define MSHV_MAX_GUEST_IRQS		4096

/* representation of one guest irq entry, either msi or legacy */
struct mshv_guest_irq_ent {
	u32 girq_entry_valid;	/* vfio looks at this */
	u32 guest_irq_num;	/* a unique number for each irq */
	u32 girq_addr_lo;	/* guest irq msi address info */
	u32 girq_addr_hi;
	u32 girq_irq_data;	/* idt vector in some cases */
};

/* RCU-published irq routing table, indexed by gsi; see mshv_irq.c */
struct mshv_girq_routing_table {
	u32 num_rt_entries;
	struct mshv_guest_irq_ent mshv_girq_info_tbl[];
};

/* Per-cpu hypervisor synic pages mapped for the root partition */
struct hv_synic_pages {
	struct hv_message_page *synic_message_page;
	struct hv_synic_event_flags_page *synic_event_flags_page;
	struct hv_synic_event_ring_page *synic_event_ring_page;
};

/* Module-global state: per-cpu synic pages and the partition hash table */
struct mshv_root {
	struct hv_synic_pages __percpu *synic_pages;
	spinlock_t pt_ht_lock;	/* protects pt_htable */
	DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
};

/*
 * Callback for doorbell events.
 * NOTE: This is called in interrupt context. Callback
 * should defer slow and sleeping logic to later.
 */
typedef void (*doorbell_cb_t) (int doorbell_id, void *);

/*
 * port table information
 */
struct port_table_info {
	struct rcu_head portbl_rcu;	/* for deferred free; see mshv_portid_table.c */
	enum hv_port_type hv_port_type;
	union {
		struct {
			u64 reserved[2];
		} hv_port_message;
		struct {
			u64 reserved[2];
		} hv_port_event;
		struct {
			u64 reserved[2];
		} hv_port_monitor;
		struct {
			doorbell_cb_t doorbell_cb;
			void *data;
		} hv_port_doorbell;
	};
};

int mshv_update_routing_table(struct mshv_partition *partition,
			      const struct mshv_user_irq_entry *entries,
			      unsigned int numents);
void mshv_free_routing_table(struct mshv_partition *partition);

struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition,
					      u32 irq_num);

void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq,
			 struct mshv_lapic_irq *dest_irq);

void mshv_irqfd_routing_update(struct mshv_partition *partition);

void mshv_port_table_fini(void);
int mshv_portid_alloc(struct port_table_info *info);
int mshv_portid_lookup(int port_id, struct port_table_info *info);
void mshv_portid_free(int port_id);

int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
			   void *data, u64 gpa, u64 val, u64 flags);
void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);

void mshv_isr(void);
int mshv_synic_init(unsigned int cpu);
int mshv_synic_cleanup(unsigned int cpu);

/* True if the partition runs with SNP hardware isolation */
static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
{
	return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP;
}

struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
void mshv_partition_put(struct mshv_partition *partition);
struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU);

/* hypercalls */

int hv_call_withdraw_memory(u64 count, int node, u64 partition_id);
int hv_call_create_partition(u64 flags,
			     struct hv_partition_creation_properties creation_properties,
			     union hv_partition_isolation_properties isolation_properties,
			     u64 *partition_id);
int hv_call_initialize_partition(u64 partition_id);
int hv_call_finalize_partition(u64 partition_id);
int hv_call_delete_partition(u64 partition_id);
int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs);
int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
			  u32 flags, struct page **pages);
int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
			    u32 flags);
int hv_call_delete_vp(u64 partition_id, u32 vp_index);
int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
				     u64 dest_addr,
				     union hv_interrupt_control control);
int hv_call_clear_virtual_interrupt(u64 partition_id);
int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
				  union hv_gpa_page_access_state_flags state_flags,
				  int *written_total,
				  union hv_gpa_page_access_state *states);
int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
			 struct hv_vp_state_data state_data,
			 /* Choose between pages and ret_output */
			 u64 page_count, struct page **pages,
			 union hv_output_get_vp_state *ret_output);
int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
			 /* Choose between pages and bytes */
			 struct hv_vp_state_data state_data, u64 page_count,
			 struct page **pages, u32 num_bytes, u8 *bytes);
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
			      union hv_input_vtl input_vtl,
			      struct page **state_page);
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
				union hv_input_vtl input_vtl);
int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
			u64 connection_partition_id, struct hv_port_info *port_info,
			u8 port_vtl, u8 min_connection_vtl, int node);
int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id);
int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
			 u64 connection_partition_id,
			 union hv_connection_id connection_id,
			 struct hv_connection_info *connection_info,
			 u8 connection_vtl, int node);
int hv_call_disconnect_port(u64 connection_partition_id,
			    union hv_connection_id connection_id);
int hv_call_notify_port_ring_empty(u32 sint_index);
int hv_call_map_stat_page(enum hv_stats_object_type type,
			  const union hv_stats_object_identity *identity,
			  void **addr);
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
			    const union hv_stats_object_identity *identity);
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
				   u64 page_struct_count, u32 host_access,
				   u32 flags, u8 acquire);

extern struct mshv_root mshv_root;
extern enum hv_scheduler_type hv_scheduler_type;
extern u8 * __percpu *hv_synic_eventring_tail;

#endif /* _MSHV_ROOT_H_ */
+849
drivers/hv/mshv_root_hv_call.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2023, Microsoft Corporation. 4 + * 5 + * Hypercall helper functions used by the mshv_root module. 6 + * 7 + * Authors: Microsoft Linux virtualization team 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/mm.h> 12 + #include <asm/mshyperv.h> 13 + 14 + #include "mshv_root.h" 15 + 16 + /* Determined empirically */ 17 + #define HV_INIT_PARTITION_DEPOSIT_PAGES 208 18 + #define HV_MAP_GPA_DEPOSIT_PAGES 256 19 + #define HV_UMAP_GPA_PAGES 512 20 + 21 + #define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1))) 22 + 23 + #define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64)) 24 + #define HV_MAP_GPA_BATCH_SIZE \ 25 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \ 26 + / sizeof(u64)) 27 + #define HV_GET_VP_STATE_BATCH_SIZE \ 28 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \ 29 + / sizeof(u64)) 30 + #define HV_SET_VP_STATE_BATCH_SIZE \ 31 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \ 32 + / sizeof(u64)) 33 + #define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \ 34 + ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \ 35 + / sizeof(union hv_gpa_page_access_state)) 36 + #define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \ 37 + ((HV_HYP_PAGE_SIZE - \ 38 + sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \ 39 + sizeof(u64)) 40 + 41 + int hv_call_withdraw_memory(u64 count, int node, u64 partition_id) 42 + { 43 + struct hv_input_withdraw_memory *input_page; 44 + struct hv_output_withdraw_memory *output_page; 45 + struct page *page; 46 + u16 completed; 47 + unsigned long remaining = count; 48 + u64 status; 49 + int i; 50 + unsigned long flags; 51 + 52 + page = alloc_page(GFP_KERNEL); 53 + if (!page) 54 + return -ENOMEM; 55 + output_page = page_address(page); 56 + 57 + while (remaining) { 58 + local_irq_save(flags); 59 + 60 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 61 + 62 + 
memset(input_page, 0, sizeof(*input_page)); 63 + input_page->partition_id = partition_id; 64 + status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY, 65 + min(remaining, HV_WITHDRAW_BATCH_SIZE), 66 + 0, input_page, output_page); 67 + 68 + local_irq_restore(flags); 69 + 70 + completed = hv_repcomp(status); 71 + 72 + for (i = 0; i < completed; i++) 73 + __free_page(pfn_to_page(output_page->gpa_page_list[i])); 74 + 75 + if (!hv_result_success(status)) { 76 + if (hv_result(status) == HV_STATUS_NO_RESOURCES) 77 + status = HV_STATUS_SUCCESS; 78 + break; 79 + } 80 + 81 + remaining -= completed; 82 + } 83 + free_page((unsigned long)output_page); 84 + 85 + return hv_result_to_errno(status); 86 + } 87 + 88 + int hv_call_create_partition(u64 flags, 89 + struct hv_partition_creation_properties creation_properties, 90 + union hv_partition_isolation_properties isolation_properties, 91 + u64 *partition_id) 92 + { 93 + struct hv_input_create_partition *input; 94 + struct hv_output_create_partition *output; 95 + u64 status; 96 + int ret; 97 + unsigned long irq_flags; 98 + 99 + do { 100 + local_irq_save(irq_flags); 101 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 102 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 103 + 104 + memset(input, 0, sizeof(*input)); 105 + input->flags = flags; 106 + input->compatibility_version = HV_COMPATIBILITY_21_H2; 107 + 108 + memcpy(&input->partition_creation_properties, &creation_properties, 109 + sizeof(creation_properties)); 110 + 111 + memcpy(&input->isolation_properties, &isolation_properties, 112 + sizeof(isolation_properties)); 113 + 114 + status = hv_do_hypercall(HVCALL_CREATE_PARTITION, 115 + input, output); 116 + 117 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 118 + if (hv_result_success(status)) 119 + *partition_id = output->partition_id; 120 + local_irq_restore(irq_flags); 121 + ret = hv_result_to_errno(status); 122 + break; 123 + } 124 + local_irq_restore(irq_flags); 125 + ret = hv_call_deposit_pages(NUMA_NO_NODE, 126 
+ hv_current_partition_id, 1); 127 + } while (!ret); 128 + 129 + return ret; 130 + } 131 + 132 + int hv_call_initialize_partition(u64 partition_id) 133 + { 134 + struct hv_input_initialize_partition input; 135 + u64 status; 136 + int ret; 137 + 138 + input.partition_id = partition_id; 139 + 140 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 141 + HV_INIT_PARTITION_DEPOSIT_PAGES); 142 + if (ret) 143 + return ret; 144 + 145 + do { 146 + status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, 147 + *(u64 *)&input); 148 + 149 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 150 + ret = hv_result_to_errno(status); 151 + break; 152 + } 153 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); 154 + } while (!ret); 155 + 156 + return ret; 157 + } 158 + 159 + int hv_call_finalize_partition(u64 partition_id) 160 + { 161 + struct hv_input_finalize_partition input; 162 + u64 status; 163 + 164 + input.partition_id = partition_id; 165 + status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION, 166 + *(u64 *)&input); 167 + 168 + return hv_result_to_errno(status); 169 + } 170 + 171 + int hv_call_delete_partition(u64 partition_id) 172 + { 173 + struct hv_input_delete_partition input; 174 + u64 status; 175 + 176 + input.partition_id = partition_id; 177 + status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input); 178 + 179 + return hv_result_to_errno(status); 180 + } 181 + 182 + /* Ask the hypervisor to map guest ram pages or the guest mmio space */ 183 + static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, 184 + u32 flags, struct page **pages, u64 mmio_spa) 185 + { 186 + struct hv_input_map_gpa_pages *input_page; 187 + u64 status, *pfnlist; 188 + unsigned long irq_flags, large_shift = 0; 189 + int ret = 0, done = 0; 190 + u64 page_count = page_struct_count; 191 + 192 + if (page_count == 0 || (pages && mmio_spa)) 193 + return -EINVAL; 194 + 195 + if (flags & HV_MAP_GPA_LARGE_PAGE) { 196 + if (mmio_spa) 197 + 
return -EINVAL; 198 + 199 + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) 200 + return -EINVAL; 201 + 202 + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; 203 + page_count >>= large_shift; 204 + } 205 + 206 + while (done < page_count) { 207 + ulong i, completed, remain = page_count - done; 208 + int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE); 209 + 210 + local_irq_save(irq_flags); 211 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 212 + 213 + input_page->target_partition_id = partition_id; 214 + input_page->target_gpa_base = gfn + (done << large_shift); 215 + input_page->map_flags = flags; 216 + pfnlist = input_page->source_gpa_page_list; 217 + 218 + for (i = 0; i < rep_count; i++) 219 + if (flags & HV_MAP_GPA_NO_ACCESS) { 220 + pfnlist[i] = 0; 221 + } else if (pages) { 222 + u64 index = (done + i) << large_shift; 223 + 224 + if (index >= page_struct_count) { 225 + ret = -EINVAL; 226 + break; 227 + } 228 + pfnlist[i] = page_to_pfn(pages[index]); 229 + } else { 230 + pfnlist[i] = mmio_spa + done + i; 231 + } 232 + if (ret) 233 + break; 234 + 235 + status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0, 236 + input_page, NULL); 237 + local_irq_restore(irq_flags); 238 + 239 + completed = hv_repcomp(status); 240 + 241 + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { 242 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 243 + HV_MAP_GPA_DEPOSIT_PAGES); 244 + if (ret) 245 + break; 246 + 247 + } else if (!hv_result_success(status)) { 248 + ret = hv_result_to_errno(status); 249 + break; 250 + } 251 + 252 + done += completed; 253 + } 254 + 255 + if (ret && done) { 256 + u32 unmap_flags = 0; 257 + 258 + if (flags & HV_MAP_GPA_LARGE_PAGE) 259 + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; 260 + hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags); 261 + } 262 + 263 + return ret; 264 + } 265 + 266 + /* Ask the hypervisor to map guest ram pages */ 267 + int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, 
268 + u32 flags, struct page **pages) 269 + { 270 + return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count, 271 + flags, pages, 0); 272 + } 273 + 274 + /* Ask the hypervisor to map guest mmio space */ 275 + int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs) 276 + { 277 + int i; 278 + u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE | 279 + HV_MAP_GPA_NOT_CACHED; 280 + 281 + for (i = 0; i < numpgs; i++) 282 + if (page_is_ram(mmio_spa + i)) 283 + return -EINVAL; 284 + 285 + return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL, 286 + mmio_spa); 287 + } 288 + 289 + int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k, 290 + u32 flags) 291 + { 292 + struct hv_input_unmap_gpa_pages *input_page; 293 + u64 status, page_count = page_count_4k; 294 + unsigned long irq_flags, large_shift = 0; 295 + int ret = 0, done = 0; 296 + 297 + if (page_count == 0) 298 + return -EINVAL; 299 + 300 + if (flags & HV_UNMAP_GPA_LARGE_PAGE) { 301 + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) 302 + return -EINVAL; 303 + 304 + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; 305 + page_count >>= large_shift; 306 + } 307 + 308 + while (done < page_count) { 309 + ulong completed, remain = page_count - done; 310 + int rep_count = min(remain, HV_UMAP_GPA_PAGES); 311 + 312 + local_irq_save(irq_flags); 313 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 314 + 315 + input_page->target_partition_id = partition_id; 316 + input_page->target_gpa_base = gfn + (done << large_shift); 317 + input_page->unmap_flags = flags; 318 + status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count, 319 + 0, input_page, NULL); 320 + local_irq_restore(irq_flags); 321 + 322 + completed = hv_repcomp(status); 323 + if (!hv_result_success(status)) { 324 + ret = hv_result_to_errno(status); 325 + break; 326 + } 327 + 328 + done += completed; 329 + } 330 + 331 + return ret; 332 + } 333 + 334 + int hv_call_get_gpa_access_states(u64 
partition_id, u32 count, u64 gpa_base_pfn,
				  union hv_gpa_page_access_state_flags state_flags,
				  int *written_total,
				  union hv_gpa_page_access_state *states)
{
	struct hv_input_get_gpa_pages_access_state *input_page;
	union hv_gpa_page_access_state *output_page;
	int completed = 0;
	unsigned long remaining = count;
	int rep_count, i;
	u64 status = 0;
	unsigned long flags;

	*written_total = 0;
	/*
	 * Batch the query into rep hypercalls of at most
	 * HV_GET_GPA_ACCESS_STATES_BATCH_SIZE pages each; stop early on the
	 * first unsuccessful status.
	 */
	while (remaining) {
		/* The per-cpu hypercall pages must be used with IRQs off */
		local_irq_save(flags);
		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
		output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);

		input_page->partition_id = partition_id;
		input_page->hv_gpa_page_number = gpa_base_pfn + *written_total;
		input_page->flags = state_flags;
		rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE);

		status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count,
					     0, input_page, output_page);
		if (!hv_result_success(status)) {
			local_irq_restore(flags);
			break;
		}
		/* Copy out the states completed by this rep hypercall */
		completed = hv_repcomp(status);
		for (i = 0; i < completed; ++i)
			states[i].as_uint8 = output_page[i].as_uint8;

		local_irq_restore(flags);
		states += completed;
		*written_total += completed;
		remaining -= completed;
	}

	return hv_result_to_errno(status);
}

/*
 * Assert (inject) a virtual interrupt into a partition. Single non-rep
 * hypercall; the per-cpu input page is used with IRQs disabled.
 */
int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
				     u64 dest_addr,
				     union hv_interrupt_control control)
{
	struct hv_input_assert_virtual_interrupt *input;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(input, 0, sizeof(*input));
	input->partition_id = partition_id;
	input->vector = vector;
	input->dest_addr = dest_addr;
	input->control = control;
	status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
	local_irq_restore(flags);

	return hv_result_to_errno(status);
}

/*
 * Delete a virtual processor from a partition. Uses the fast-hypercall
 * calling convention (arguments in registers, no memory pages needed).
 */
int hv_call_delete_vp(u64 partition_id, u32 vp_index)
{
	union hv_input_delete_vp input = {};
	u64 status;

	input.partition_id = partition_id;
	input.vp_index = vp_index;

	status = hv_do_fast_hypercall16(HVCALL_DELETE_VP,
					input.as_uint64[0], input.as_uint64[1]);

	return hv_result_to_errno(status);
}
EXPORT_SYMBOL_GPL(hv_call_delete_vp);

/*
 * Get VP state from the hypervisor, either into caller-supplied pages (the
 * hypervisor writes directly to their PFNs) or into *ret_output.
 * On HV_STATUS_INSUFFICIENT_MEMORY, deposit one page into the partition's
 * memory pool and retry until success or a real error.
 */
int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
			 struct hv_vp_state_data state_data,
			 /* Choose between pages and ret_output */
			 u64 page_count, struct page **pages,
			 union hv_output_get_vp_state *ret_output)
{
	struct hv_input_get_vp_state *input;
	union hv_output_get_vp_state *output;
	u64 status;
	int i;
	u64 control;
	unsigned long flags;
	int ret = 0;

	if (page_count > HV_GET_VP_STATE_BATCH_SIZE)
		return -EINVAL;

	/* Caller must provide at least one destination */
	if (!page_count && !ret_output)
		return -EINVAL;

	do {
		local_irq_save(flags);
		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
		output = *this_cpu_ptr(hyperv_pcpu_output_arg);
		memset(input, 0, sizeof(*input));
		memset(output, 0, sizeof(*output));

		input->partition_id = partition_id;
		input->vp_index = vp_index;
		input->state_data = state_data;
		for (i = 0; i < page_count; i++)
			input->output_data_pfns[i] = page_to_pfn(pages[i]);

		/* Variable-size header: PFN list length encoded in control */
		control = (HVCALL_GET_VP_STATE) |
			  (page_count << HV_HYPERCALL_VARHEAD_OFFSET);

		status = hv_do_hypercall(control, input, output);

		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
			if (hv_result_success(status) && ret_output)
				memcpy(ret_output, output, sizeof(*output));

			local_irq_restore(flags);
			ret = hv_result_to_errno(status);
			break;
		}

		local_irq_restore(flags);

		/* Out of pool memory: deposit a page and retry */
		ret = hv_call_deposit_pages(NUMA_NO_NODE,
					    partition_id, 1);
	} while (!ret);

	return ret;
}

/*
 * Set VP state in the hypervisor, either from caller-supplied pages or from
 * an inline byte buffer (mutually exclusive). Same deposit-and-retry scheme
 * as hv_call_get_vp_state() on HV_STATUS_INSUFFICIENT_MEMORY.
 */
int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
			 /* Choose between pages and bytes */
			 struct hv_vp_state_data state_data, u64 page_count,
			 struct page **pages, u32 num_bytes, u8 *bytes)
{
	struct hv_input_set_vp_state *input;
	u64 status;
	int i;
	u64 control;
	unsigned long flags;
	int ret = 0;
	u16 varhead_sz;

	if (page_count > HV_SET_VP_STATE_BATCH_SIZE)
		return -EINVAL;
	/* Inline data plus the fixed header must fit in one hypercall page */
	if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	/* varhead_sz is counted in 8-byte units */
	if (num_bytes)
		/* round up to 8 and divide by 8 */
		varhead_sz = (num_bytes + 7) >> 3;
	else if (page_count)
		varhead_sz = page_count;
	else
		return -EINVAL;

	do {
		local_irq_save(flags);
		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
		memset(input, 0, sizeof(*input));

		input->partition_id = partition_id;
		input->vp_index = vp_index;
		input->state_data = state_data;
		if (num_bytes) {
			memcpy((u8 *)input->data, bytes, num_bytes);
		} else {
			for (i = 0; i < page_count; i++)
				input->data[i].pfns = page_to_pfn(pages[i]);
		}

		control = (HVCALL_SET_VP_STATE) |
			  (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET);

		status = hv_do_hypercall(control, input, NULL);

		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
			local_irq_restore(flags);
			ret = hv_result_to_errno(status);
			break;
		}
		local_irq_restore(flags);

		ret = hv_call_deposit_pages(NUMA_NO_NODE,
					    partition_id, 1);
	} while (!ret);

	return ret;
}

/*
 * Map a VP state page (of the given type) belonging to (partition, vp) into
 * the root partition; on success *state_page is the mapped struct page.
 * Deposit-and-retry on HV_STATUS_INSUFFICIENT_MEMORY.
 *
 * NOTE(review): unlike the other wrappers here, the per-cpu input page is not
 * memset() before the fields are filled in — confirm every field of
 * struct hv_input_map_vp_state_page is assigned (reserved fields included).
 */
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
			      union hv_input_vtl input_vtl,
			      struct page **state_page)
{
	struct hv_input_map_vp_state_page *input;
	struct hv_output_map_vp_state_page *output;
	u64 status;
	int ret;
	unsigned long flags;

	do {
		local_irq_save(flags);

		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
		output = *this_cpu_ptr(hyperv_pcpu_output_arg);

		input->partition_id = partition_id;
		input->vp_index = vp_index;
		input->type = type;
		input->input_vtl = input_vtl;

		status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output);

		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
			if (hv_result_success(status))
				*state_page = pfn_to_page(output->map_location);
			local_irq_restore(flags);
			ret = hv_result_to_errno(status);
			break;
		}

		local_irq_restore(flags);

		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
	} while (!ret);

	return ret;
}

/* Undo hv_call_map_vp_state_page() for the given (partition, vp, type). */
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
				union hv_input_vtl input_vtl)
{
	unsigned long flags;
	u64 status;
	struct hv_input_unmap_vp_state_page *input;

	local_irq_save(flags);

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);

	memset(input, 0, sizeof(*input));

	input->partition_id = partition_id;
	input->vp_index = vp_index;
	input->type = type;
	input->input_vtl = input_vtl;

	status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL);

	local_irq_restore(flags);

	return hv_result_to_errno(status);
}

/*
 * Clear a pending virtual interrupt for the partition (fast hypercall).
 *
 * NOTE(review): status is declared int while the sibling wrappers use u64;
 * the 16-bit HV status code survives the truncation, but u64 would be
 * consistent — confirm and consider changing.
 */
int
hv_call_clear_virtual_interrupt(u64 partition_id)
{
	int status;

	status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT,
				       partition_id);

	return hv_result_to_errno(status);
}

/*
 * Create a port owned by port_partition_id and targeted at
 * connection_partition_id. Deposits a page into the port owner's pool and
 * retries when the hypervisor reports insufficient memory.
 */
int
hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
		    u64 connection_partition_id,
		    struct hv_port_info *port_info,
		    u8 port_vtl, u8 min_connection_vtl, int node)
{
	struct hv_input_create_port *input;
	unsigned long flags;
	int ret = 0;
	int status;

	do {
		local_irq_save(flags);
		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
		memset(input, 0, sizeof(*input));

		input->port_partition_id = port_partition_id;
		input->port_id = port_id;
		input->connection_partition_id = connection_partition_id;
		input->port_info = *port_info;
		input->port_vtl = port_vtl;
		input->min_connection_vtl = min_connection_vtl;
		input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
		status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL);
		local_irq_restore(flags);
		if (hv_result_success(status))
			break;

		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
			ret = hv_result_to_errno(status);
			break;
		}
		ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1);

	} while (!ret);

	return ret;
}

/* Delete a previously created port (fast hypercall). */
int
hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id)
{
	union hv_input_delete_port input = { 0 };
	int status;

	input.port_partition_id = port_partition_id;
	input.port_id = port_id;
	status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT,
					input.as_uint64[0],
					input.as_uint64[1]);

	return hv_result_to_errno(status);
}

/*
 * Connect connection_partition_id to an existing port. On insufficient
 * memory the deposit goes to the *connecting* partition's pool (cf.
 * hv_call_create_port(), which deposits to the port owner).
 */
int
hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
		     u64 connection_partition_id,
		     union hv_connection_id connection_id,
		     struct hv_connection_info *connection_info,
		     u8 connection_vtl, int node)
{
	struct hv_input_connect_port *input;
	unsigned long flags;
	int ret = 0, status;

	do {
		local_irq_save(flags);
		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
		memset(input, 0, sizeof(*input));
		input->port_partition_id = port_partition_id;
		input->port_id = port_id;
		input->connection_partition_id = connection_partition_id;
		input->connection_id = connection_id;
		input->connection_info = *connection_info;
		input->connection_vtl = connection_vtl;
		input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
		status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL);

		local_irq_restore(flags);
		if (hv_result_success(status))
			break;

		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
			ret = hv_result_to_errno(status);
			break;
		}
		ret = hv_call_deposit_pages(NUMA_NO_NODE,
					    connection_partition_id, 1);
	} while (!ret);

	return ret;
}

/*
 * Disconnect a (doorbell) connection. is_doorbell is hard-wired to 1 here;
 * presumably only doorbell connections are torn down via this path — confirm
 * against callers.
 */
int
hv_call_disconnect_port(u64 connection_partition_id,
			union hv_connection_id connection_id)
{
	union hv_input_disconnect_port input = { 0 };
	int status;

	input.connection_partition_id = connection_partition_id;
	input.connection_id = connection_id;
	input.is_doorbell = 1;
	status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT,
					input.as_uint64[0],
					input.as_uint64[1]);

	return hv_result_to_errno(status);
}

/* Tell the hypervisor the ring for the given SINT has been drained. */
int
hv_call_notify_port_ring_empty(u32 sint_index)
{
	union hv_input_notify_port_ring_empty input = { 0 };
	int status;

	input.sint_index = sint_index;
	status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY,
				       input.as_uint64);

	return hv_result_to_errno(status);
}

/*
 * Map a hypervisor statistics page for the given object; on success *addr is
 * its kernel virtual address. Deposits go to the root partition's own pool
 * (hv_current_partition_id), since the stats object belongs to the root.
 */
int hv_call_map_stat_page(enum hv_stats_object_type type,
			  const union hv_stats_object_identity *identity,
			  void **addr)
{
	unsigned long flags;
	struct hv_input_map_stats_page *input;
	struct hv_output_map_stats_page *output;
	u64 status, pfn;
	int ret = 0;

	do {
		local_irq_save(flags);
		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
		output = *this_cpu_ptr(hyperv_pcpu_output_arg);

		memset(input, 0, sizeof(*input));
		input->type = type;
		input->identity = *identity;

		status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output);
		/* Snapshot the PFN before IRQs (and the percpu page) are released */
		pfn = output->map_location;

		local_irq_restore(flags);
		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
			ret = hv_result_to_errno(status);
			if (hv_result_success(status))
				break;
			return ret;
		}

		ret = hv_call_deposit_pages(NUMA_NO_NODE,
					    hv_current_partition_id, 1);
		if (ret)
			return ret;
	} while (!ret);

	/* Only reached on success; pfn is from the successful iteration */
	*addr = page_address(pfn_to_page(pfn));

	return ret;
}

/* Undo hv_call_map_stat_page() for the given stats object. */
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
			    const union hv_stats_object_identity *identity)
{
	unsigned long flags;
	struct hv_input_unmap_stats_page *input;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);

	memset(input, 0, sizeof(*input));
	input->type = type;
	input->identity = *identity;

	status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL);
	local_irq_restore(flags);

	return hv_result_to_errno(status);
}

/*
 * Acquire or release host access to a set of sparse SPA pages, in rep
 * batches. With the LARGE_PAGE flag, page_struct_count (4K struct pages)
 * must be 2M-aligned and the count is converted to large-page units.
 *
 * NOTE(review): the "index >= page_struct_count" early return inside the
 * rep-fill loop executes after local_irq_save() but without
 * local_irq_restore() — it would leave IRQs disabled. Confirm whether this
 * path is reachable and restore IRQs before returning.
 */
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
				   u64 page_struct_count, u32 host_access,
				   u32 flags, u8 acquire)
{
	struct hv_input_modify_sparse_spa_page_host_access *input_page;
	u64 status;
	int done = 0;
	unsigned long irq_flags, large_shift = 0;
	u64 page_count = page_struct_count;
	u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS :
			     HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS;

	if (page_count == 0)
		return -EINVAL;

	if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) {
		if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
			return -EINVAL;
		large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
		page_count >>= large_shift;
	}

	while (done < page_count) {
		ulong i, completed, remain = page_count - done;
		int rep_count = min(remain,
				    HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);

		local_irq_save(irq_flags);
		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);

		memset(input_page, 0, sizeof(*input_page));
		/* Only set the partition id if you are making the pages
		 * exclusive
		 */
		if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE)
			input_page->partition_id = partition_id;
		input_page->flags = flags;
		input_page->host_access = host_access;

		for (i = 0; i < rep_count; i++) {
			u64 index = (done + i) << large_shift;

			if (index >= page_struct_count)
				return -EINVAL;

			input_page->spa_page_list[i] =
				page_to_pfn(pages[index]);
		}

		status = hv_do_rep_hypercall(code, rep_count, 0, input_page,
					     NULL);
		local_irq_restore(irq_flags);

		completed = hv_repcomp(status);

		if (!hv_result_success(status))
			return hv_result_to_errno(status);

		done += completed;
	}

	return 0;
}
+2307
drivers/hv/mshv_root_main.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, Microsoft Corporation.
 *
 * The main part of the mshv_root module, providing APIs to create
 * and manage guest partitions.
 *
 * Authors: Microsoft Linux virtualization team
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/cpuhotplug.h>
#include <linux/random.h>
#include <asm/mshyperv.h>
#include <linux/hyperv.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/page-flags.h>
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");

/* TODO move this to mshyperv.h when needed outside driver */
static inline bool hv_parent_partition(void)
{
	return hv_root_partition();
}

/* TODO move this to another file when debugfs code is added */
/*
 * Per-VP counter indices in the hypervisor stats page; the index of
 * VpRootDispatchThreadBlocked differs between x86 and arm64.
 */
enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
	VpRootDispatchThreadBlocked = 201,
#elif defined(CONFIG_ARM64)
	VpRootDispatchThreadBlocked = 94,
#endif
	VpStatsMaxCounter
};

/* One hypervisor statistics page, viewed as counters or raw bytes */
struct hv_stats_page {
	union {
		u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */
		u8 data[HV_HYP_PAGE_SIZE];
	};
} __packed;

struct mshv_root mshv_root;

enum hv_scheduler_type hv_scheduler_type;

/* Once we implement the fast extended hypercall ABI they can go away. */
static void * __percpu *root_scheduler_input;
static void * __percpu *root_scheduler_output;

static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_dev_open(struct inode *inode, struct file *filp);
static int mshv_dev_release(struct inode *inode, struct file *filp);
static int mshv_vp_release(struct inode *inode, struct file *filp);
static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_partition_release(struct inode *inode, struct file *filp);
static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
static int mshv_init_async_handler(struct mshv_partition *partition);
static void mshv_async_hvcall_handler(void *data, u64 *status);

/* Zero-initialized: targets VTL0 without the use_target_vtl bit set */
static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.target_vtl = HV_NORMAL_VTL,
	.use_target_vtl = 1,
};

static const struct vm_operations_struct mshv_vp_vm_ops = {
	.fault = mshv_vp_fault,
};

static const struct file_operations mshv_vp_fops = {
	.owner = THIS_MODULE,
	.release = mshv_vp_release,
	.unlocked_ioctl = mshv_vp_ioctl,
	.llseek = noop_llseek,
	.mmap = mshv_vp_mmap,
};

static const struct file_operations mshv_partition_fops = {
	.owner = THIS_MODULE,
	.release = mshv_partition_release,
	.unlocked_ioctl = mshv_partition_ioctl,
	.llseek = noop_llseek,
};

static const struct file_operations mshv_dev_fops = {
	.owner = THIS_MODULE,
	.open = mshv_dev_open,
	.release = mshv_dev_release,
	.unlocked_ioctl = mshv_dev_ioctl,
	.llseek = noop_llseek,
};

/* /dev/mshv itself; 0600 so only root may issue partition hypercalls */
static struct miscdevice mshv_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "mshv",
	.fops = &mshv_dev_fops,
	.mode = 0600,
};

/*
 * Only allow hypercalls that have a u64 partition id as the first member of
 * the input structure.
 * These are sorted by value.
 */
static u16 mshv_passthru_hvcalls[] = {
	HVCALL_GET_PARTITION_PROPERTY,
	HVCALL_SET_PARTITION_PROPERTY,
	HVCALL_INSTALL_INTERCEPT,
	HVCALL_GET_VP_REGISTERS,
	HVCALL_SET_VP_REGISTERS,
	HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
	HVCALL_CLEAR_VIRTUAL_INTERRUPT,
	HVCALL_REGISTER_INTERCEPT_RESULT,
	HVCALL_ASSERT_VIRTUAL_INTERRUPT,
	HVCALL_GET_GPA_PAGES_ACCESS_STATES,
	HVCALL_SIGNAL_EVENT_DIRECT,
	HVCALL_POST_MESSAGE_DIRECT,
	HVCALL_GET_VP_CPUID_VALUES,
};

/* Passthru hypercalls that may return HV_STATUS_CALL_PENDING */
static bool mshv_hvcall_is_async(u16 code)
{
	switch (code) {
	case HVCALL_SET_PARTITION_PROPERTY:
		return true;
	default:
		break;
	}
	return false;
}

/*
 * MSHV_ROOT_HVCALL: forward an allow-listed hypercall from the VMM to the
 * hypervisor on behalf of @partition. The kernel overwrites the first u64 of
 * the user-supplied input with the partition id, so the VMM cannot target a
 * partition it does not own. Status and output are copied back even on
 * failure so the VMM can see partial rep completion.
 */
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
				      bool partition_locked,
				      void __user *user_args)
{
	u64 status;
	int ret = 0, i;
	bool is_async;
	struct mshv_root_hvcall args;
	struct page *page;
	unsigned int pages_order;
	void *input_pg = NULL;
	void *output_pg = NULL;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* in_sz must at least hold the leading u64 partition id */
	if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
	    mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
		return -EINVAL;

	/* Reject hypercalls not on the allow-list */
	for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
		if (args.code == mshv_passthru_hvcalls[i])
			break;

	if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
		return -EINVAL;

	is_async = mshv_hvcall_is_async(args.code);
	if (is_async) {
		/* async hypercalls can only be called from partition fd */
		if (!partition_locked)
			return -EINVAL;
		ret = mshv_init_async_handler(partition);
		if (ret)
			return ret;
	}

	/* One page for input, plus a second for output if requested */
	pages_order = args.out_ptr ? 1 : 0;
	page = alloc_pages(GFP_KERNEL, pages_order);
	if (!page)
		return -ENOMEM;
	input_pg = page_address(page);

	if (args.out_ptr)
		output_pg = (char *)input_pg + PAGE_SIZE;
	else
		output_pg = NULL;

	if (copy_from_user(input_pg, (void __user *)args.in_ptr,
			   args.in_sz)) {
		ret = -EFAULT;
		goto free_pages_out;
	}

	/*
	 * NOTE: This only works because all the allowed hypercalls' input
	 * structs begin with a u64 partition_id field.
	 */
	*(u64 *)input_pg = partition->pt_id;

	if (args.reps)
		status = hv_do_rep_hypercall(args.code, args.reps, 0,
					     input_pg, output_pg);
	else
		status = hv_do_hypercall(args.code, input_pg, output_pg);

	if (hv_result(status) == HV_STATUS_CALL_PENDING) {
		if (is_async) {
			mshv_async_hvcall_handler(partition, &status);
		} else { /* Paranoia check. This shouldn't happen! */
			ret = -EBADFD;
			goto free_pages_out;
		}
	}

	if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
		/* Deposit a page and let the VMM retry the call */
		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
		if (!ret)
			ret = -EAGAIN;
	} else if (!hv_result_success(status)) {
		ret = hv_result_to_errno(status);
	}

	/*
	 * Always return the status and output data regardless of result.
	 * The VMM may need it to determine how to proceed. E.g. the status may
	 * contain the number of reps completed if a rep hypercall partially
	 * succeeded.
	 */
	args.status = hv_result(status);
	args.reps = args.reps ? hv_repcomp(status) : 0;
	if (copy_to_user(user_args, &args, sizeof(args)))
		ret = -EFAULT;

	if (output_pg &&
	    copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
		ret = -EFAULT;

free_pages_out:
	free_pages((unsigned long)input_pg, pages_order);

	return ret;
}

/* GHCB root mapping is an x86-64-only extended feature */
static inline bool is_ghcb_mapping_available(void)
{
#if IS_ENABLED(CONFIG_X86_64)
	return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
#else
	return 0;
#endif
}

/* Thin wrapper: get VP registers at VTL0 (input_vtl_zero) */
static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_get_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/* Thin wrapper: set VP registers at VTL0 (input_vtl_zero) */
static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
				 struct hv_register_assoc *registers)
{
	return hv_call_set_vp_registers(vp_index, partition_id,
					count, input_vtl_zero, registers);
}

/*
 * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
 * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
 * done by the hypervisor.
 * "Intercept" suspend leads to asynchronous message delivery to dom0 which
 * should be awaited to keep the VP loop consistent (i.e. no message pending
 * upon VP resume).
 * VP intercept suspend can't be done when the VP is explicitly suspended
 * already, and thus can be only two possible race scenarios:
 *   1. implicit suspend bit set -> explicit suspend bit set -> message sent
 *   2. implicit suspend bit set -> message sent -> explicit suspend bit set
 * Checking for implicit suspend bit set after explicit suspend request has
 * succeeded in either case allows us to reliably identify, if there is a
 * message to receive and deliver to VMM.
 */
static int
mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND
	};
	struct hv_register_assoc intercept_suspend = {
		.name = HV_REGISTER_INTERCEPT_SUSPEND
	};
	union hv_explicit_suspend_register *es =
		&explicit_suspend.value.explicit_suspend;
	union hv_intercept_suspend_register *is =
		&intercept_suspend.value.intercept_suspend;
	int ret;

	es->suspended = 1;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);
	if (ret) {
		vp_err(vp, "Failed to explicitly suspend vCPU\n");
		return ret;
	}

	/* See the race discussion above: read intercept state *after* suspend */
	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &intercept_suspend);
	if (ret) {
		vp_err(vp, "Failed to get intercept suspend state\n");
		return ret;
	}

	*message_in_flight = is->suspended;

	return 0;
}

/*
 * This function is used when VPs are scheduled by the hypervisor's
 * scheduler.
 *
 * Caller has to make sure the registers contain cleared
 * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
 * exactly in this order (the hypervisor clears them sequentially) to avoid
 * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
 * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
 * opposite order.
 */
static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
{
	long ret;
	struct hv_register_assoc suspend_regs[2] = {
		{ .name = HV_REGISTER_INTERCEPT_SUSPEND },
		{ .name = HV_REGISTER_EXPLICIT_SUSPEND }
	};
	size_t count = ARRAY_SIZE(suspend_regs);

	/* Resume VP execution */
	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    count, suspend_regs);
	if (ret) {
		vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
		return ret;
	}

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       vp->run.kicked_by_hv == 1);
	if (ret) {
		bool message_in_flight;

		/*
		 * Otherwise the waiting was interrupted by a signal: suspend
		 * the vCPU explicitly and copy message in flight (if any).
		 */
		ret = mshv_suspend_vp(vp, &message_in_flight);
		if (ret)
			return ret;

		/* Return if no message in flight */
		if (!message_in_flight)
			return -EINTR;

		/* Wait for the message in flight. */
		wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
	}

	/*
	 * Reset the flag to make the wait_event call above work
	 * next time.
	 */
	vp->run.kicked_by_hv = 0;

	return 0;
}

/*
 * Issue HVCALL_DISPATCH_VP to run @vp on this CPU (root scheduler mode).
 * Runs with preemption disabled because the dedicated per-cpu scheduler
 * input/output pages are used; the dispatch result is copied to *res.
 */
static int
mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
		 struct hv_output_dispatch_vp *res)
{
	struct hv_input_dispatch_vp *input;
	struct hv_output_dispatch_vp *output;
	u64 status;

	preempt_disable();
	input = *this_cpu_ptr(root_scheduler_input);
	output = *this_cpu_ptr(root_scheduler_output);

	memset(input, 0, sizeof(*input));
	memset(output, 0, sizeof(*output));

	input->partition_id = vp->vp_partition->pt_id;
	input->vp_index = vp->vp_index;
	input->time_slice = 0; /* Run forever until something happens */
	input->spec_ctrl = 0; /* TODO: set sensible flags */
	input->flags = flags;

	/* Mark the window in which the VP is actually on the CPU */
	vp->run.flags.root_sched_dispatched = 1;
	status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
	vp->run.flags.root_sched_dispatched = 0;

	*res = *output;
	preempt_enable();

	if (!hv_result_success(status))
		vp_err(vp, "%s: status %s\n", __func__,
		       hv_result_to_string(status));

	return hv_result_to_errno(status);
}

/* Clear the explicit-suspend register so the VP may be dispatched again */
static int
mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
{
	struct hv_register_assoc explicit_suspend = {
		.name = HV_REGISTER_EXPLICIT_SUSPEND,
		.value.explicit_suspend.suspended = 0,
	};
	int ret;

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &explicit_suspend);

	if (ret)
		vp_err(vp, "Failed to unsuspend\n");

	return ret;
}

#if IS_ENABLED(CONFIG_X86_64)
/* Pending interrupt vectors from the mapped VP register page (x86 only) */
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	if (!vp->vp_register_page)
		return 0;
	return vp->vp_register_page->interrupt_vectors.as_uint64;
}
#else
static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
{
	return 0;
}
#endif

/*
 * True if the hypervisor reports this VP's root dispatch thread as blocked,
 * checking the "self" stats area first, then the parent's.
 *
 * NOTE(review): returns bool but the taken branch returns a u64 counter
 * value (truncated to bool); the preceding if makes that branch equivalent
 * to returning true — consider simplifying.
 */
static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
{
	struct hv_stats_page **stats = vp->vp_stats_pages;
	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;

	if (self_vp_cntrs[VpRootDispatchThreadBlocked])
		return self_vp_cntrs[VpRootDispatchThreadBlocked];
	return parent_vp_cntrs[VpRootDispatchThreadBlocked];
}

/*
 * Sleep until the hypervisor kicks this VP (and its dispatch thread is no
 * longer blocked) or an interrupt is pending; interruptible by signals
 * (-EINTR). Clears the blocked/kicked bookkeeping before returning.
 */
static int
mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
{
	int ret;

	ret = wait_event_interruptible(vp->run.vp_suspend_queue,
				       (vp->run.kicked_by_hv == 1 &&
					!mshv_vp_dispatch_thread_blocked(vp)) ||
				       mshv_vp_interrupt_pending(vp));
	if (ret)
		return -EINTR;

	vp->run.flags.root_sched_blocked = 0;
	vp->run.kicked_by_hv = 0;

	return 0;
}

/*
 * Drain pending thread work (signals, reschedule, notify-resume) before
 * entering guest mode, re-checking the flags after each pass.
 */
static int mshv_pre_guest_mode_work(struct mshv_vp *vp)
{
	const ulong work_flags = _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING |
				 _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME;
	ulong th_flags;

	th_flags = read_thread_flags();
	while (th_flags & work_flags) {
		int ret;

		/* nb: following will call schedule */
		ret = mshv_do_pre_guest_mode_work(th_flags);

		if (ret)
			return ret;

		th_flags = read_thread_flags();
	}

	return 0;
}

/* Must be called with interrupts enabled */
static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
{
	long ret;

	if (vp->run.flags.root_sched_blocked) {
		/*
		 * Dispatch state of this VP is blocked. Need to wait
		 * for the hypervisor to clear the blocked state before
		 * dispatching it.
		 */
		ret = mshv_vp_wait_for_hv_kick(vp);
		if (ret)
			return ret;
	}

	/* Dispatch repeatedly until the VP suspends on an intercept */
	do {
		u32 flags = 0;
		struct hv_output_dispatch_vp output;

		ret = mshv_pre_guest_mode_work(vp);
		if (ret)
			break;

		if (vp->run.flags.intercept_suspend)
			flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;

		if (mshv_vp_interrupt_pending(vp))
			flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;

		ret = mshv_vp_dispatch(vp, flags, &output);
		if (ret)
			break;

		vp->run.flags.intercept_suspend = 0;

		if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_SUSPEND) {
				/*
				 * TODO: remove the warning once VP canceling
				 * is supported
				 */
				WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
					  "%s: vp#%d: unexpected explicit suspend\n",
					  __func__, vp->vp_index);
				/*
				 * Need to clear explicit suspend before
				 * dispatching.
				 * Explicit suspend is either:
				 * - set right after the first VP dispatch or
				 * - set explicitly via hypercall
				 * Since the latter case is not yet supported,
				 * simply clear it here.
				 */
				ret = mshv_vp_clear_explicit_suspend(vp);
				if (ret)
					break;

				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			} else {
				vp->run.flags.root_sched_blocked = 1;
				ret = mshv_vp_wait_for_hv_kick(vp);
				if (ret)
					break;
			}
		} else {
			/* HV_VP_DISPATCH_STATE_READY */
			if (output.dispatch_event ==
			    HV_VP_DISPATCH_EVENT_INTERCEPT)
				vp->run.flags.intercept_suspend = 1;
		}
	} while (!vp->run.flags.intercept_suspend);

	return ret;
}

static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

/*
 * MSHV_RUN_VP: run the VP under whichever scheduler is active, then copy the
 * intercept message page back to the VMM.
 */
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
	long rc;

	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
		rc = mshv_run_vp_with_root_scheduler(vp);
	else
		rc = mshv_run_vp_with_hyp_scheduler(vp);

	if (rc)
		return rc;

	if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
			 sizeof(struct hv_message)))
		rc = -EFAULT;

	return rc;
}

/*
 * Get/set VP state through a page-aligned user buffer: validate the range,
 * pin the user pages, and hand their PFNs to the hypervisor so it can
 * read/write them directly.
 */
static int
mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
				struct hv_vp_state_data state_data,
				unsigned long user_pfn, size_t page_count,
				bool is_set)
{
	int completed, ret = 0;
	unsigned long check;
	struct page **pages;

	if (page_count > INT_MAX)
		return -EINVAL;
	/*
	 * Check the arithmetic for wraparound/overflow.
	 * The last page address in the buffer is:
	 * (user_pfn + (page_count - 1)) * PAGE_SIZE
	 */
	if (check_add_overflow(user_pfn, (page_count - 1), &check))
		return -EOVERFLOW;
	if (check_mul_overflow(check, PAGE_SIZE, &check))
		return -EOVERFLOW;

	/* Pin user pages so hypervisor can copy directly to them */
	pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* pin_user_pages_fast may pin fewer than asked; loop until done */
	for (completed = 0; completed < page_count; completed += ret) {
		unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
		int remaining = page_count - completed;

		ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
					  &pages[completed]);
		if (ret < 0) {
			vp_err(vp, "%s: Failed to pin user pages error %i\n",
			       __func__, ret);
			goto unpin_pages;
		}
	}

	if (is_set)
		ret = hv_call_set_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   0, NULL);
	else
		ret = hv_call_get_vp_state(vp->vp_index,
					   vp->vp_partition->pt_id,
					   state_data, page_count, pages,
					   NULL);

unpin_pages:
	/* 'completed' counts only successfully pinned pages */
	unpin_user_pages(pages, completed);
	kfree(pages);
	return ret;
}

/*
 * MSHV_GET_VP_STATE / MSHV_SET_VP_STATE: translate the uapi state type to
 * the hypervisor's, report the required buffer size back to the VMM, and
 * either go through pinned user pages (PFN-typed states) or bounce through
 * a stack union.
 */
static long
mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
			    struct mshv_get_set_vp_state __user *user_args,
			    bool is_set)
{
	struct mshv_get_set_vp_state args;
	long ret = 0;
	union hv_output_get_vp_state vp_state;
	u32 data_sz;
	struct hv_vp_state_data state_data = {};

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
	    !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
	    !PAGE_ALIGNED(args.buf_ptr))
		return -EINVAL;

	if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
		return -EFAULT;

	switch (args.type) {
	case MSHV_VP_STATE_LAPIC:
		state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_XSAVE:
	{
		u64 data_sz_64;

		/* XSAVE size/feature set are partition properties */
		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_XSAVE_STATES,
						     &state_data.xsave.states.as_uint64);
		if (ret)
			return ret;

		ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
						     HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
						     &data_sz_64);
		if (ret)
			return ret;

		data_sz = (u32)data_sz_64;
		state_data.xsave.flags = 0;
		/* Always request legacy states */
		state_data.xsave.states.legacy_x87 = 1;
		state_data.xsave.states.legacy_sse = 1;
		state_data.type = HV_GET_SET_VP_STATE_XSAVE;
		break;
	}
	case MSHV_VP_STATE_SIMP:
		state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SIEFP:
		state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
		data_sz = HV_HYP_PAGE_SIZE;
		break;
	case MSHV_VP_STATE_SYNTHETIC_TIMERS:
		state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
		data_sz = sizeof(vp_state.synthetic_timers_state);
		break;
	default:
		return -EINVAL;
	}

	/* Report the required size even when the caller's buffer is too small */
	if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
		return -EFAULT;

	if (data_sz > args.buf_sz)
		return -EINVAL;

	/* If the data is transmitted via pfns, delegate to helper */
	if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
		unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
		size_t page_count = PFN_DOWN(args.buf_sz);

		return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
						       page_count, is_set);
	}

	/* Paranoia check - this shouldn't happen! */
	if (data_sz > sizeof(vp_state)) {
		vp_err(vp, "Invalid vp state data size!\n");
		return -EINVAL;
	}

	if (is_set) {
		if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
			return -EFAULT;

		/*
		 * NOTE(review): sizeof(vp_state) bytes are handed to the
		 * hypervisor while only data_sz were copied from the user;
		 * the trailing stack bytes are uninitialized — confirm this
		 * is intended (vp_state is not zeroed above).
		 */
		return hv_call_set_vp_state(vp->vp_index,
					    vp->vp_partition->pt_id,
					    state_data, 0, NULL,
					    sizeof(vp_state), (u8 *)&vp_state);
	}

	ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
				   state_data, 0, NULL, &vp_state);
	if (ret)
		return ret;

	if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
		return -EFAULT;

	return 0;
}

/* ioctl dispatch for a VP fd; serialized by the per-VP mutex */
static long
mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_vp *vp = filp->private_data;
	long r = -ENOTTY;

	if (mutex_lock_killable(&vp->vp_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_RUN_VP:
		r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
		break;
	case MSHV_GET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
		break;
	case MSHV_SET_VP_STATE:
		r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
		break;
	case MSHV_ROOT_HVCALL:
		/* partition_locked=false: async hypercalls rejected here */
		r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
					       (void __user *)arg);
		break;
	default:
		vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
		break;
	}
	mutex_unlock(&vp->vp_mutex);

	return r;
}

/* mmap fault handler: back each page offset with the matching VP page */
static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
{
	struct mshv_vp *vp = vmf->vma->vm_file->private_data;

	switch (vmf->vma->vm_pgoff) {
	case MSHV_VP_MMAP_OFFSET_REGISTERS:
		vmf->page = virt_to_page(vp->vp_register_page);
		break;
	case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
		vmf->page = virt_to_page(vp->vp_intercept_msg_page);
		break;
	case
MSHV_VP_MMAP_OFFSET_GHCB: 823 + vmf->page = virt_to_page(vp->vp_ghcb_page); 824 + break; 825 + default: 826 + return VM_FAULT_SIGBUS; 827 + } 828 + 829 + get_page(vmf->page); 830 + 831 + return 0; 832 + } 833 + 834 + static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) 835 + { 836 + struct mshv_vp *vp = file->private_data; 837 + 838 + switch (vma->vm_pgoff) { 839 + case MSHV_VP_MMAP_OFFSET_REGISTERS: 840 + if (!vp->vp_register_page) 841 + return -ENODEV; 842 + break; 843 + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 844 + if (!vp->vp_intercept_msg_page) 845 + return -ENODEV; 846 + break; 847 + case MSHV_VP_MMAP_OFFSET_GHCB: 848 + if (!vp->vp_ghcb_page) 849 + return -ENODEV; 850 + break; 851 + default: 852 + return -EINVAL; 853 + } 854 + 855 + vma->vm_ops = &mshv_vp_vm_ops; 856 + return 0; 857 + } 858 + 859 + static int 860 + mshv_vp_release(struct inode *inode, struct file *filp) 861 + { 862 + struct mshv_vp *vp = filp->private_data; 863 + 864 + /* Rest of VP cleanup happens in destroy_partition() */ 865 + mshv_partition_put(vp->vp_partition); 866 + return 0; 867 + } 868 + 869 + static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) 870 + { 871 + union hv_stats_object_identity identity = { 872 + .vp.partition_id = partition_id, 873 + .vp.vp_index = vp_index, 874 + }; 875 + 876 + identity.vp.stats_area_type = HV_STATS_AREA_SELF; 877 + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); 878 + 879 + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 880 + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); 881 + } 882 + 883 + static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, 884 + void *stats_pages[]) 885 + { 886 + union hv_stats_object_identity identity = { 887 + .vp.partition_id = partition_id, 888 + .vp.vp_index = vp_index, 889 + }; 890 + int err; 891 + 892 + identity.vp.stats_area_type = HV_STATS_AREA_SELF; 893 + err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, 894 + &stats_pages[HV_STATS_AREA_SELF]); 895 + 
if (err) 896 + return err; 897 + 898 + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 899 + err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, 900 + &stats_pages[HV_STATS_AREA_PARENT]); 901 + if (err) 902 + goto unmap_self; 903 + 904 + return 0; 905 + 906 + unmap_self: 907 + identity.vp.stats_area_type = HV_STATS_AREA_SELF; 908 + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); 909 + return err; 910 + } 911 + 912 + static long 913 + mshv_partition_ioctl_create_vp(struct mshv_partition *partition, 914 + void __user *arg) 915 + { 916 + struct mshv_create_vp args; 917 + struct mshv_vp *vp; 918 + struct page *intercept_message_page, *register_page, *ghcb_page; 919 + void *stats_pages[2]; 920 + long ret; 921 + 922 + if (copy_from_user(&args, arg, sizeof(args))) 923 + return -EFAULT; 924 + 925 + if (args.vp_index >= MSHV_MAX_VPS) 926 + return -EINVAL; 927 + 928 + if (partition->pt_vp_array[args.vp_index]) 929 + return -EEXIST; 930 + 931 + ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, 932 + 0 /* Only valid for root partition VPs */); 933 + if (ret) 934 + return ret; 935 + 936 + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, 937 + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 938 + input_vtl_zero, 939 + &intercept_message_page); 940 + if (ret) 941 + goto destroy_vp; 942 + 943 + if (!mshv_partition_encrypted(partition)) { 944 + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, 945 + HV_VP_STATE_PAGE_REGISTERS, 946 + input_vtl_zero, 947 + &register_page); 948 + if (ret) 949 + goto unmap_intercept_message_page; 950 + } 951 + 952 + if (mshv_partition_encrypted(partition) && 953 + is_ghcb_mapping_available()) { 954 + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, 955 + HV_VP_STATE_PAGE_GHCB, 956 + input_vtl_normal, 957 + &ghcb_page); 958 + if (ret) 959 + goto unmap_register_page; 960 + } 961 + 962 + if (hv_parent_partition()) { 963 + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, 964 
+ stats_pages); 965 + if (ret) 966 + goto unmap_ghcb_page; 967 + } 968 + 969 + vp = kzalloc(sizeof(*vp), GFP_KERNEL); 970 + if (!vp) 971 + goto unmap_stats_pages; 972 + 973 + vp->vp_partition = mshv_partition_get(partition); 974 + if (!vp->vp_partition) { 975 + ret = -EBADF; 976 + goto free_vp; 977 + } 978 + 979 + mutex_init(&vp->vp_mutex); 980 + init_waitqueue_head(&vp->run.vp_suspend_queue); 981 + atomic64_set(&vp->run.vp_signaled_count, 0); 982 + 983 + vp->vp_index = args.vp_index; 984 + vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); 985 + if (!mshv_partition_encrypted(partition)) 986 + vp->vp_register_page = page_to_virt(register_page); 987 + 988 + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 989 + vp->vp_ghcb_page = page_to_virt(ghcb_page); 990 + 991 + if (hv_parent_partition()) 992 + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); 993 + 994 + /* 995 + * Keep anon_inode_getfd last: it installs fd in the file struct and 996 + * thus makes the state accessible in user space. 
997 + */ 998 + ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, 999 + O_RDWR | O_CLOEXEC); 1000 + if (ret < 0) 1001 + goto put_partition; 1002 + 1003 + /* already exclusive with the partition mutex for all ioctls */ 1004 + partition->pt_vp_count++; 1005 + partition->pt_vp_array[args.vp_index] = vp; 1006 + 1007 + return ret; 1008 + 1009 + put_partition: 1010 + mshv_partition_put(partition); 1011 + free_vp: 1012 + kfree(vp); 1013 + unmap_stats_pages: 1014 + if (hv_parent_partition()) 1015 + mshv_vp_stats_unmap(partition->pt_id, args.vp_index); 1016 + unmap_ghcb_page: 1017 + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { 1018 + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, 1019 + HV_VP_STATE_PAGE_GHCB, 1020 + input_vtl_normal); 1021 + } 1022 + unmap_register_page: 1023 + if (!mshv_partition_encrypted(partition)) { 1024 + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, 1025 + HV_VP_STATE_PAGE_REGISTERS, 1026 + input_vtl_zero); 1027 + } 1028 + unmap_intercept_message_page: 1029 + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, 1030 + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1031 + input_vtl_zero); 1032 + destroy_vp: 1033 + hv_call_delete_vp(partition->pt_id, args.vp_index); 1034 + return ret; 1035 + } 1036 + 1037 + static int mshv_init_async_handler(struct mshv_partition *partition) 1038 + { 1039 + if (completion_done(&partition->async_hypercall)) { 1040 + pt_err(partition, 1041 + "Cannot issue async hypercall while another one in progress!\n"); 1042 + return -EPERM; 1043 + } 1044 + 1045 + reinit_completion(&partition->async_hypercall); 1046 + return 0; 1047 + } 1048 + 1049 + static void mshv_async_hvcall_handler(void *data, u64 *status) 1050 + { 1051 + struct mshv_partition *partition = data; 1052 + 1053 + wait_for_completion(&partition->async_hypercall); 1054 + pt_dbg(partition, "Async hypercall completed!\n"); 1055 + 1056 + *status = partition->async_hypercall_status; 1057 + } 1058 + 1059 + static int 
/*
 * Make a (previously exclusive) region's SPA pages accessible to the host
 * again. Required before unpinning pages of an encrypted partition.
 */
static int
mshv_partition_region_share(struct mshv_mem_region *region)
{
	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;

	if (region->flags.large_pages)
		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;

	return hv_call_modify_spa_host_access(region->partition->pt_id,
			region->pages, region->nr_pages,
			HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
			flags, true);
}

/* Revoke host access to a region's SPA pages (for encrypted partitions) */
static int
mshv_partition_region_unshare(struct mshv_mem_region *region)
{
	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;

	if (region->flags.large_pages)
		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;

	return hv_call_modify_spa_host_access(region->partition->pt_id,
			region->pages, region->nr_pages,
			0,
			flags, false);
}

/* Map a sub-range of an already-populated region into the guest's GPA space */
static int
mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
			u64 page_offset, u64 page_count)
{
	if (page_offset + page_count > region->nr_pages)
		return -EINVAL;

	if (region->flags.large_pages)
		map_flags |= HV_MAP_GPA_LARGE_PAGE;

	/* ask the hypervisor to map guest ram */
	return hv_call_map_gpa_pages(region->partition->pt_id,
				     region->start_gfn + page_offset,
				     page_count, map_flags,
				     region->pages + page_offset);
}

/* Map the entire region with its configured access flags */
static int
mshv_region_map(struct mshv_mem_region *region)
{
	u32 map_flags = region->hv_map_flags;

	return mshv_region_remap_pages(region, map_flags,
				       0, region->nr_pages);
}

/*
 * Release a sub-range of a region's pages: unpin them (if the region was
 * pinned) and clear the page pointers so a later evict/populate cycle
 * starts clean.
 */
static void
mshv_region_evict_pages(struct mshv_mem_region *region,
			u64 page_offset, u64 page_count)
{
	if (region->flags.range_pinned)
		unpin_user_pages(region->pages + page_offset, page_count);

	memset(region->pages + page_offset, 0,
	       page_count * sizeof(struct page *));
}

static void
mshv_region_evict(struct mshv_mem_region *region)
{
	mshv_region_evict_pages(region, 0, region->nr_pages);
}

/*
 * Pin the userspace backing of a sub-range of the region into
 * region->pages. On failure, pages pinned so far are released.
 */
static int
mshv_region_populate_pages(struct mshv_mem_region *region,
			   u64 page_offset, u64 page_count)
{
	u64 done_count, nr_pages;
	struct page **pages;
	__u64 userspace_addr;
	int ret;

	if (page_offset + page_count > region->nr_pages)
		return -EINVAL;

	/* 'ret' is the number of pages pinned by the previous iteration */
	for (done_count = 0; done_count < page_count; done_count += ret) {
		pages = region->pages + page_offset + done_count;
		userspace_addr = region->start_uaddr +
				 (page_offset + done_count) *
				 HV_HYP_PAGE_SIZE;
		nr_pages = min(page_count - done_count,
			       MSHV_PIN_PAGES_BATCH_SIZE);

		/*
		 * Pinning assuming 4k pages works for large pages too.
		 * All page structs within the large page are returned.
		 *
		 * Pin requests are batched because pin_user_pages_fast
		 * with the FOLL_LONGTERM flag does a large temporary
		 * allocation of contiguous memory.
		 */
		if (region->flags.range_pinned)
			ret = pin_user_pages_fast(userspace_addr,
						  nr_pages,
						  FOLL_WRITE | FOLL_LONGTERM,
						  pages);
		else
			ret = -EOPNOTSUPP;

		if (ret < 0)
			goto release_pages;
	}

	/* Note large-page backing so later map/unmap calls use the right flags */
	if (PageHuge(region->pages[page_offset]))
		region->flags.large_pages = true;

	return 0;

release_pages:
	mshv_region_evict_pages(region, page_offset, done_count);
	return ret;
}

static int
mshv_region_populate(struct mshv_mem_region *region)
{
	return mshv_region_populate_pages(region, 0, region->nr_pages);
}

/* Find the region containing a guest frame number, or NULL */
static struct mshv_mem_region *
mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (gfn >= region->start_gfn &&
		    gfn < region->start_gfn + region->nr_pages)
			return region;
	}

	return NULL;
}

/* Find the region containing a userspace address, or NULL */
static struct mshv_mem_region *
mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
{
	struct mshv_mem_region *region;

	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
		if (uaddr >= region->start_uaddr &&
		    uaddr < region->start_uaddr +
			    (region->nr_pages << HV_HYP_PAGE_SHIFT))
			return region;
	}

	return NULL;
}
/*
 * NB: caller checks and makes sure mem->size is page aligned
 * Returns: 0 with regionpp updated on success, or -errno
 */
static int mshv_partition_create_region(struct mshv_partition *partition,
					struct mshv_user_mem_region *mem,
					struct mshv_mem_region **regionpp,
					bool is_mmio)
{
	struct mshv_mem_region *region;
	u64 nr_pages = HVPFN_DOWN(mem->size);

	/* Reject overlapping regions */
	if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
	    mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
	    mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
	    mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
		return -EEXIST;

	/* Region struct has a trailing page-pointer array, hence vzalloc */
	region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
	if (!region)
		return -ENOMEM;

	region->nr_pages = nr_pages;
	region->start_gfn = mem->guest_pfn;
	region->start_uaddr = mem->userspace_addr;
	region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
	if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
		region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
	if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;

	/* Note: large_pages flag populated when we pin the pages */
	if (!is_mmio)
		region->flags.range_pinned = true;

	region->partition = partition;

	*regionpp = region;

	return 0;
}

/*
 * Map guest ram. if snp, make sure to release that from the host first
 * Side Effects: In case of failure, pages are unpinned when feasible.
 */
static int
mshv_partition_mem_region_map(struct mshv_mem_region *region)
{
	struct mshv_partition *partition = region->partition;
	int ret;

	ret = mshv_region_populate(region);
	if (ret) {
		pt_err(partition, "Failed to populate memory region: %d\n",
		       ret);
		goto err_out;
	}

	/*
	 * For an SNP partition it is a requirement that for every memory region
	 * that we are going to map for this partition we should make sure that
	 * host access to that region is released. This is ensured by doing an
	 * additional hypercall which will update the SLAT to release host
	 * access to guest memory regions.
	 */
	if (mshv_partition_encrypted(partition)) {
		ret = mshv_partition_region_unshare(region);
		if (ret) {
			pt_err(partition,
			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
			       region->start_gfn, ret);
			goto evict_region;
		}
	}

	ret = mshv_region_map(region);
	if (ret && mshv_partition_encrypted(partition)) {
		int shrc;

		/* Undo the unshare above before unpinning the pages */
		shrc = mshv_partition_region_share(region);
		if (!shrc)
			goto evict_region;

		pt_err(partition,
		       "Failed to share memory region (guest_pfn: %llu): %d\n",
		       region->start_gfn, shrc);
		/*
		 * Don't unpin if marking shared failed because pages are no
		 * longer mapped in the host, ie root, anymore.
		 */
		goto err_out;
	}

	return 0;

evict_region:
	mshv_region_evict(region);
err_out:
	return ret;
}

/*
 * This maps two things: guest RAM and for pci passthru mmio space.
 *
 * mmio:
 *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
 *  - Two things need to happen for mapping mmio range:
 *    1. mapped in the uaddr so VMM can access it.
 *    2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
 *
 * This function takes care of the second. The first one is managed by vfio,
 * and hence is taken care of via vfio_pci_mmap_fault().
 */
static long
mshv_map_user_memory(struct mshv_partition *partition,
		     struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	struct vm_area_struct *vma;
	bool is_mmio;
	ulong mmio_pfn;
	long ret;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
	    !access_ok((const void *)mem.userspace_addr, mem.size))
		return -EINVAL;

	/* Only consult the vma under the lock; decide on !vma after unlock */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, mem.userspace_addr);
	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
	mmap_read_unlock(current->mm);

	if (!vma)
		return -EINVAL;

	ret = mshv_partition_create_region(partition, &mem, &region,
					   is_mmio);
	if (ret)
		return ret;

	if (is_mmio)
		ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
					     mmio_pfn, HVPFN_DOWN(mem.size));
	else
		ret = mshv_partition_mem_region_map(region);

	if (ret)
		goto errout;

	/* Install the new region */
	hlist_add_head(&region->hnode, &partition->pt_mem_regions);

	return 0;

errout:
	vfree(region);
	return ret;
}
/* Called for unmapping both the guest ram and the mmio space */
static long
mshv_unmap_user_memory(struct mshv_partition *partition,
		       struct mshv_user_mem_region mem)
{
	struct mshv_mem_region *region;
	u32 unmap_flags = 0;

	if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
		return -EINVAL;

	region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
	if (!region)
		return -EINVAL;

	/* Paranoia check */
	if (region->start_uaddr != mem.userspace_addr ||
	    region->start_gfn != mem.guest_pfn ||
	    region->nr_pages != HVPFN_DOWN(mem.size))
		return -EINVAL;

	hlist_del(&region->hnode);

	if (region->flags.large_pages)
		unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;

	/* ignore unmap failures and continue as process may be exiting */
	hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
				region->nr_pages, unmap_flags);

	mshv_region_evict(region);

	vfree(region);
	return 0;
}

/*
 * MSHV_SET_GUEST_MEMORY: validate the uapi struct and dispatch to the
 * map or unmap path depending on the UNMAP flag.
 */
static long
mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
				struct mshv_user_mem_region __user *user_mem)
{
	struct mshv_user_mem_region mem;

	if (copy_from_user(&mem, user_mem, sizeof(mem)))
		return -EFAULT;

	if (!mem.size ||
	    !PAGE_ALIGNED(mem.size) ||
	    !PAGE_ALIGNED(mem.userspace_addr) ||
	    (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
	    mshv_field_nonzero(mem, rsvd))
		return -EINVAL;

	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
		return mshv_unmap_user_memory(partition, mem);

	return mshv_map_user_memory(partition, mem);
}

/* MSHV_IOEVENTFD: copy args and delegate to the eventfd machinery */
static long
mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
			       void __user *user_args)
{
	struct mshv_user_ioeventfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_ioeventfd(partition, &args);
}

/* MSHV_IRQFD: copy args and delegate to the irqfd machinery */
static long
mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
			   void __user *user_args)
{
	struct mshv_user_irqfd args;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	return mshv_set_unset_irqfd(partition, &args);
}

/*
 * MSHV_GET_GPAP_ACCESS_BITMAP: query per-page accessed/dirty state from the
 * hypervisor and return it to userspace as a bitmap (one bit per page).
 * The hypervisor returns one byte of state per page; that same buffer is
 * then compacted in place into the bitmap.
 */
static long
mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
					    void __user *user_args)
{
	struct mshv_gpap_access_bitmap args;
	union hv_gpa_page_access_state *states;
	long ret, i;
	union hv_gpa_page_access_state_flags hv_flags = {};
	u8 hv_type_mask;
	ulong bitmap_buf_sz, states_buf_sz;
	int written = 0;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
	    args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
	    mshv_field_nonzero(args, rsvd) || !args.page_count ||
	    !args.bitmap_ptr)
		return -EINVAL;

	if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
		return -E2BIG;

	/* Num bytes needed to store bitmap; one bit per page rounded up */
	bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);

	/* Sanity check */
	if (bitmap_buf_sz > states_buf_sz)
		return -EBADFD;

	switch (args.access_type) {
	case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
		hv_type_mask = 1;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_accessed = 1;
			/* not accessed implies not dirty */
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_accessed = 1;
		}
		break;
	case MSHV_GPAP_ACCESS_TYPE_DIRTY:
		hv_type_mask = 2;
		if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
			hv_flags.clear_dirty = 1;
		} else { /* MSHV_GPAP_ACCESS_OP_SET */
			hv_flags.set_dirty = 1;
			/* dirty implies accessed */
			hv_flags.set_accessed = 1;
		}
		break;
	}

	states = vzalloc(states_buf_sz);
	if (!states)
		return -ENOMEM;

	ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
					    args.gpap_base, hv_flags, &written,
					    states);
	if (ret)
		goto free_return;

	/*
	 * Overwrite states buffer with bitmap - the bits in hv_type_mask
	 * correspond to bitfields in hv_gpa_page_access_state
	 */
	for (i = 0; i < written; ++i)
		__assign_bit(i, (ulong *)states,
			     states[i].as_uint8 & hv_type_mask);

	/* zero the unused bits in the last byte(s) of the returned bitmap */
	for (i = written; i < bitmap_buf_sz * 8; ++i)
		__clear_bit(i, (ulong *)states);

	if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
		ret = -EFAULT;

free_return:
	vfree(states);
	return ret;
}

/*
 * MSHV_SET_MSI_ROUTING: replace the partition's MSI routing table with the
 * table supplied by userspace (an empty table when args.nr == 0).
 */
static long
mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
				     void __user *user_args)
{
	struct mshv_user_irq_entry *entries = NULL;
	struct mshv_user_irq_table args;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	if (args.nr > MSHV_MAX_GUEST_IRQS ||
	    mshv_field_nonzero(args, rsvd))
		return -EINVAL;

	if (args.nr) {
		struct mshv_user_irq_table __user *urouting = user_args;

		entries = vmemdup_user(urouting->entries,
				       array_size(sizeof(*entries),
						  args.nr));
		if (IS_ERR(entries))
			return PTR_ERR(entries);
	}
	ret = mshv_update_routing_table(partition, entries, args.nr);
	kvfree(entries);

	return ret;
}

/*
 * MSHV_INITIALIZE_PARTITION: finalize partition setup in the hypervisor.
 * Idempotent - returns 0 if already initialized.
 */
static long
mshv_partition_ioctl_initialize(struct mshv_partition *partition)
{
	long ret;

	if (partition->pt_initialized)
		return 0;

	ret = hv_call_initialize_partition(partition->pt_id);
	if (ret)
		goto withdraw_mem;

	partition->pt_initialized = true;

	return 0;

withdraw_mem:
	/* Return any memory deposited for this partition to the root */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);

	return ret;
}
/* Partition-level ioctl dispatcher; pt_mutex serializes all partition ops */
static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct mshv_partition *partition = filp->private_data;
	long ret;
	void __user *uarg = (void __user *)arg;

	if (mutex_lock_killable(&partition->pt_mutex))
		return -EINTR;

	switch (ioctl) {
	case MSHV_INITIALIZE_PARTITION:
		ret = mshv_partition_ioctl_initialize(partition);
		break;
	case MSHV_SET_GUEST_MEMORY:
		ret = mshv_partition_ioctl_set_memory(partition, uarg);
		break;
	case MSHV_CREATE_VP:
		ret = mshv_partition_ioctl_create_vp(partition, uarg);
		break;
	case MSHV_IRQFD:
		ret = mshv_partition_ioctl_irqfd(partition, uarg);
		break;
	case MSHV_IOEVENTFD:
		ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
		break;
	case MSHV_SET_MSI_ROUTING:
		ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
		break;
	case MSHV_GET_GPAP_ACCESS_BITMAP:
		ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
								  uarg);
		break;
	case MSHV_ROOT_HVCALL:
		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
		break;
	default:
		ret = -ENOTTY;
	}

	mutex_unlock(&partition->pt_mutex);
	return ret;
}

/* Stop the hypervisor from dispatching this VP (root scheduler teardown) */
static int
disable_vp_dispatch(struct mshv_vp *vp)
{
	int ret;
	struct hv_register_assoc dispatch_suspend = {
		.name = HV_REGISTER_DISPATCH_SUSPEND,
		.value.dispatch_suspend.suspended = 1,
	};

	ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &dispatch_suspend);
	if (ret)
		vp_err(vp, "failed to suspend\n");

	return ret;
}

/* Read the hypervisor's signal count for this VP; 0 on failure */
static int
get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
{
	int ret;
	struct hv_register_assoc root_signal_count = {
		.name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
	};

	ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
				    1, &root_signal_count);

	if (ret) {
		vp_err(vp, "Failed to get root signal count");
		*count = 0;
		return ret;
	}

	*count = root_signal_count.value.reg64;

	return ret;
}

/*
 * Wait until every signal the hypervisor has generated for this VP has been
 * observed by the root, so no notification is lost across teardown.
 */
static void
drain_vp_signals(struct mshv_vp *vp)
{
	u64 hv_signal_count;
	u64 vp_signal_count;

	get_vp_signaled_count(vp, &hv_signal_count);

	vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);

	/*
	 * There should be at most 1 outstanding notification, but be extra
	 * careful anyway.
	 */
	while (hv_signal_count != vp_signal_count) {
		WARN_ON(hv_signal_count - vp_signal_count != 1);

		if (wait_event_interruptible(vp->run.vp_suspend_queue,
					     vp->run.kicked_by_hv == 1))
			break;
		vp->run.kicked_by_hv = 0;
		vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
	}
}

static void drain_all_vps(const struct mshv_partition *partition)
{
	int i;
	struct mshv_vp *vp;

	/*
	 * VPs are reachable from ISR. It is safe to not take the partition
	 * lock because nobody else can enter this function and drop the
	 * partition from the list.
	 */
	for (i = 0; i < MSHV_MAX_VPS; i++) {
		vp = partition->pt_vp_array[i];
		if (!vp)
			continue;
		/*
		 * Disable dispatching of the VP in the hypervisor. After this
		 * the hypervisor guarantees it won't generate any signals for
		 * the VP and the hypervisor's VP signal count won't change.
		 */
		disable_vp_dispatch(vp);
		drain_vp_signals(vp);
	}
}

/* Unlink from the partition hash and wait out concurrent RCU readers */
static void
remove_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);
	hlist_del_rcu(&partition->pt_hnode);
	spin_unlock(&mshv_root.pt_ht_lock);

	synchronize_rcu();
}

/*
 * Tear down a partition and remove it from the list.
 * Partition's refcount must be 0
 */
static void destroy_partition(struct mshv_partition *partition)
{
	struct mshv_vp *vp;
	struct mshv_mem_region *region;
	int i, ret;
	struct hlist_node *n;

	if (refcount_read(&partition->pt_ref_count)) {
		pt_err(partition,
		       "Attempt to destroy partition but refcount > 0\n");
		return;
	}

	if (partition->pt_initialized) {
		/*
		 * We only need to drain signals for root scheduler. This should be
		 * done before removing the partition from the partition list.
		 */
		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
			drain_all_vps(partition);

		/* Remove vps */
		for (i = 0; i < MSHV_MAX_VPS; ++i) {
			vp = partition->pt_vp_array[i];
			if (!vp)
				continue;

			if (hv_parent_partition())
				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);

			if (vp->vp_register_page) {
				(void)hv_call_unmap_vp_state_page(partition->pt_id,
								  vp->vp_index,
								  HV_VP_STATE_PAGE_REGISTERS,
								  input_vtl_zero);
				vp->vp_register_page = NULL;
			}

			(void)hv_call_unmap_vp_state_page(partition->pt_id,
							  vp->vp_index,
							  HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
							  input_vtl_zero);
			vp->vp_intercept_msg_page = NULL;

			if (vp->vp_ghcb_page) {
				(void)hv_call_unmap_vp_state_page(partition->pt_id,
								  vp->vp_index,
								  HV_VP_STATE_PAGE_GHCB,
								  input_vtl_normal);
				vp->vp_ghcb_page = NULL;
			}

			kfree(vp);

			partition->pt_vp_array[i] = NULL;
		}

		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
		hv_call_finalize_partition(partition->pt_id);

		partition->pt_initialized = false;
	}

	remove_partition(partition);

	/* Remove regions, regain access to the memory and unpin the pages */
	hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
				  hnode) {
		hlist_del(&region->hnode);

		if (mshv_partition_encrypted(partition)) {
			ret = mshv_partition_region_share(region);
			if (ret) {
				pt_err(partition,
				       "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
				       ret);
				return;
			}
		}

		mshv_region_evict(region);

		vfree(region);
	}

	/* Withdraw and free all pages we deposited */
	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
	hv_call_delete_partition(partition->pt_id);

	mshv_free_routing_table(partition);
	kfree(partition);
}

/* Take a reference unless the partition is already being destroyed */
struct
mshv_partition *mshv_partition_get(struct mshv_partition *partition)
{
	if (refcount_inc_not_zero(&partition->pt_ref_count))
		return partition;
	return NULL;
}

/* Look up a partition by id; caller must hold the RCU read lock */
struct
mshv_partition *mshv_partition_find(u64 partition_id)
	__must_hold(RCU)
{
	struct mshv_partition *p;

	hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
				   partition_id)
		if (p->pt_id == partition_id)
			return p;

	return NULL;
}

/* Drop a reference; the last put tears the partition down */
void
mshv_partition_put(struct mshv_partition *partition)
{
	if (refcount_dec_and_test(&partition->pt_ref_count))
		destroy_partition(partition);
}

static int
mshv_partition_release(struct inode *inode, struct file *filp)
{
	struct mshv_partition *partition = filp->private_data;

	mshv_eventfd_release(partition);

	cleanup_srcu_struct(&partition->pt_irq_srcu);

	mshv_partition_put(partition);

	return 0;
}

/* Insert the partition into the global hash, keyed by partition id */
static int
add_partition(struct mshv_partition *partition)
{
	spin_lock(&mshv_root.pt_ht_lock);

	hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
		     partition->pt_id);

	spin_unlock(&mshv_root.pt_ht_lock);

	return 0;
}

static long
mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
{
	struct mshv_create_partition args;
	u64 creation_flags;
	struct hv_partition_creation_properties creation_properties = {};
	union hv_partition_isolation_properties isolation_properties = {};
	struct mshv_partition *partition;
	struct file *file;
	int fd;
+ long ret; 1901 + 1902 + if (copy_from_user(&args, user_arg, sizeof(args))) 1903 + return -EFAULT; 1904 + 1905 + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1906 + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1907 + return -EINVAL; 1908 + 1909 + /* Only support EXO partitions */ 1910 + creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | 1911 + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; 1912 + 1913 + if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) 1914 + creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; 1915 + if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) 1916 + creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; 1917 + if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) 1918 + creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; 1919 + 1920 + switch (args.pt_isolation) { 1921 + case MSHV_PT_ISOLATION_NONE: 1922 + isolation_properties.isolation_type = 1923 + HV_PARTITION_ISOLATION_TYPE_NONE; 1924 + break; 1925 + } 1926 + 1927 + partition = kzalloc(sizeof(*partition), GFP_KERNEL); 1928 + if (!partition) 1929 + return -ENOMEM; 1930 + 1931 + partition->pt_module_dev = module_dev; 1932 + partition->isolation_type = isolation_properties.isolation_type; 1933 + 1934 + refcount_set(&partition->pt_ref_count, 1); 1935 + 1936 + mutex_init(&partition->pt_mutex); 1937 + 1938 + mutex_init(&partition->pt_irq_lock); 1939 + 1940 + init_completion(&partition->async_hypercall); 1941 + 1942 + INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); 1943 + 1944 + INIT_HLIST_HEAD(&partition->pt_devices); 1945 + 1946 + INIT_HLIST_HEAD(&partition->pt_mem_regions); 1947 + 1948 + mshv_eventfd_init(partition); 1949 + 1950 + ret = init_srcu_struct(&partition->pt_irq_srcu); 1951 + if (ret) 1952 + goto free_partition; 1953 + 1954 + ret = hv_call_create_partition(creation_flags, 1955 + creation_properties, 1956 + isolation_properties, 1957 + &partition->pt_id); 1958 + if (ret) 1959 + goto cleanup_irq_srcu; 1960 + 1961 + ret = 
add_partition(partition); 1962 + if (ret) 1963 + goto delete_partition; 1964 + 1965 + ret = mshv_init_async_handler(partition); 1966 + if (ret) 1967 + goto remove_partition; 1968 + 1969 + fd = get_unused_fd_flags(O_CLOEXEC); 1970 + if (fd < 0) { 1971 + ret = fd; 1972 + goto remove_partition; 1973 + } 1974 + 1975 + file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, 1976 + partition, O_RDWR); 1977 + if (IS_ERR(file)) { 1978 + ret = PTR_ERR(file); 1979 + goto put_fd; 1980 + } 1981 + 1982 + fd_install(fd, file); 1983 + 1984 + return fd; 1985 + 1986 + put_fd: 1987 + put_unused_fd(fd); 1988 + remove_partition: 1989 + remove_partition(partition); 1990 + delete_partition: 1991 + hv_call_delete_partition(partition->pt_id); 1992 + cleanup_irq_srcu: 1993 + cleanup_srcu_struct(&partition->pt_irq_srcu); 1994 + free_partition: 1995 + kfree(partition); 1996 + 1997 + return ret; 1998 + } 1999 + 2000 + static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, 2001 + unsigned long arg) 2002 + { 2003 + struct miscdevice *misc = filp->private_data; 2004 + 2005 + switch (ioctl) { 2006 + case MSHV_CREATE_PARTITION: 2007 + return mshv_ioctl_create_partition((void __user *)arg, 2008 + misc->this_device); 2009 + } 2010 + 2011 + return -ENOTTY; 2012 + } 2013 + 2014 + static int 2015 + mshv_dev_open(struct inode *inode, struct file *filp) 2016 + { 2017 + return 0; 2018 + } 2019 + 2020 + static int 2021 + mshv_dev_release(struct inode *inode, struct file *filp) 2022 + { 2023 + return 0; 2024 + } 2025 + 2026 + static int mshv_cpuhp_online; 2027 + static int mshv_root_sched_online; 2028 + 2029 + static const char *scheduler_type_to_string(enum hv_scheduler_type type) 2030 + { 2031 + switch (type) { 2032 + case HV_SCHEDULER_TYPE_LP: 2033 + return "classic scheduler without SMT"; 2034 + case HV_SCHEDULER_TYPE_LP_SMT: 2035 + return "classic scheduler with SMT"; 2036 + case HV_SCHEDULER_TYPE_CORE_SMT: 2037 + return "core scheduler"; 2038 + case HV_SCHEDULER_TYPE_ROOT: 2039 + 
return "root scheduler"; 2040 + default: 2041 + return "unknown scheduler"; 2042 + }; 2043 + } 2044 + 2045 + /* TODO move this to hv_common.c when needed outside */ 2046 + static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) 2047 + { 2048 + struct hv_input_get_system_property *input; 2049 + struct hv_output_get_system_property *output; 2050 + unsigned long flags; 2051 + u64 status; 2052 + 2053 + local_irq_save(flags); 2054 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 2055 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 2056 + 2057 + memset(input, 0, sizeof(*input)); 2058 + memset(output, 0, sizeof(*output)); 2059 + input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; 2060 + 2061 + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); 2062 + if (!hv_result_success(status)) { 2063 + local_irq_restore(flags); 2064 + pr_err("%s: %s\n", __func__, hv_result_to_string(status)); 2065 + return hv_result_to_errno(status); 2066 + } 2067 + 2068 + *out = output->scheduler_type; 2069 + local_irq_restore(flags); 2070 + 2071 + return 0; 2072 + } 2073 + 2074 + /* Retrieve and stash the supported scheduler type */ 2075 + static int __init mshv_retrieve_scheduler_type(struct device *dev) 2076 + { 2077 + int ret; 2078 + 2079 + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); 2080 + if (ret) 2081 + return ret; 2082 + 2083 + dev_info(dev, "Hypervisor using %s\n", 2084 + scheduler_type_to_string(hv_scheduler_type)); 2085 + 2086 + switch (hv_scheduler_type) { 2087 + case HV_SCHEDULER_TYPE_CORE_SMT: 2088 + case HV_SCHEDULER_TYPE_LP_SMT: 2089 + case HV_SCHEDULER_TYPE_ROOT: 2090 + case HV_SCHEDULER_TYPE_LP: 2091 + /* Supported scheduler, nothing to do */ 2092 + break; 2093 + default: 2094 + dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", 2095 + hv_scheduler_type); 2096 + return -EOPNOTSUPP; 2097 + } 2098 + 2099 + return 0; 2100 + } 2101 + 2102 + static int mshv_root_scheduler_init(unsigned int cpu) 2103 + { 2104 + void **inputarg, 
**outputarg, *p; 2105 + 2106 + inputarg = (void **)this_cpu_ptr(root_scheduler_input); 2107 + outputarg = (void **)this_cpu_ptr(root_scheduler_output); 2108 + 2109 + /* Allocate two consecutive pages. One for input, one for output. */ 2110 + p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); 2111 + if (!p) 2112 + return -ENOMEM; 2113 + 2114 + *inputarg = p; 2115 + *outputarg = (char *)p + HV_HYP_PAGE_SIZE; 2116 + 2117 + return 0; 2118 + } 2119 + 2120 + static int mshv_root_scheduler_cleanup(unsigned int cpu) 2121 + { 2122 + void *p, **inputarg, **outputarg; 2123 + 2124 + inputarg = (void **)this_cpu_ptr(root_scheduler_input); 2125 + outputarg = (void **)this_cpu_ptr(root_scheduler_output); 2126 + 2127 + p = *inputarg; 2128 + 2129 + *inputarg = NULL; 2130 + *outputarg = NULL; 2131 + 2132 + kfree(p); 2133 + 2134 + return 0; 2135 + } 2136 + 2137 + /* Must be called after retrieving the scheduler type */ 2138 + static int 2139 + root_scheduler_init(struct device *dev) 2140 + { 2141 + int ret; 2142 + 2143 + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 2144 + return 0; 2145 + 2146 + root_scheduler_input = alloc_percpu(void *); 2147 + root_scheduler_output = alloc_percpu(void *); 2148 + 2149 + if (!root_scheduler_input || !root_scheduler_output) { 2150 + dev_err(dev, "Failed to allocate root scheduler buffers\n"); 2151 + ret = -ENOMEM; 2152 + goto out; 2153 + } 2154 + 2155 + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", 2156 + mshv_root_scheduler_init, 2157 + mshv_root_scheduler_cleanup); 2158 + 2159 + if (ret < 0) { 2160 + dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); 2161 + goto out; 2162 + } 2163 + 2164 + mshv_root_sched_online = ret; 2165 + 2166 + return 0; 2167 + 2168 + out: 2169 + free_percpu(root_scheduler_input); 2170 + free_percpu(root_scheduler_output); 2171 + return ret; 2172 + } 2173 + 2174 + static void 2175 + root_scheduler_deinit(void) 2176 + { 2177 + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 2178 + return; 2179 + 
2180 + cpuhp_remove_state(mshv_root_sched_online); 2181 + free_percpu(root_scheduler_input); 2182 + free_percpu(root_scheduler_output); 2183 + } 2184 + 2185 + static int mshv_reboot_notify(struct notifier_block *nb, 2186 + unsigned long code, void *unused) 2187 + { 2188 + cpuhp_remove_state(mshv_cpuhp_online); 2189 + return 0; 2190 + } 2191 + 2192 + struct notifier_block mshv_reboot_nb = { 2193 + .notifier_call = mshv_reboot_notify, 2194 + }; 2195 + 2196 + static void mshv_root_partition_exit(void) 2197 + { 2198 + unregister_reboot_notifier(&mshv_reboot_nb); 2199 + root_scheduler_deinit(); 2200 + } 2201 + 2202 + static int __init mshv_root_partition_init(struct device *dev) 2203 + { 2204 + int err; 2205 + 2206 + if (mshv_retrieve_scheduler_type(dev)) 2207 + return -ENODEV; 2208 + 2209 + err = root_scheduler_init(dev); 2210 + if (err) 2211 + return err; 2212 + 2213 + err = register_reboot_notifier(&mshv_reboot_nb); 2214 + if (err) 2215 + goto root_sched_deinit; 2216 + 2217 + return 0; 2218 + 2219 + root_sched_deinit: 2220 + root_scheduler_deinit(); 2221 + return err; 2222 + } 2223 + 2224 + static int __init mshv_parent_partition_init(void) 2225 + { 2226 + int ret; 2227 + struct device *dev; 2228 + union hv_hypervisor_version_info version_info; 2229 + 2230 + if (!hv_root_partition() || is_kdump_kernel()) 2231 + return -ENODEV; 2232 + 2233 + if (hv_get_hypervisor_version(&version_info)) 2234 + return -ENODEV; 2235 + 2236 + ret = misc_register(&mshv_dev); 2237 + if (ret) 2238 + return ret; 2239 + 2240 + dev = mshv_dev.this_device; 2241 + 2242 + if (version_info.build_number < MSHV_HV_MIN_VERSION || 2243 + version_info.build_number > MSHV_HV_MAX_VERSION) { 2244 + dev_err(dev, "Running on unvalidated Hyper-V version\n"); 2245 + dev_err(dev, "Versions: current: %u min: %u max: %u\n", 2246 + version_info.build_number, MSHV_HV_MIN_VERSION, 2247 + MSHV_HV_MAX_VERSION); 2248 + } 2249 + 2250 + mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); 2251 + if 
(!mshv_root.synic_pages) { 2252 + dev_err(dev, "Failed to allocate percpu synic page\n"); 2253 + ret = -ENOMEM; 2254 + goto device_deregister; 2255 + } 2256 + 2257 + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", 2258 + mshv_synic_init, 2259 + mshv_synic_cleanup); 2260 + if (ret < 0) { 2261 + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); 2262 + goto free_synic_pages; 2263 + } 2264 + 2265 + mshv_cpuhp_online = ret; 2266 + 2267 + ret = mshv_root_partition_init(dev); 2268 + if (ret) 2269 + goto remove_cpu_state; 2270 + 2271 + ret = mshv_irqfd_wq_init(); 2272 + if (ret) 2273 + goto exit_partition; 2274 + 2275 + spin_lock_init(&mshv_root.pt_ht_lock); 2276 + hash_init(mshv_root.pt_htable); 2277 + 2278 + hv_setup_mshv_handler(mshv_isr); 2279 + 2280 + return 0; 2281 + 2282 + exit_partition: 2283 + if (hv_root_partition()) 2284 + mshv_root_partition_exit(); 2285 + remove_cpu_state: 2286 + cpuhp_remove_state(mshv_cpuhp_online); 2287 + free_synic_pages: 2288 + free_percpu(mshv_root.synic_pages); 2289 + device_deregister: 2290 + misc_deregister(&mshv_dev); 2291 + return ret; 2292 + } 2293 + 2294 + static void __exit mshv_parent_partition_exit(void) 2295 + { 2296 + hv_setup_mshv_handler(NULL); 2297 + mshv_port_table_fini(); 2298 + misc_deregister(&mshv_dev); 2299 + mshv_irqfd_wq_cleanup(); 2300 + if (hv_root_partition()) 2301 + mshv_root_partition_exit(); 2302 + cpuhp_remove_state(mshv_cpuhp_online); 2303 + free_percpu(mshv_root.synic_pages); 2304 + } 2305 + 2306 + module_init(mshv_parent_partition_init); 2307 + module_exit(mshv_parent_partition_exit);
--- new file: drivers/hv/mshv_synic.c (+665 lines) ---
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2023, Microsoft Corporation. 4 + * 5 + * mshv_root module's main interrupt handler and associated functionality. 6 + * 7 + * Authors: Microsoft Linux virtualization team 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/slab.h> 12 + #include <linux/mm.h> 13 + #include <linux/io.h> 14 + #include <linux/random.h> 15 + #include <asm/mshyperv.h> 16 + 17 + #include "mshv_eventfd.h" 18 + #include "mshv.h" 19 + 20 + static u32 synic_event_ring_get_queued_port(u32 sint_index) 21 + { 22 + struct hv_synic_event_ring_page **event_ring_page; 23 + volatile struct hv_synic_event_ring *ring; 24 + struct hv_synic_pages *spages; 25 + u8 **synic_eventring_tail; 26 + u32 message; 27 + u8 tail; 28 + 29 + spages = this_cpu_ptr(mshv_root.synic_pages); 30 + event_ring_page = &spages->synic_event_ring_page; 31 + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); 32 + 33 + if (unlikely(!*synic_eventring_tail)) { 34 + pr_debug("Missing synic event ring tail!\n"); 35 + return 0; 36 + } 37 + tail = (*synic_eventring_tail)[sint_index]; 38 + 39 + if (unlikely(!*event_ring_page)) { 40 + pr_debug("Missing synic event ring page!\n"); 41 + return 0; 42 + } 43 + 44 + ring = &(*event_ring_page)->sint_event_ring[sint_index]; 45 + 46 + /* 47 + * Get the message. 48 + */ 49 + message = ring->data[tail]; 50 + 51 + if (!message) { 52 + if (ring->ring_full) { 53 + /* 54 + * Ring is marked full, but we would have consumed all 55 + * the messages. Notify the hypervisor that ring is now 56 + * empty and check again. 57 + */ 58 + ring->ring_full = 0; 59 + hv_call_notify_port_ring_empty(sint_index); 60 + message = ring->data[tail]; 61 + } 62 + 63 + if (!message) { 64 + ring->signal_masked = 0; 65 + /* 66 + * Unmask the signal and sync with hypervisor 67 + * before one last check for any message. 68 + */ 69 + mb(); 70 + message = ring->data[tail]; 71 + 72 + /* 73 + * Ok, lets bail out. 
74 + */ 75 + if (!message) 76 + return 0; 77 + } 78 + 79 + ring->signal_masked = 1; 80 + } 81 + 82 + /* 83 + * Clear the message in the ring buffer. 84 + */ 85 + ring->data[tail] = 0; 86 + 87 + if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT) 88 + tail = 0; 89 + 90 + (*synic_eventring_tail)[sint_index] = tail; 91 + 92 + return message; 93 + } 94 + 95 + static bool 96 + mshv_doorbell_isr(struct hv_message *msg) 97 + { 98 + struct hv_notification_message_payload *notification; 99 + u32 port; 100 + 101 + if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT) 102 + return false; 103 + 104 + notification = (struct hv_notification_message_payload *)msg->u.payload; 105 + if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX) 106 + return false; 107 + 108 + while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) { 109 + struct port_table_info ptinfo = { 0 }; 110 + 111 + if (mshv_portid_lookup(port, &ptinfo)) { 112 + pr_debug("Failed to get port info from port_table!\n"); 113 + continue; 114 + } 115 + 116 + if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) { 117 + pr_debug("Not a doorbell port!, port: %d, port_type: %d\n", 118 + port, ptinfo.hv_port_type); 119 + continue; 120 + } 121 + 122 + /* Invoke the callback */ 123 + ptinfo.hv_port_doorbell.doorbell_cb(port, 124 + ptinfo.hv_port_doorbell.data); 125 + } 126 + 127 + return true; 128 + } 129 + 130 + static bool mshv_async_call_completion_isr(struct hv_message *msg) 131 + { 132 + bool handled = false; 133 + struct hv_async_completion_message_payload *async_msg; 134 + struct mshv_partition *partition; 135 + u64 partition_id; 136 + 137 + if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION) 138 + goto out; 139 + 140 + async_msg = 141 + (struct hv_async_completion_message_payload *)msg->u.payload; 142 + 143 + partition_id = async_msg->partition_id; 144 + 145 + /* 146 + * Hold this lock for the rest of the isr, because the partition could 147 + * be released anytime. 148 + * e.g. 
the MSHV_RUN_VP thread could wake on another cpu; it could 149 + * release the partition unless we hold this! 150 + */ 151 + rcu_read_lock(); 152 + 153 + partition = mshv_partition_find(partition_id); 154 + 155 + if (unlikely(!partition)) { 156 + pr_debug("failed to find partition %llu\n", partition_id); 157 + goto unlock_out; 158 + } 159 + 160 + partition->async_hypercall_status = async_msg->status; 161 + complete(&partition->async_hypercall); 162 + 163 + handled = true; 164 + 165 + unlock_out: 166 + rcu_read_unlock(); 167 + out: 168 + return handled; 169 + } 170 + 171 + static void kick_vp(struct mshv_vp *vp) 172 + { 173 + atomic64_inc(&vp->run.vp_signaled_count); 174 + vp->run.kicked_by_hv = 1; 175 + wake_up(&vp->run.vp_suspend_queue); 176 + } 177 + 178 + static void 179 + handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg) 180 + { 181 + int bank_idx, vps_signaled = 0, bank_mask_size; 182 + struct mshv_partition *partition; 183 + const struct hv_vpset *vpset; 184 + const u64 *bank_contents; 185 + u64 partition_id = msg->partition_id; 186 + 187 + if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) { 188 + pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K"); 189 + return; 190 + } 191 + 192 + if (msg->vp_count == 0) { 193 + pr_debug("scheduler message with no VP specified"); 194 + return; 195 + } 196 + 197 + rcu_read_lock(); 198 + 199 + partition = mshv_partition_find(partition_id); 200 + if (unlikely(!partition)) { 201 + pr_debug("failed to find partition %llu\n", partition_id); 202 + goto unlock_out; 203 + } 204 + 205 + vpset = &msg->vp_bitset.bitset; 206 + 207 + bank_idx = -1; 208 + bank_contents = vpset->bank_contents; 209 + bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE; 210 + 211 + while (true) { 212 + int vp_bank_idx = -1; 213 + int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE; 214 + int vp_index; 215 + 216 + bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask, 217 
+ bank_mask_size, bank_idx + 1); 218 + if (bank_idx == bank_mask_size) 219 + break; 220 + 221 + while (true) { 222 + struct mshv_vp *vp; 223 + 224 + vp_bank_idx = find_next_bit((unsigned long *)bank_contents, 225 + vp_bank_size, vp_bank_idx + 1); 226 + if (vp_bank_idx == vp_bank_size) 227 + break; 228 + 229 + vp_index = (bank_idx * vp_bank_size) + vp_bank_idx; 230 + 231 + /* This shouldn't happen, but just in case. */ 232 + if (unlikely(vp_index >= MSHV_MAX_VPS)) { 233 + pr_debug("VP index %u out of bounds\n", 234 + vp_index); 235 + goto unlock_out; 236 + } 237 + 238 + vp = partition->pt_vp_array[vp_index]; 239 + if (unlikely(!vp)) { 240 + pr_debug("failed to find VP %u\n", vp_index); 241 + goto unlock_out; 242 + } 243 + 244 + kick_vp(vp); 245 + vps_signaled++; 246 + } 247 + 248 + bank_contents++; 249 + } 250 + 251 + unlock_out: 252 + rcu_read_unlock(); 253 + 254 + if (vps_signaled != msg->vp_count) 255 + pr_debug("asked to signal %u VPs but only did %u\n", 256 + msg->vp_count, vps_signaled); 257 + } 258 + 259 + static void 260 + handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg) 261 + { 262 + struct mshv_partition *partition = NULL; 263 + struct mshv_vp *vp; 264 + int idx; 265 + 266 + rcu_read_lock(); 267 + 268 + for (idx = 0; idx < msg->vp_count; idx++) { 269 + u64 partition_id = msg->partition_ids[idx]; 270 + u32 vp_index = msg->vp_indexes[idx]; 271 + 272 + if (idx == 0 || partition->pt_id != partition_id) { 273 + partition = mshv_partition_find(partition_id); 274 + if (unlikely(!partition)) { 275 + pr_debug("failed to find partition %llu\n", 276 + partition_id); 277 + break; 278 + } 279 + } 280 + 281 + /* This shouldn't happen, but just in case. 
*/ 282 + if (unlikely(vp_index >= MSHV_MAX_VPS)) { 283 + pr_debug("VP index %u out of bounds\n", vp_index); 284 + break; 285 + } 286 + 287 + vp = partition->pt_vp_array[vp_index]; 288 + if (!vp) { 289 + pr_debug("failed to find VP %u\n", vp_index); 290 + break; 291 + } 292 + 293 + kick_vp(vp); 294 + } 295 + 296 + rcu_read_unlock(); 297 + } 298 + 299 + static bool 300 + mshv_scheduler_isr(struct hv_message *msg) 301 + { 302 + if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET && 303 + msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR) 304 + return false; 305 + 306 + if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET) 307 + handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *) 308 + msg->u.payload); 309 + else 310 + handle_pair_message((struct hv_vp_signal_pair_scheduler_message *) 311 + msg->u.payload); 312 + 313 + return true; 314 + } 315 + 316 + static bool 317 + mshv_intercept_isr(struct hv_message *msg) 318 + { 319 + struct mshv_partition *partition; 320 + bool handled = false; 321 + struct mshv_vp *vp; 322 + u64 partition_id; 323 + u32 vp_index; 324 + 325 + partition_id = msg->header.sender; 326 + 327 + rcu_read_lock(); 328 + 329 + partition = mshv_partition_find(partition_id); 330 + if (unlikely(!partition)) { 331 + pr_debug("failed to find partition %llu\n", 332 + partition_id); 333 + goto unlock_out; 334 + } 335 + 336 + if (msg->header.message_type == HVMSG_X64_APIC_EOI) { 337 + /* 338 + * Check if this gsi is registered in the 339 + * ack_notifier list and invoke the callback 340 + * if registered. 341 + */ 342 + 343 + /* 344 + * If there is a notifier, the ack callback is supposed 345 + * to handle the VMEXIT. So we need not pass this message 346 + * to vcpu thread. 
347 + */ 348 + struct hv_x64_apic_eoi_message *eoi_msg = 349 + (struct hv_x64_apic_eoi_message *)&msg->u.payload[0]; 350 + 351 + if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) { 352 + handled = true; 353 + goto unlock_out; 354 + } 355 + } 356 + 357 + /* 358 + * We should get an opaque intercept message here for all intercept 359 + * messages, since we're using the mapped VP intercept message page. 360 + * 361 + * The intercept message will have been placed in intercept message 362 + * page at this point. 363 + * 364 + * Make sure the message type matches our expectation. 365 + */ 366 + if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) { 367 + pr_debug("wrong message type %d", msg->header.message_type); 368 + goto unlock_out; 369 + } 370 + 371 + /* 372 + * Since we directly index the vp, and it has to exist for us to be here 373 + * (because the vp is only deleted when the partition is), no additional 374 + * locking is needed here 375 + */ 376 + vp_index = 377 + ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index; 378 + vp = partition->pt_vp_array[vp_index]; 379 + if (unlikely(!vp)) { 380 + pr_debug("failed to find VP %u\n", vp_index); 381 + goto unlock_out; 382 + } 383 + 384 + kick_vp(vp); 385 + 386 + handled = true; 387 + 388 + unlock_out: 389 + rcu_read_unlock(); 390 + 391 + return handled; 392 + } 393 + 394 + void mshv_isr(void) 395 + { 396 + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); 397 + struct hv_message_page **msg_page = &spages->synic_message_page; 398 + struct hv_message *msg; 399 + bool handled; 400 + 401 + if (unlikely(!(*msg_page))) { 402 + pr_debug("Missing synic page!\n"); 403 + return; 404 + } 405 + 406 + msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]); 407 + 408 + /* 409 + * If the type isn't set, there isn't really a message; 410 + * it may be some other hyperv interrupt 411 + */ 412 + if (msg->header.message_type == HVMSG_NONE) 413 + return; 414 + 415 + handled = 
mshv_doorbell_isr(msg); 416 + 417 + if (!handled) 418 + handled = mshv_scheduler_isr(msg); 419 + 420 + if (!handled) 421 + handled = mshv_async_call_completion_isr(msg); 422 + 423 + if (!handled) 424 + handled = mshv_intercept_isr(msg); 425 + 426 + if (handled) { 427 + /* 428 + * Acknowledge message with hypervisor if another message is 429 + * pending. 430 + */ 431 + msg->header.message_type = HVMSG_NONE; 432 + /* 433 + * Ensure the write is complete so the hypervisor will deliver 434 + * the next message if available. 435 + */ 436 + mb(); 437 + if (msg->header.message_flags.msg_pending) 438 + hv_set_non_nested_msr(HV_MSR_EOM, 0); 439 + 440 + #ifdef HYPERVISOR_CALLBACK_VECTOR 441 + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR); 442 + #endif 443 + } else { 444 + pr_warn_once("%s: unknown message type 0x%x\n", __func__, 445 + msg->header.message_type); 446 + } 447 + } 448 + 449 + int mshv_synic_init(unsigned int cpu) 450 + { 451 + union hv_synic_simp simp; 452 + union hv_synic_siefp siefp; 453 + union hv_synic_sirbp sirbp; 454 + #ifdef HYPERVISOR_CALLBACK_VECTOR 455 + union hv_synic_sint sint; 456 + #endif 457 + union hv_synic_scontrol sctrl; 458 + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); 459 + struct hv_message_page **msg_page = &spages->synic_message_page; 460 + struct hv_synic_event_flags_page **event_flags_page = 461 + &spages->synic_event_flags_page; 462 + struct hv_synic_event_ring_page **event_ring_page = 463 + &spages->synic_event_ring_page; 464 + 465 + /* Setup the Synic's message page */ 466 + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); 467 + simp.simp_enabled = true; 468 + *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, 469 + HV_HYP_PAGE_SIZE, 470 + MEMREMAP_WB); 471 + 472 + if (!(*msg_page)) 473 + return -EFAULT; 474 + 475 + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); 476 + 477 + /* Setup the Synic's event flags page */ 478 + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); 479 + 
siefp.siefp_enabled = true; 480 + *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT, 481 + PAGE_SIZE, MEMREMAP_WB); 482 + 483 + if (!(*event_flags_page)) 484 + goto cleanup; 485 + 486 + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); 487 + 488 + /* Setup the Synic's event ring page */ 489 + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); 490 + sirbp.sirbp_enabled = true; 491 + *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT, 492 + PAGE_SIZE, MEMREMAP_WB); 493 + 494 + if (!(*event_ring_page)) 495 + goto cleanup; 496 + 497 + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); 498 + 499 + #ifdef HYPERVISOR_CALLBACK_VECTOR 500 + /* Enable intercepts */ 501 + sint.as_uint64 = 0; 502 + sint.vector = HYPERVISOR_CALLBACK_VECTOR; 503 + sint.masked = false; 504 + sint.auto_eoi = hv_recommend_using_aeoi(); 505 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, 506 + sint.as_uint64); 507 + 508 + /* Doorbell SINT */ 509 + sint.as_uint64 = 0; 510 + sint.vector = HYPERVISOR_CALLBACK_VECTOR; 511 + sint.masked = false; 512 + sint.as_intercept = 1; 513 + sint.auto_eoi = hv_recommend_using_aeoi(); 514 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, 515 + sint.as_uint64); 516 + #endif 517 + 518 + /* Enable global synic bit */ 519 + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); 520 + sctrl.enable = 1; 521 + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); 522 + 523 + return 0; 524 + 525 + cleanup: 526 + if (*event_ring_page) { 527 + sirbp.sirbp_enabled = false; 528 + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); 529 + memunmap(*event_ring_page); 530 + } 531 + if (*event_flags_page) { 532 + siefp.siefp_enabled = false; 533 + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); 534 + memunmap(*event_flags_page); 535 + } 536 + if (*msg_page) { 537 + simp.simp_enabled = false; 538 + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); 539 + memunmap(*msg_page); 540 + } 541 + 
542 + return -EFAULT; 543 + } 544 + 545 + int mshv_synic_cleanup(unsigned int cpu) 546 + { 547 + union hv_synic_sint sint; 548 + union hv_synic_simp simp; 549 + union hv_synic_siefp siefp; 550 + union hv_synic_sirbp sirbp; 551 + union hv_synic_scontrol sctrl; 552 + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); 553 + struct hv_message_page **msg_page = &spages->synic_message_page; 554 + struct hv_synic_event_flags_page **event_flags_page = 555 + &spages->synic_event_flags_page; 556 + struct hv_synic_event_ring_page **event_ring_page = 557 + &spages->synic_event_ring_page; 558 + 559 + /* Disable the interrupt */ 560 + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX); 561 + sint.masked = true; 562 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, 563 + sint.as_uint64); 564 + 565 + /* Disable Doorbell SINT */ 566 + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX); 567 + sint.masked = true; 568 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, 569 + sint.as_uint64); 570 + 571 + /* Disable Synic's event ring page */ 572 + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); 573 + sirbp.sirbp_enabled = false; 574 + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); 575 + memunmap(*event_ring_page); 576 + 577 + /* Disable Synic's event flags page */ 578 + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); 579 + siefp.siefp_enabled = false; 580 + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); 581 + memunmap(*event_flags_page); 582 + 583 + /* Disable Synic's message page */ 584 + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); 585 + simp.simp_enabled = false; 586 + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); 587 + memunmap(*msg_page); 588 + 589 + /* Disable global synic bit */ 590 + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); 591 + sctrl.enable = 0; 592 + hv_set_non_nested_msr(HV_MSR_SCONTROL, 
sctrl.as_uint64); 593 + 594 + return 0; 595 + } 596 + 597 + int 598 + mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data, 599 + u64 gpa, u64 val, u64 flags) 600 + { 601 + struct hv_connection_info connection_info = { 0 }; 602 + union hv_connection_id connection_id = { 0 }; 603 + struct port_table_info *port_table_info; 604 + struct hv_port_info port_info = { 0 }; 605 + union hv_port_id port_id = { 0 }; 606 + int ret; 607 + 608 + port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL); 609 + if (!port_table_info) 610 + return -ENOMEM; 611 + 612 + port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL; 613 + port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb; 614 + port_table_info->hv_port_doorbell.data = data; 615 + ret = mshv_portid_alloc(port_table_info); 616 + if (ret < 0) { 617 + kfree(port_table_info); 618 + return ret; 619 + } 620 + 621 + port_id.u.id = ret; 622 + port_info.port_type = HV_PORT_TYPE_DOORBELL; 623 + port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX; 624 + port_info.doorbell_port_info.target_vp = HV_ANY_VP; 625 + ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id, 626 + &port_info, 627 + 0, 0, NUMA_NO_NODE); 628 + 629 + if (ret < 0) { 630 + mshv_portid_free(port_id.u.id); 631 + return ret; 632 + } 633 + 634 + connection_id.u.id = port_id.u.id; 635 + connection_info.port_type = HV_PORT_TYPE_DOORBELL; 636 + connection_info.doorbell_connection_info.gpa = gpa; 637 + connection_info.doorbell_connection_info.trigger_value = val; 638 + connection_info.doorbell_connection_info.flags = flags; 639 + 640 + ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id, 641 + connection_id, &connection_info, 0, NUMA_NO_NODE); 642 + if (ret < 0) { 643 + hv_call_delete_port(hv_current_partition_id, port_id); 644 + mshv_portid_free(port_id.u.id); 645 + return ret; 646 + } 647 + 648 + // lets use the port_id as the doorbell_id 649 + return port_id.u.id; 650 + } 651 
+ 652 + void 653 + mshv_unregister_doorbell(u64 partition_id, int doorbell_portid) 654 + { 655 + union hv_port_id port_id = { 0 }; 656 + union hv_connection_id connection_id = { 0 }; 657 + 658 + connection_id.u.id = doorbell_portid; 659 + hv_call_disconnect_port(partition_id, connection_id); 660 + 661 + port_id.u.id = doorbell_portid; 662 + hv_call_delete_port(hv_current_partition_id, port_id); 663 + 664 + mshv_portid_free(doorbell_portid); 665 + }
+32 -22
drivers/hv/vmbus_drv.c
··· 1611 1611 { 1612 1612 return sprintf(buf, "%u\n", channel->target_cpu); 1613 1613 } 1614 - static ssize_t target_cpu_store(struct vmbus_channel *channel, 1615 - const char *buf, size_t count) 1614 + 1615 + int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu) 1616 1616 { 1617 - u32 target_cpu, origin_cpu; 1618 - ssize_t ret = count; 1617 + u32 origin_cpu; 1618 + int ret = 0; 1619 + 1620 + lockdep_assert_cpus_held(); 1621 + lockdep_assert_held(&vmbus_connection.channel_mutex); 1619 1622 1620 1623 if (vmbus_proto_version < VERSION_WIN10_V4_1) 1621 - return -EIO; 1622 - 1623 - if (sscanf(buf, "%uu", &target_cpu) != 1) 1624 1624 return -EIO; 1625 1625 1626 1626 /* Validate target_cpu for the cpumask_test_cpu() operation below. */ ··· 1630 1630 if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) 1631 1631 return -EINVAL; 1632 1632 1633 - /* No CPUs should come up or down during this. */ 1634 - cpus_read_lock(); 1635 - 1636 - if (!cpu_online(target_cpu)) { 1637 - cpus_read_unlock(); 1633 + if (!cpu_online(target_cpu)) 1638 1634 return -EINVAL; 1639 - } 1640 1635 1641 1636 /* 1642 - * Synchronizes target_cpu_store() and channel closure: 1637 + * Synchronizes vmbus_channel_set_cpu() and channel closure: 1643 1638 * 1644 1639 * { Initially: state = CHANNEL_OPENED } 1645 1640 * 1646 1641 * CPU1 CPU2 1647 1642 * 1648 - * [target_cpu_store()] [vmbus_disconnect_ring()] 1643 + * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()] 1649 1644 * 1650 1645 * LOCK channel_mutex LOCK channel_mutex 1651 1646 * LOAD r1 = state LOAD r2 = state ··· 1655 1660 * Note. The host processes the channel messages "sequentially", in 1656 1661 * the order in which they are received on a per-partition basis. 
1657 1662 */ 1658 - mutex_lock(&vmbus_connection.channel_mutex); 1659 1663 1660 1664 /* 1661 1665 * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; ··· 1662 1668 */ 1663 1669 if (channel->state != CHANNEL_OPENED_STATE) { 1664 1670 ret = -EIO; 1665 - goto cpu_store_unlock; 1671 + goto end; 1666 1672 } 1667 1673 1668 1674 origin_cpu = channel->target_cpu; 1669 1675 if (target_cpu == origin_cpu) 1670 - goto cpu_store_unlock; 1676 + goto end; 1671 1677 1672 1678 if (vmbus_send_modifychannel(channel, 1673 1679 hv_cpu_number_to_vp_number(target_cpu))) { 1674 1680 ret = -EIO; 1675 - goto cpu_store_unlock; 1681 + goto end; 1676 1682 } 1677 1683 1678 1684 /* ··· 1702 1708 origin_cpu, target_cpu); 1703 1709 } 1704 1710 1705 - cpu_store_unlock: 1711 + end: 1712 + return ret; 1713 + } 1714 + 1715 + static ssize_t target_cpu_store(struct vmbus_channel *channel, 1716 + const char *buf, size_t count) 1717 + { 1718 + u32 target_cpu; 1719 + ssize_t ret; 1720 + 1721 + if (sscanf(buf, "%uu", &target_cpu) != 1) 1722 + return -EIO; 1723 + 1724 + cpus_read_lock(); 1725 + mutex_lock(&vmbus_connection.channel_mutex); 1726 + ret = vmbus_channel_set_cpu(channel, target_cpu); 1706 1727 mutex_unlock(&vmbus_connection.channel_mutex); 1707 1728 cpus_read_unlock(); 1708 - return ret; 1729 + 1730 + return ret ?: count; 1709 1731 } 1710 1732 static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); 1711 1733 ··· 2669 2659 if (!hv_is_hyperv_initialized()) 2670 2660 return -ENODEV; 2671 2661 2672 - if (hv_root_partition && !hv_nested) 2662 + if (hv_root_partition() && !hv_nested) 2673 2663 return 0; 2674 2664 2675 2665 /*
+4 -4
drivers/iommu/hyperv-iommu.c
··· 130 130 x86_init.hyper.msi_ext_dest_id()) 131 131 return -ENODEV; 132 132 133 - if (hv_root_partition) { 133 + if (hv_root_partition()) { 134 134 name = "HYPERV-ROOT-IR"; 135 135 ops = &hyperv_root_ir_domain_ops; 136 136 } else { ··· 151 151 return -ENOMEM; 152 152 } 153 153 154 - if (hv_root_partition) 154 + if (hv_root_partition()) 155 155 return 0; /* The rest is only relevant to guests */ 156 156 157 157 /* ··· 217 217 status = hv_unmap_ioapic_interrupt(ioapic_id, &entry); 218 218 219 219 if (status != HV_STATUS_SUCCESS) 220 - pr_debug("%s: unexpected unmap status %lld\n", __func__, status); 220 + hv_status_debug(status, "failed to unmap\n"); 221 221 222 222 data->entry.ioapic_rte.as_uint64 = 0; 223 223 data->entry.source = 0; /* Invalid source */ ··· 228 228 vector, &entry); 229 229 230 230 if (status != HV_STATUS_SUCCESS) { 231 - pr_err("%s: map hypercall failed, status %lld\n", __func__, status); 231 + hv_status_err(status, "map failed\n"); 232 232 return; 233 233 } 234 234
+68 -4
include/asm-generic/mshyperv.h
··· 28 28 29 29 #define VTPM_BASE_ADDRESS 0xfed40000 30 30 31 + enum hv_partition_type { 32 + HV_PARTITION_TYPE_GUEST, 33 + HV_PARTITION_TYPE_ROOT, 34 + }; 35 + 31 36 struct ms_hyperv_info { 32 37 u32 features; 33 38 u32 priv_high; 39 + u32 ext_features; 34 40 u32 misc_features; 35 41 u32 hints; 36 42 u32 nested_features; ··· 64 58 }; 65 59 extern struct ms_hyperv_info ms_hyperv; 66 60 extern bool hv_nested; 61 + extern u64 hv_current_partition_id; 62 + extern enum hv_partition_type hv_curr_partition_type; 67 63 68 64 extern void * __percpu *hyperv_pcpu_input_arg; 69 65 extern void * __percpu *hyperv_pcpu_output_arg; 70 66 71 - extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); 72 - extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); 67 + u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); 68 + u64 hv_do_fast_hypercall8(u16 control, u64 input8); 69 + u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2); 70 + 73 71 bool hv_isolation_type_snp(void); 74 72 bool hv_isolation_type_tdx(void); 73 + 74 + /* 75 + * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), 76 + * it doesn't provide a recommendation flag and AEOI must be disabled. 
77 + */ 78 + static inline bool hv_recommend_using_aeoi(void) 79 + { 80 + #ifdef HV_DEPRECATING_AEOI_RECOMMENDED 81 + return !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); 82 + #else 83 + return false; 84 + #endif 85 + } 75 86 76 87 static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) 77 88 { ··· 208 185 void hv_remove_kexec_handler(void); 209 186 void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); 210 187 void hv_remove_crash_handler(void); 188 + void hv_setup_mshv_handler(void (*handler)(void)); 211 189 212 190 extern int vmbus_interrupt; 213 191 extern int vmbus_irq; 214 - 215 - extern bool hv_root_partition; 216 192 217 193 #if IS_ENABLED(CONFIG_HYPERV) 218 194 /* ··· 229 207 #define VP_INVAL U32_MAX 230 208 231 209 int __init hv_common_init(void); 210 + void __init hv_get_partition_id(void); 232 211 void __init hv_common_free(void); 233 212 void __init ms_hyperv_late_init(void); 234 213 int hv_common_cpu_init(unsigned int cpu); 235 214 int hv_common_cpu_die(unsigned int cpu); 215 + void hv_identify_partition_type(void); 236 216 237 217 void *hv_alloc_hyperv_page(void); 238 218 void *hv_alloc_hyperv_zeroed_page(void); ··· 315 291 return __cpumask_to_vpset(vpset, cpus, func); 316 292 } 317 293 294 + #define _hv_status_fmt(fmt) "%s: Hyper-V status: %#x = %s: " fmt 295 + #define hv_status_printk(level, status, fmt, ...) \ 296 + do { \ 297 + u64 __status = (status); \ 298 + pr_##level(_hv_status_fmt(fmt), __func__, hv_result(__status), \ 299 + hv_result_to_string(__status), ##__VA_ARGS__); \ 300 + } while (0) 301 + #define hv_status_err(status, fmt, ...) \ 302 + hv_status_printk(err, status, fmt, ##__VA_ARGS__) 303 + #define hv_status_debug(status, fmt, ...) 
\ 304 + hv_status_printk(debug, status, fmt, ##__VA_ARGS__) 305 + 306 + const char *hv_result_to_string(u64 hv_status); 307 + int hv_result_to_errno(u64 status); 318 308 void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die); 319 309 bool hv_is_hyperv_initialized(void); 320 310 bool hv_is_hibernation_supported(void); ··· 341 303 bool hv_query_ext_cap(u64 cap_query); 342 304 void hv_setup_dma_ops(struct device *dev, bool coherent); 343 305 #else /* CONFIG_HYPERV */ 306 + static inline void hv_identify_partition_type(void) {} 344 307 static inline bool hv_is_hyperv_initialized(void) { return false; } 345 308 static inline bool hv_is_hibernation_supported(void) { return false; } 346 309 static inline void hyperv_cleanup(void) {} ··· 352 313 return HV_ISOLATION_TYPE_NONE; 353 314 } 354 315 #endif /* CONFIG_HYPERV */ 316 + 317 + #if IS_ENABLED(CONFIG_MSHV_ROOT) 318 + static inline bool hv_root_partition(void) 319 + { 320 + return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT; 321 + } 322 + int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); 323 + int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); 324 + int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); 325 + 326 + #else /* CONFIG_MSHV_ROOT */ 327 + static inline bool hv_root_partition(void) { return false; } 328 + static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) 329 + { 330 + return -EOPNOTSUPP; 331 + } 332 + static inline int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id) 333 + { 334 + return -EOPNOTSUPP; 335 + } 336 + static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) 337 + { 338 + return -EOPNOTSUPP; 339 + } 340 + #endif /* CONFIG_MSHV_ROOT */ 355 341 356 342 #endif
+76 -7
include/hyperv/hvgdk_mini.h
··· 13 13 u64 high_part; 14 14 } __packed; 15 15 16 - /* NOTE: when adding below, update hv_status_to_string() */ 16 + /* NOTE: when adding below, update hv_result_to_string() */ 17 17 #define HV_STATUS_SUCCESS 0x0 18 18 #define HV_STATUS_INVALID_HYPERCALL_CODE 0x2 19 19 #define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3 ··· 51 51 #define HV_HYP_PAGE_SHIFT 12 52 52 #define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) 53 53 #define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) 54 + #define HV_HYP_LARGE_PAGE_SHIFT 21 54 55 55 56 #define HV_PARTITION_ID_INVALID ((u64)0) 56 57 #define HV_PARTITION_ID_SELF ((u64)-1) ··· 183 182 184 183 #endif /* CONFIG_X86 */ 185 184 186 - struct hv_get_partition_id { /* HV_OUTPUT_GET_PARTITION_ID */ 185 + struct hv_output_get_partition_id { 187 186 u64 partition_id; 188 187 } __packed; 189 188 ··· 205 204 /* The number of vCPUs in one sparse bank */ 206 205 #define HV_VCPUS_PER_SPARSE_BANK (64) 207 206 208 - /* Some of Hyper-V structs do not use hv_vpset where linux uses them */ 207 + /* 208 + * Some of Hyper-V structs do not use hv_vpset where linux uses them. 209 + * 210 + * struct hv_vpset is usually used as part of hypercall input. The portion 211 + * that counts as "fixed size input header" vs. "variable size input header" 212 + * varies per hypercall. See comments at relevant hypercall call sites as to 213 + * how the "valid_bank_mask" field should be accounted. 214 + */ 209 215 struct hv_vpset { /* HV_VP_SET */ 210 216 u64 format; 211 217 u64 valid_bank_mask; ··· 382 374 #define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) 383 375 #define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) 384 376 377 + /* HYPERV_CPUID_FEATURES.ECX bits. 
*/ 378 + #define HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE BIT(9) 379 + #define HV_VP_GHCB_ROOT_MAPPING_AVAILABLE BIT(10) 380 + 385 381 enum hv_isolation_type { 386 382 HV_ISOLATION_TYPE_NONE = 0, /* HV_PARTITION_ISOLATION_TYPE_NONE */ 387 383 HV_ISOLATION_TYPE_VBS = 1, ··· 448 436 #define HVCALL_WITHDRAW_MEMORY 0x0049 449 437 #define HVCALL_MAP_GPA_PAGES 0x004b 450 438 #define HVCALL_UNMAP_GPA_PAGES 0x004c 439 + #define HVCALL_INSTALL_INTERCEPT 0x004d 451 440 #define HVCALL_CREATE_VP 0x004e 452 441 #define HVCALL_DELETE_VP 0x004f 453 442 #define HVCALL_GET_VP_REGISTERS 0x0050 454 443 #define HVCALL_SET_VP_REGISTERS 0x0051 444 + #define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 445 + #define HVCALL_CLEAR_VIRTUAL_INTERRUPT 0x0056 455 446 #define HVCALL_DELETE_PORT 0x0058 456 447 #define HVCALL_DISCONNECT_PORT 0x005b 457 448 #define HVCALL_POST_MESSAGE 0x005c ··· 462 447 #define HVCALL_POST_DEBUG_DATA 0x0069 463 448 #define HVCALL_RETRIEVE_DEBUG_DATA 0x006a 464 449 #define HVCALL_RESET_DEBUG_SESSION 0x006b 450 + #define HVCALL_MAP_STATS_PAGE 0x006c 451 + #define HVCALL_UNMAP_STATS_PAGE 0x006d 465 452 #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 466 453 #define HVCALL_GET_SYSTEM_PROPERTY 0x007b 467 454 #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c 468 455 #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d 469 456 #define HVCALL_RETARGET_INTERRUPT 0x007e 470 457 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b 458 + #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 471 459 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 472 460 #define HVCALL_CREATE_PORT 0x0095 473 461 #define HVCALL_CONNECT_PORT 0x0096 ··· 478 460 #define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a 479 461 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af 480 462 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 463 + #define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0 464 + #define HVCALL_POST_MESSAGE_DIRECT 0x00c1 481 465 #define HVCALL_DISPATCH_VP 0x00c2 466 + #define HVCALL_GET_GPA_PAGES_ACCESS_STATES 0x00c9 467 + 
#define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d7 468 + #define HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d8 482 469 #define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db 483 470 #define HVCALL_MAP_VP_STATE_PAGE 0x00e1 484 471 #define HVCALL_UNMAP_VP_STATE_PAGE 0x00e2 485 472 #define HVCALL_GET_VP_STATE 0x00e3 486 473 #define HVCALL_SET_VP_STATE 0x00e4 474 + #define HVCALL_GET_VP_CPUID_VALUES 0x00f4 487 475 #define HVCALL_MMIO_READ 0x0106 488 476 #define HVCALL_MMIO_WRITE 0x0107 489 477 ··· 799 775 800 776 /* Define timer message payload structure. */ 801 777 struct hv_timer_message_payload { 802 - __u32 timer_index; 803 - __u32 reserved; 804 - __u64 expiration_time; /* When the timer expired */ 805 - __u64 delivery_time; /* When the message was delivered */ 778 + u32 timer_index; 779 + u32 reserved; 780 + u64 expiration_time; /* When the timer expired */ 781 + u64 delivery_time; /* When the message was delivered */ 806 782 } __packed; 807 783 808 784 struct hv_x64_segment_register { ··· 830 806 u16 limit; 831 807 u64 base; 832 808 } __packed; 809 + 810 + #define HV_NORMAL_VTL 0 833 811 834 812 union hv_input_vtl { 835 813 u8 as_uint8; ··· 1350 1324 u64 reserved2; 1351 1325 struct hv_device_interrupt_target int_target; 1352 1326 } __packed __aligned(8); 1327 + 1328 + enum hv_intercept_type { 1329 + #if defined(CONFIG_X86) 1330 + HV_INTERCEPT_TYPE_X64_IO_PORT = 0x00000000, 1331 + HV_INTERCEPT_TYPE_X64_MSR = 0x00000001, 1332 + HV_INTERCEPT_TYPE_X64_CPUID = 0x00000002, 1333 + #endif 1334 + HV_INTERCEPT_TYPE_EXCEPTION = 0x00000003, 1335 + /* Used to be HV_INTERCEPT_TYPE_REGISTER */ 1336 + HV_INTERCEPT_TYPE_RESERVED0 = 0x00000004, 1337 + HV_INTERCEPT_TYPE_MMIO = 0x00000005, 1338 + #if defined(CONFIG_X86) 1339 + HV_INTERCEPT_TYPE_X64_GLOBAL_CPUID = 0x00000006, 1340 + HV_INTERCEPT_TYPE_X64_APIC_SMI = 0x00000007, 1341 + #endif 1342 + HV_INTERCEPT_TYPE_HYPERCALL = 0x00000008, 1343 + #if defined(CONFIG_X86) 1344 + HV_INTERCEPT_TYPE_X64_APIC_INIT_SIPI = 
0x00000009, 1345 + HV_INTERCEPT_MC_UPDATE_PATCH_LEVEL_MSR_READ = 0x0000000A, 1346 + HV_INTERCEPT_TYPE_X64_APIC_WRITE = 0x0000000B, 1347 + HV_INTERCEPT_TYPE_X64_MSR_INDEX = 0x0000000C, 1348 + #endif 1349 + HV_INTERCEPT_TYPE_MAX, 1350 + HV_INTERCEPT_TYPE_INVALID = 0xFFFFFFFF, 1351 + }; 1352 + 1353 + union hv_intercept_parameters { 1354 + /* HV_INTERCEPT_PARAMETERS is defined to be an 8-byte field. */ 1355 + u64 as_uint64; 1356 + #if defined(CONFIG_X86) 1357 + /* HV_INTERCEPT_TYPE_X64_IO_PORT */ 1358 + u16 io_port; 1359 + /* HV_INTERCEPT_TYPE_X64_CPUID */ 1360 + u32 cpuid_index; 1361 + /* HV_INTERCEPT_TYPE_X64_APIC_WRITE */ 1362 + u32 apic_write_mask; 1363 + /* HV_INTERCEPT_TYPE_EXCEPTION */ 1364 + u16 exception_vector; 1365 + /* HV_INTERCEPT_TYPE_X64_MSR_INDEX */ 1366 + u32 msr_index; 1367 + #endif 1368 + /* N.B. Other intercept types do not have any parameters. */ 1369 + }; 1353 1370 1354 1371 /* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */ 1355 1372 #define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64
+126 -6
include/hyperv/hvhdk.h
··· 19 19 20 20 #define HV_VP_REGISTER_PAGE_VERSION_1 1u 21 21 22 + #define HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT 7 23 + 24 + union hv_vp_register_page_interrupt_vectors { 25 + u64 as_uint64; 26 + struct { 27 + u8 vector_count; 28 + u8 vector[HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT]; 29 + } __packed; 30 + }; 31 + 22 32 struct hv_vp_register_page { 23 33 u16 version; 24 34 u8 isvalid; 25 35 u8 rsvdz; 26 36 u32 dirty; 37 + 38 + #if IS_ENABLED(CONFIG_X86) 39 + 27 40 union { 28 41 struct { 29 42 /* General purpose registers ··· 108 95 union hv_x64_pending_interruption_register pending_interruption; 109 96 union hv_x64_interrupt_state_register interrupt_state; 110 97 u64 instruction_emulation_hints; 98 + u64 xfem; 99 + 100 + /* 101 + * Fields from this point are not included in the register page save chunk. 102 + * The reserved field is intended to maintain alignment for unsaved fields. 103 + */ 104 + u8 reserved1[0x100]; 105 + 106 + /* 107 + * Interrupts injected as part of HvCallDispatchVp. 108 + */ 109 + union hv_vp_register_page_interrupt_vectors interrupt_vectors; 110 + 111 + #elif IS_ENABLED(CONFIG_ARM64) 112 + /* Not yet supported in ARM */ 113 + #endif 111 114 } __packed; 112 115 113 116 #define HV_PARTITION_PROCESSOR_FEATURES_BANKS 2 ··· 328 299 #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 329 300 330 301 /* Note: Exo partition is enabled by default */ 331 - #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) 332 - #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) 333 - #define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) 334 - #define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) 302 + #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) 303 + #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) 304 + #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) 305 + #define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) 306 + #define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) 335 
307 336 308 struct hv_input_create_partition { 337 309 u64 flags; ··· 379 349 enum hv_vp_state_page_type { 380 350 HV_VP_STATE_PAGE_REGISTERS = 0, 381 351 HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, 352 + HV_VP_STATE_PAGE_GHCB = 2, 382 353 HV_VP_STATE_PAGE_COUNT 383 354 }; 384 355 385 356 struct hv_input_map_vp_state_page { 386 357 u64 partition_id; 387 358 u32 vp_index; 388 - u32 type; /* enum hv_vp_state_page_type */ 359 + u16 type; /* enum hv_vp_state_page_type */ 360 + union hv_input_vtl input_vtl; 361 + union { 362 + u8 as_uint8; 363 + struct { 364 + u8 map_location_provided : 1; 365 + u8 reserved : 7; 366 + }; 367 + } flags; 368 + u64 requested_map_location; 389 369 } __packed; 390 370 391 371 struct hv_output_map_vp_state_page { ··· 405 365 struct hv_input_unmap_vp_state_page { 406 366 u64 partition_id; 407 367 u32 vp_index; 408 - u32 type; /* enum hv_vp_state_page_type */ 368 + u16 type; /* enum hv_vp_state_page_type */ 369 + union hv_input_vtl input_vtl; 370 + u8 reserved0; 371 + } __packed; 372 + 373 + struct hv_x64_apic_eoi_message { 374 + u32 vp_index; 375 + u32 interrupt_vector; 409 376 } __packed; 410 377 411 378 struct hv_opaque_intercept_message { ··· 562 515 u64 reserved[5]; 563 516 } __packed; 564 517 518 + struct hv_async_completion_message_payload { 519 + u64 partition_id; 520 + u32 status; 521 + u32 completion_count; 522 + u64 sub_status; 523 + } __packed; 524 + 565 525 union hv_input_delete_vp { 566 526 u64 as_uint64[2]; 567 527 struct { ··· 703 649 union hv_input_set_vp_state_data data[]; 704 650 } __packed; 705 651 652 + union hv_x64_vp_execution_state { 653 + u16 as_uint16; 654 + struct { 655 + u16 cpl:2; 656 + u16 cr0_pe:1; 657 + u16 cr0_am:1; 658 + u16 efer_lma:1; 659 + u16 debug_active:1; 660 + u16 interruption_pending:1; 661 + u16 vtl:4; 662 + u16 enclave_mode:1; 663 + u16 interrupt_shadow:1; 664 + u16 virtualization_fault_active:1; 665 + u16 reserved:2; 666 + } __packed; 667 + }; 668 + 669 + struct hv_x64_intercept_message_header { 670 + 
u32 vp_index; 671 + u8 instruction_length:4; 672 + u8 cr8:4; /* Only set for exo partitions */ 673 + u8 intercept_access_type; 674 + union hv_x64_vp_execution_state execution_state; 675 + struct hv_x64_segment_register cs_segment; 676 + u64 rip; 677 + u64 rflags; 678 + } __packed; 679 + 680 + union hv_x64_memory_access_info { 681 + u8 as_uint8; 682 + struct { 683 + u8 gva_valid:1; 684 + u8 gva_gpa_valid:1; 685 + u8 hypercall_output_pending:1; 686 + u8 tlb_locked_no_overlay:1; 687 + u8 reserved:4; 688 + } __packed; 689 + }; 690 + 691 + struct hv_x64_memory_intercept_message { 692 + struct hv_x64_intercept_message_header header; 693 + u32 cache_type; /* enum hv_cache_type */ 694 + u8 instruction_byte_count; 695 + union hv_x64_memory_access_info memory_access_info; 696 + u8 tpr_priority; 697 + u8 reserved1; 698 + u64 guest_virtual_address; 699 + u64 guest_physical_address; 700 + u8 instruction_bytes[16]; 701 + } __packed; 702 + 706 703 /* 707 704 * Dispatch state for the VP communicated by the hypervisor to the 708 705 * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP. 
··· 821 716 #define HV_DISPATCH_VP_FLAG_SKIP_VP_SPEC_FLUSH 0x8 822 717 #define HV_DISPATCH_VP_FLAG_SKIP_CALLER_SPEC_FLUSH 0x10 823 718 #define HV_DISPATCH_VP_FLAG_SKIP_CALLER_USER_SPEC_FLUSH 0x20 719 + #define HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION 0x40 824 720 825 721 struct hv_input_dispatch_vp { 826 722 u64 partition_id; ··· 835 729 u32 dispatch_state; /* enum hv_vp_dispatch_state */ 836 730 u32 dispatch_event; /* enum hv_vp_dispatch_event */ 837 731 } __packed; 732 + 733 + struct hv_input_modify_sparse_spa_page_host_access { 734 + u32 host_access : 2; 735 + u32 reserved : 30; 736 + u32 flags; 737 + u64 partition_id; 738 + u64 spa_page_list[]; 739 + } __packed; 740 + 741 + /* hv_input_modify_sparse_spa_page_host_access flags */ 742 + #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE 0x1 743 + #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED 0x2 744 + #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE 0x4 745 + #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE 0x8 838 746 839 747 #endif /* _HV_HVHDK_H */
+91
include/hyperv/hvhdk_mini.h
··· 36 36 HV_SCHEDULER_TYPE_MAX 37 37 }; 38 38 39 + /* HV_STATS_AREA_TYPE */ 40 + enum hv_stats_area_type { 41 + HV_STATS_AREA_SELF = 0, 42 + HV_STATS_AREA_PARENT = 1, 43 + HV_STATS_AREA_INTERNAL = 2, 44 + HV_STATS_AREA_COUNT 45 + }; 46 + 47 + enum hv_stats_object_type { 48 + HV_STATS_OBJECT_HYPERVISOR = 0x00000001, 49 + HV_STATS_OBJECT_LOGICAL_PROCESSOR = 0x00000002, 50 + HV_STATS_OBJECT_PARTITION = 0x00010001, 51 + HV_STATS_OBJECT_VP = 0x00010002 52 + }; 53 + 54 + union hv_stats_object_identity { 55 + /* hv_stats_hypervisor */ 56 + struct { 57 + u8 reserved[15]; 58 + u8 stats_area_type; 59 + } __packed hv; 60 + 61 + /* hv_stats_logical_processor */ 62 + struct { 63 + u32 lp_index; 64 + u8 reserved[11]; 65 + u8 stats_area_type; 66 + } __packed lp; 67 + 68 + /* hv_stats_partition */ 69 + struct { 70 + u64 partition_id; 71 + u8 reserved[7]; 72 + u8 stats_area_type; 73 + } __packed partition; 74 + 75 + /* hv_stats_vp */ 76 + struct { 77 + u64 partition_id; 78 + u32 vp_index; 79 + u16 flags; 80 + u8 reserved; 81 + u8 stats_area_type; 82 + } __packed vp; 83 + }; 84 + 39 85 enum hv_partition_property_code { 40 86 /* Privilege properties */ 41 87 HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000, ··· 93 47 94 48 /* Compatibility properties */ 95 49 HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002, 50 + HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, 96 51 HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, 97 52 HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, 53 + }; 54 + 55 + enum hv_snp_status { 56 + HV_SNP_STATUS_NONE = 0, 57 + HV_SNP_STATUS_AVAILABLE = 1, 58 + HV_SNP_STATUS_INCOMPATIBLE = 2, 59 + HV_SNP_STATUS_PSP_UNAVAILABLE = 3, 60 + HV_SNP_STATUS_PSP_INIT_FAILED = 4, 61 + HV_SNP_STATUS_PSP_BAD_FW_VERSION = 5, 62 + HV_SNP_STATUS_BAD_CONFIGURATION = 6, 63 + HV_SNP_STATUS_PSP_FW_UPDATE_IN_PROGRESS = 7, 64 + HV_SNP_STATUS_PSP_RB_INIT_FAILED = 8, 65 + HV_SNP_STATUS_PSP_PLATFORM_STATUS_FAILED = 9, 66 + HV_SNP_STATUS_PSP_INIT_LATE_FAILED = 
10, 98 67 }; 99 68 100 69 enum hv_system_property { 101 70 /* Add more values when needed */ 102 71 HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, 72 + HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, 73 + }; 74 + 75 + enum hv_dynamic_processor_feature_property { 76 + /* Add more values when needed */ 77 + HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13, 78 + HV_X64_DYNAMIC_PROCESSOR_FEATURE_SNP_STATUS = 16, 103 79 }; 104 80 105 81 struct hv_input_get_system_property { 106 82 u32 property_id; /* enum hv_system_property */ 107 83 union { 108 84 u32 as_uint32; 85 + #if IS_ENABLED(CONFIG_X86) 86 + /* enum hv_dynamic_processor_feature_property */ 87 + u32 hv_processor_feature; 88 + #endif 109 89 /* More fields to be filled in when needed */ 110 90 }; 111 91 } __packed; ··· 139 67 struct hv_output_get_system_property { 140 68 union { 141 69 u32 scheduler_type; /* enum hv_scheduler_type */ 70 + #if IS_ENABLED(CONFIG_X86) 71 + u64 hv_processor_feature_value; 72 + #endif 142 73 }; 74 + } __packed; 75 + 76 + struct hv_input_map_stats_page { 77 + u32 type; /* enum hv_stats_object_type */ 78 + u32 padding; 79 + union hv_stats_object_identity identity; 80 + } __packed; 81 + 82 + struct hv_output_map_stats_page { 83 + u64 map_location; 84 + } __packed; 85 + 86 + struct hv_input_unmap_stats_page { 87 + u32 type; /* enum hv_stats_object_type */ 88 + u32 padding; 89 + union hv_stats_object_identity identity; 143 90 } __packed; 144 91 145 92 struct hv_proximity_domain_flags {
+1 -56
include/linux/hyperv.h
··· 371 371 struct vmtransfer_page_range ranges[]; 372 372 } __packed; 373 373 374 - struct vmgpadl_packet_header { 375 - struct vmpacket_descriptor d; 376 - u32 gpadl; 377 - u32 reserved; 378 - } __packed; 379 - 380 - struct vmadd_remove_transfer_page_set { 381 - struct vmpacket_descriptor d; 382 - u32 gpadl; 383 - u16 xfer_pageset_id; 384 - u16 reserved; 385 - } __packed; 386 - 387 374 /* 388 375 * This structure defines a range in guest physical space that can be made to 389 376 * look virtually contiguous. ··· 382 395 }; 383 396 384 397 /* 385 - * This is the format for an Establish Gpadl packet, which contains a handle by 386 - * which this GPADL will be known and a set of GPA ranges associated with it. 387 - * This can be converted to a MDL by the guest OS. If there are multiple GPA 388 - * ranges, then the resulting MDL will be "chained," representing multiple VA 389 - * ranges. 390 - */ 391 - struct vmestablish_gpadl { 392 - struct vmpacket_descriptor d; 393 - u32 gpadl; 394 - u32 range_cnt; 395 - struct gpa_range range[1]; 396 - } __packed; 397 - 398 - /* 399 - * This is the format for a Teardown Gpadl packet, which indicates that the 400 - * GPADL handle in the Establish Gpadl packet will never be referenced again. 401 - */ 402 - struct vmteardown_gpadl { 403 - struct vmpacket_descriptor d; 404 - u32 gpadl; 405 - u32 reserved; /* for alignment to a 8-byte boundary */ 406 - } __packed; 407 - 408 - /* 409 398 * This is the format for a GPA-Direct packet, which contains a set of GPA 410 399 * ranges, in addition to commands and/or data. 411 400 */ ··· 391 428 u32 range_cnt; 392 429 struct gpa_range range[1]; 393 430 } __packed; 394 - 395 - /* This is the format for a Additional Data Packet. 
*/ 396 - struct vmadditional_data { 397 - struct vmpacket_descriptor d; 398 - u64 total_bytes; 399 - u32 offset; 400 - u32 byte_cnt; 401 - unsigned char data[1]; 402 - } __packed; 403 - 404 - union vmpacket_largest_possible_header { 405 - struct vmpacket_descriptor simple_hdr; 406 - struct vmtransfer_page_packet_header xfer_page_hdr; 407 - struct vmgpadl_packet_header gpadl_hdr; 408 - struct vmadd_remove_transfer_page_set add_rm_xfer_page_hdr; 409 - struct vmestablish_gpadl establish_gpadl_hdr; 410 - struct vmteardown_gpadl teardown_gpadl_hdr; 411 - struct vmdata_gpa_direct data_gpa_direct_hdr; 412 - }; 413 431 414 432 #define VMPACKET_DATA_START_ADDRESS(__packet) \ 415 433 (void *)(((unsigned char *)__packet) + \ ··· 1605 1661 const guid_t *shv_host_servie_id); 1606 1662 int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp); 1607 1663 void vmbus_set_event(struct vmbus_channel *channel); 1664 + int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu); 1608 1665 1609 1666 /* Get the start of the ring buffer. */ 1610 1667 static inline void *
+291
include/uapi/linux/mshv.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 + /* 3 + * Userspace interfaces for /dev/mshv* devices and derived fds 4 + * 5 + * This file is divided into sections containing data structures and IOCTLs for 6 + * a particular set of related devices or derived file descriptors. 7 + * 8 + * The IOCTL definitions are at the end of each section. They are grouped by 9 + * device/fd, so that new IOCTLs can easily be added with a monotonically 10 + * increasing number. 11 + */ 12 + #ifndef _UAPI_LINUX_MSHV_H 13 + #define _UAPI_LINUX_MSHV_H 14 + 15 + #include <linux/types.h> 16 + 17 + #define MSHV_IOCTL 0xB8 18 + 19 + /* 20 + ******************************************* 21 + * Entry point to main VMM APIs: /dev/mshv * 22 + ******************************************* 23 + */ 24 + 25 + enum { 26 + MSHV_PT_BIT_LAPIC, 27 + MSHV_PT_BIT_X2APIC, 28 + MSHV_PT_BIT_GPA_SUPER_PAGES, 29 + MSHV_PT_BIT_COUNT, 30 + }; 31 + 32 + #define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1) 33 + 34 + enum { 35 + MSHV_PT_ISOLATION_NONE, 36 + MSHV_PT_ISOLATION_COUNT, 37 + }; 38 + 39 + /** 40 + * struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION 41 + * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* 42 + * @pt_isolation: MSHV_PT_ISOLATION_* 43 + * 44 + * Returns a file descriptor to act as a handle to a guest partition. 45 + * At this point the partition is not yet initialized in the hypervisor. 46 + * Some operations must be done with the partition in this state, e.g. setting 47 + * so-called "early" partition properties. The partition can then be 48 + * initialized with MSHV_INITIALIZE_PARTITION. 
49 + */ 50 + struct mshv_create_partition { 51 + __u64 pt_flags; 52 + __u64 pt_isolation; 53 + }; 54 + 55 + /* /dev/mshv */ 56 + #define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) 57 + 58 + /* 59 + ************************ 60 + * Child partition APIs * 61 + ************************ 62 + */ 63 + 64 + struct mshv_create_vp { 65 + __u32 vp_index; 66 + }; 67 + 68 + enum { 69 + MSHV_SET_MEM_BIT_WRITABLE, 70 + MSHV_SET_MEM_BIT_EXECUTABLE, 71 + MSHV_SET_MEM_BIT_UNMAP, 72 + MSHV_SET_MEM_BIT_COUNT 73 + }; 74 + 75 + #define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1) 76 + 77 + /* The hypervisor's "native" page size */ 78 + #define MSHV_HV_PAGE_SIZE 0x1000 79 + 80 + /** 81 + * struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY 82 + * @size: Size of the memory region (bytes). Must be aligned to 83 + * MSHV_HV_PAGE_SIZE 84 + * @guest_pfn: Base guest page number to map 85 + * @userspace_addr: Base address of userspace memory. Must be aligned to 86 + * MSHV_HV_PAGE_SIZE 87 + * @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP) 88 + * is set, ignore other bits. 89 + * @rsvd: MBZ 90 + * 91 + * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). 92 + * Mappings can't overlap in GPA space or userspace. 93 + * To unmap, these fields must match an existing mapping. 
94 + */ 95 + struct mshv_user_mem_region { 96 + __u64 size; 97 + __u64 guest_pfn; 98 + __u64 userspace_addr; 99 + __u8 flags; 100 + __u8 rsvd[7]; 101 + }; 102 + 103 + enum { 104 + MSHV_IRQFD_BIT_DEASSIGN, 105 + MSHV_IRQFD_BIT_RESAMPLE, 106 + MSHV_IRQFD_BIT_COUNT, 107 + }; 108 + 109 + #define MSHV_IRQFD_FLAGS_MASK ((1 << MSHV_IRQFD_BIT_COUNT) - 1) 110 + 111 + struct mshv_user_irqfd { 112 + __s32 fd; 113 + __s32 resamplefd; 114 + __u32 gsi; 115 + __u32 flags; 116 + }; 117 + 118 + enum { 119 + MSHV_IOEVENTFD_BIT_DATAMATCH, 120 + MSHV_IOEVENTFD_BIT_PIO, 121 + MSHV_IOEVENTFD_BIT_DEASSIGN, 122 + MSHV_IOEVENTFD_BIT_COUNT, 123 + }; 124 + 125 + #define MSHV_IOEVENTFD_FLAGS_MASK ((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1) 126 + 127 + struct mshv_user_ioeventfd { 128 + __u64 datamatch; 129 + __u64 addr; /* legal pio/mmio address */ 130 + __u32 len; /* 1, 2, 4, or 8 bytes */ 131 + __s32 fd; 132 + __u32 flags; 133 + __u8 rsvd[4]; 134 + }; 135 + 136 + struct mshv_user_irq_entry { 137 + __u32 gsi; 138 + __u32 address_lo; 139 + __u32 address_hi; 140 + __u32 data; 141 + }; 142 + 143 + struct mshv_user_irq_table { 144 + __u32 nr; 145 + __u32 rsvd; /* MBZ */ 146 + struct mshv_user_irq_entry entries[]; 147 + }; 148 + 149 + enum { 150 + MSHV_GPAP_ACCESS_TYPE_ACCESSED, 151 + MSHV_GPAP_ACCESS_TYPE_DIRTY, 152 + MSHV_GPAP_ACCESS_TYPE_COUNT /* Count of enum members */ 153 + }; 154 + 155 + enum { 156 + MSHV_GPAP_ACCESS_OP_NOOP, 157 + MSHV_GPAP_ACCESS_OP_CLEAR, 158 + MSHV_GPAP_ACCESS_OP_SET, 159 + MSHV_GPAP_ACCESS_OP_COUNT /* Count of enum members */ 160 + }; 161 + 162 + /** 163 + * struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP 164 + * @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the 165 + * bitmap 166 + * @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all 167 + * the access states in the range, after retrieving the current 168 + * states. 
169 + * @rsvd: MBZ 170 + * @page_count: Number of pages 171 + * @gpap_base: Base gpa page number 172 + * @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes 173 + * 174 + * Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest 175 + * memory, and optionally clear or set the bits. 176 + */ 177 + struct mshv_gpap_access_bitmap { 178 + __u8 access_type; 179 + __u8 access_op; 180 + __u8 rsvd[6]; 181 + __u64 page_count; 182 + __u64 gpap_base; 183 + __u64 bitmap_ptr; 184 + }; 185 + 186 + /** 187 + * struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL 188 + * @code: Hypercall code (HVCALL_*) 189 + * @reps: in: Rep count ('repcount') 190 + * out: Reps completed ('repcomp'). MBZ unless rep hvcall 191 + * @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE 192 + * @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0 193 + * @status: in: MBZ 194 + * out: HV_STATUS_* from hypercall 195 + * @rsvd: MBZ 196 + * @in_ptr: Input data buffer (struct hv_input_*). If used with partition or 197 + * vp fd, partition id field is populated by kernel. 
198 + * @out_ptr: Output data buffer (optional) 199 + */ 200 + struct mshv_root_hvcall { 201 + __u16 code; 202 + __u16 reps; 203 + __u16 in_sz; 204 + __u16 out_sz; 205 + __u16 status; 206 + __u8 rsvd[6]; 207 + __u64 in_ptr; 208 + __u64 out_ptr; 209 + }; 210 + 211 + /* Partition fds created with MSHV_CREATE_PARTITION */ 212 + #define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) 213 + #define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) 214 + #define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) 215 + #define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) 216 + #define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) 217 + #define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) 218 + #define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap) 219 + /* Generic hypercall */ 220 + #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) 221 + 222 + /* 223 + ******************************** 224 + * VP APIs for child partitions * 225 + ******************************** 226 + */ 227 + 228 + #define MSHV_RUN_VP_BUF_SZ 256 229 + 230 + /* 231 + * VP state pages may be mapped to userspace via mmap(). 232 + * To specify which state page, use MSHV_VP_MMAP_OFFSET_ values multiplied by 233 + * the system page size. 234 + * e.g. 235 + * long page_size = sysconf(_SC_PAGE_SIZE); 236 + * void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE, 237 + * MAP_SHARED, vp_fd, 238 + * MSHV_VP_MMAP_OFFSET_REGISTERS * page_size); 239 + */ 240 + enum { 241 + MSHV_VP_MMAP_OFFSET_REGISTERS, 242 + MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE, 243 + MSHV_VP_MMAP_OFFSET_GHCB, 244 + MSHV_VP_MMAP_OFFSET_COUNT 245 + }; 246 + 247 + /** 248 + * struct mshv_run_vp - argument for MSHV_RUN_VP 249 + * @msg_buf: On success, the intercept message is copied here. It can be 250 + * interpreted using the relevant hypervisor definitions. 
251 + */ 252 + struct mshv_run_vp { 253 + __u8 msg_buf[MSHV_RUN_VP_BUF_SZ]; 254 + }; 255 + 256 + enum { 257 + MSHV_VP_STATE_LAPIC, /* Local interrupt controller state (either arch) */ 258 + MSHV_VP_STATE_XSAVE, /* XSAVE data in compacted form (x86_64) */ 259 + MSHV_VP_STATE_SIMP, 260 + MSHV_VP_STATE_SIEFP, 261 + MSHV_VP_STATE_SYNTHETIC_TIMERS, 262 + MSHV_VP_STATE_COUNT, 263 + }; 264 + 265 + /** 266 + * struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE 267 + * @type: MSHV_VP_STATE_* 268 + * @rsvd: MBZ 269 + * @buf_sz: in: 4k page-aligned size of buffer 270 + * out: Actual size of data (on EINVAL, check this to see if buffer 271 + * was too small) 272 + * @buf_ptr: 4k page-aligned data buffer 273 + */ 274 + struct mshv_get_set_vp_state { 275 + __u8 type; 276 + __u8 rsvd[3]; 277 + __u32 buf_sz; 278 + __u64 buf_ptr; 279 + }; 280 + 281 + /* VP fds created with MSHV_CREATE_VP */ 282 + #define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp) 283 + #define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state) 284 + #define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state) 285 + /* 286 + * Generic hypercall 287 + * Defined above in partition IOCTLs, avoid redefining it here 288 + * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) 289 + */ 290 + 291 + #endif
+1
kernel/cpu.c
··· 526 526 527 527 percpu_rwsem_assert_held(&cpu_hotplug_lock); 528 528 } 529 + EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); 529 530 530 531 #ifdef CONFIG_LOCKDEP 531 532 int lockdep_is_cpus_held(void)