Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs

Provide a set of IOCTLs for creating and managing child partitions when
running as root partition on Hyper-V. The new driver is enabled via
CONFIG_MSHV_ROOT.

A brief overview of the interface:

MSHV_CREATE_PARTITION is the entry point, returning a file descriptor
representing a child partition. IOCTLs on this fd can be used to map
memory, create VPs, etc.

Creating a VP returns another file descriptor representing that VP which
in turn has another set of corresponding IOCTLs for running the VP,
getting/setting state, etc.

MSHV_ROOT_HVCALL is a generic "passthrough" hypercall IOCTL which can be
used for a number of partition or VP hypercalls. This is for hypercalls
that do not affect any state in the kernel driver, such as getting and
setting VP registers and partition properties, translating addresses,
etc. It is "passthrough" because the binary input and output for the
hypercall is only interpreted by the VMM - the kernel driver does
nothing but insert the VP and partition id where necessary (which are
always in the same place), and execute the hypercall.

Co-developed-by: Anirudh Rayabharam <anrayabh@linux.microsoft.com>
Signed-off-by: Anirudh Rayabharam <anrayabh@linux.microsoft.com>
Co-developed-by: Jinank Jain <jinankjain@microsoft.com>
Signed-off-by: Jinank Jain <jinankjain@microsoft.com>
Co-developed-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Co-developed-by: Muminul Islam <muislam@microsoft.com>
Signed-off-by: Muminul Islam <muislam@microsoft.com>
Co-developed-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Co-developed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Roman Kisel <romank@linux.microsoft.com>
Link: https://lore.kernel.org/r/1741980536-3865-11-git-send-email-nunodasneves@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Message-ID: <1741980536-3865-11-git-send-email-nunodasneves@linux.microsoft.com>

authored by

Nuno Das Neves and committed by
Wei Liu
621191d7 0bd921a4

+5732 -1
+2
Documentation/userspace-api/ioctl/ioctl-number.rst
··· 370 370 0xB7 all uapi/linux/remoteproc_cdev.h <mailto:linux-remoteproc@vger.kernel.org> 371 371 0xB7 all uapi/linux/nsfs.h <mailto:Andrei Vagin <avagin@openvz.org>> 372 372 0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver 373 + 0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver 374 + <mailto:linux-hyperv@vger.kernel.org> 373 375 0xC0 00-0F linux/usb/iowarrior.h 374 376 0xCA 00-0F uapi/misc/cxl.h 375 377 0xCA 10-2F uapi/misc/ocxl.h
+1
drivers/hv/Kconfig
··· 64 64 # e.g. When withdrawing memory, the hypervisor gives back 4k pages in 65 65 # no particular order, making it impossible to reassemble larger pages 66 66 depends on PAGE_SIZE_4KB 67 + select EVENTFD 67 68 default n 68 69 help 69 70 Select this option to enable support for booting and running as root
+4 -1
drivers/hv/Makefile
··· 2 2 obj-$(CONFIG_HYPERV) += hv_vmbus.o 3 3 obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o 4 4 obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o 5 + obj-$(CONFIG_MSHV_ROOT) += mshv_root.o 5 6 6 7 CFLAGS_hv_trace.o = -I$(src) 7 8 CFLAGS_hv_balloon.o = -I$(src) ··· 12 11 channel_mgmt.o ring_buffer.o hv_trace.o 13 12 hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o 14 13 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o 14 + mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ 15 + mshv_root_hv_call.o mshv_portid_table.o 15 16 16 17 # Code that must be built-in 17 18 obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o 18 - obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o 19 + obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
+30
drivers/hv/mshv.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2023, Microsoft Corporation.
 *
 * Declarations shared by the mshv driver sources: a helper macro for
 * validating reserved fields and prototypes for the common hypercall
 * wrappers.
 */

#ifndef _MSHV_H_
#define _MSHV_H_

#include <linux/stddef.h>
#include <linux/string.h>
#include <hyperv/hvhdk.h>

/*
 * Scan MEMBER of STRUCT for any byte that is not zero, using memchr_inv().
 * The result is NULL when the whole member is zero and non-NULL otherwise,
 * so it can be used directly as a truth value.
 */
#define mshv_field_nonzero(STRUCT, MEMBER) \
	memchr_inv(&((STRUCT).MEMBER), \
		   0, sizeof_field(typeof(STRUCT), MEMBER))

/*
 * Read @count registers of VP @vp_index in partition @partition_id.
 * The register names are taken from @registers and the values are written
 * back into the same array.
 */
int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
			     union hv_input_vtl input_vtl,
			     struct hv_register_assoc *registers);

/*
 * Write @count name/value pairs from @registers to VP @vp_index in
 * partition @partition_id.
 */
int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
			     union hv_input_vtl input_vtl,
			     struct hv_register_assoc *registers);

/*
 * Fetch partition property @property_code of @partition_id into
 * @property_value.
 */
int hv_call_get_partition_property(u64 partition_id, u64 property_code,
				   u64 *property_value);

/*
 * Process pending thread work described by @th_flags (thread-info flag
 * bits) before entering guest mode.
 */
int mshv_do_pre_guest_mode_work(ulong th_flags);

#endif /* _MSHV_H_ */
+161
drivers/hv/mshv_common.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2024, Microsoft Corporation. 4 + * 5 + * This file contains functions that will be called from one or more modules. 6 + * If any of these modules are configured to build, this file is built and just 7 + * statically linked in. 8 + * 9 + * Authors: Microsoft Linux virtualization team 10 + */ 11 + 12 + #include <linux/kernel.h> 13 + #include <linux/mm.h> 14 + #include <asm/mshyperv.h> 15 + #include <linux/resume_user_mode.h> 16 + 17 + #include "mshv.h" 18 + 19 + #define HV_GET_REGISTER_BATCH_SIZE \ 20 + (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value)) 21 + #define HV_SET_REGISTER_BATCH_SIZE \ 22 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \ 23 + / sizeof(struct hv_register_assoc)) 24 + 25 + int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, 26 + union hv_input_vtl input_vtl, 27 + struct hv_register_assoc *registers) 28 + { 29 + struct hv_input_get_vp_registers *input_page; 30 + union hv_register_value *output_page; 31 + u16 completed = 0; 32 + unsigned long remaining = count; 33 + int rep_count, i; 34 + u64 status = HV_STATUS_SUCCESS; 35 + unsigned long flags; 36 + 37 + local_irq_save(flags); 38 + 39 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 40 + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); 41 + 42 + input_page->partition_id = partition_id; 43 + input_page->vp_index = vp_index; 44 + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; 45 + input_page->rsvd_z8 = 0; 46 + input_page->rsvd_z16 = 0; 47 + 48 + while (remaining) { 49 + rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE); 50 + for (i = 0; i < rep_count; ++i) 51 + input_page->names[i] = registers[i].name; 52 + 53 + status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count, 54 + 0, input_page, output_page); 55 + if (!hv_result_success(status)) 56 + break; 57 + 58 + completed = hv_repcomp(status); 59 + for (i = 0; i < completed; ++i) 60 + registers[i].value = 
output_page[i]; 61 + 62 + registers += completed; 63 + remaining -= completed; 64 + } 65 + local_irq_restore(flags); 66 + 67 + return hv_result_to_errno(status); 68 + } 69 + EXPORT_SYMBOL_GPL(hv_call_get_vp_registers); 70 + 71 + int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, 72 + union hv_input_vtl input_vtl, 73 + struct hv_register_assoc *registers) 74 + { 75 + struct hv_input_set_vp_registers *input_page; 76 + u16 completed = 0; 77 + unsigned long remaining = count; 78 + int rep_count; 79 + u64 status = HV_STATUS_SUCCESS; 80 + unsigned long flags; 81 + 82 + local_irq_save(flags); 83 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 84 + 85 + input_page->partition_id = partition_id; 86 + input_page->vp_index = vp_index; 87 + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; 88 + input_page->rsvd_z8 = 0; 89 + input_page->rsvd_z16 = 0; 90 + 91 + while (remaining) { 92 + rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE); 93 + memcpy(input_page->elements, registers, 94 + sizeof(struct hv_register_assoc) * rep_count); 95 + 96 + status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count, 97 + 0, input_page, NULL); 98 + if (!hv_result_success(status)) 99 + break; 100 + 101 + completed = hv_repcomp(status); 102 + registers += completed; 103 + remaining -= completed; 104 + } 105 + 106 + local_irq_restore(flags); 107 + 108 + return hv_result_to_errno(status); 109 + } 110 + EXPORT_SYMBOL_GPL(hv_call_set_vp_registers); 111 + 112 + int hv_call_get_partition_property(u64 partition_id, 113 + u64 property_code, 114 + u64 *property_value) 115 + { 116 + u64 status; 117 + unsigned long flags; 118 + struct hv_input_get_partition_property *input; 119 + struct hv_output_get_partition_property *output; 120 + 121 + local_irq_save(flags); 122 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 123 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 124 + memset(input, 0, sizeof(*input)); 125 + input->partition_id = partition_id; 126 + 
input->property_code = property_code; 127 + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output); 128 + 129 + if (!hv_result_success(status)) { 130 + local_irq_restore(flags); 131 + return hv_result_to_errno(status); 132 + } 133 + *property_value = output->property_value; 134 + 135 + local_irq_restore(flags); 136 + 137 + return 0; 138 + } 139 + EXPORT_SYMBOL_GPL(hv_call_get_partition_property); 140 + 141 + /* 142 + * Handle any pre-processing before going into the guest mode on this cpu, most 143 + * notably call schedule(). Must be invoked with both preemption and 144 + * interrupts enabled. 145 + * 146 + * Returns: 0 on success, -errno on error. 147 + */ 148 + int mshv_do_pre_guest_mode_work(ulong th_flags) 149 + { 150 + if (th_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) 151 + return -EINTR; 152 + 153 + if (th_flags & _TIF_NEED_RESCHED) 154 + schedule(); 155 + 156 + if (th_flags & _TIF_NOTIFY_RESUME) 157 + resume_user_mode_work(NULL); 158 + 159 + return 0; 160 + } 161 + EXPORT_SYMBOL_GPL(mshv_do_pre_guest_mode_work);
+833
drivers/hv/mshv_eventfd.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * eventfd support for mshv 4 + * 5 + * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic 6 + * framework code is taken from the kvm implementation. 7 + * 8 + * All credits to kvm developers. 9 + */ 10 + 11 + #include <linux/syscalls.h> 12 + #include <linux/wait.h> 13 + #include <linux/poll.h> 14 + #include <linux/file.h> 15 + #include <linux/list.h> 16 + #include <linux/workqueue.h> 17 + #include <linux/eventfd.h> 18 + 19 + #if IS_ENABLED(CONFIG_X86_64) 20 + #include <asm/apic.h> 21 + #endif 22 + #include <asm/mshyperv.h> 23 + 24 + #include "mshv_eventfd.h" 25 + #include "mshv.h" 26 + #include "mshv_root.h" 27 + 28 + static struct workqueue_struct *irqfd_cleanup_wq; 29 + 30 + void mshv_register_irq_ack_notifier(struct mshv_partition *partition, 31 + struct mshv_irq_ack_notifier *mian) 32 + { 33 + mutex_lock(&partition->pt_irq_lock); 34 + hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list); 35 + mutex_unlock(&partition->pt_irq_lock); 36 + } 37 + 38 + void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, 39 + struct mshv_irq_ack_notifier *mian) 40 + { 41 + mutex_lock(&partition->pt_irq_lock); 42 + hlist_del_init_rcu(&mian->link); 43 + mutex_unlock(&partition->pt_irq_lock); 44 + synchronize_rcu(); 45 + } 46 + 47 + bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi) 48 + { 49 + struct mshv_irq_ack_notifier *mian; 50 + bool acked = false; 51 + 52 + rcu_read_lock(); 53 + hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list, 54 + link) { 55 + if (mian->irq_ack_gsi == gsi) { 56 + mian->irq_acked(mian); 57 + acked = true; 58 + } 59 + } 60 + rcu_read_unlock(); 61 + 62 + return acked; 63 + } 64 + 65 + #if IS_ENABLED(CONFIG_ARM64) 66 + static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) 67 + { 68 + return false; 69 + } 70 + #elif IS_ENABLED(CONFIG_X86_64) 71 + static inline bool hv_should_clear_interrupt(enum 
hv_interrupt_type type) 72 + { 73 + return type == HV_X64_INTERRUPT_TYPE_EXTINT; 74 + } 75 + #endif 76 + 77 + static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian) 78 + { 79 + struct mshv_irqfd_resampler *resampler; 80 + struct mshv_partition *partition; 81 + struct mshv_irqfd *irqfd; 82 + int idx; 83 + 84 + resampler = container_of(mian, struct mshv_irqfd_resampler, 85 + rsmplr_notifier); 86 + partition = resampler->rsmplr_partn; 87 + 88 + idx = srcu_read_lock(&partition->pt_irq_srcu); 89 + 90 + hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list, 91 + irqfd_resampler_hnode) { 92 + if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type)) 93 + hv_call_clear_virtual_interrupt(partition->pt_id); 94 + 95 + eventfd_signal(irqfd->irqfd_resamplefd); 96 + } 97 + 98 + srcu_read_unlock(&partition->pt_irq_srcu, idx); 99 + } 100 + 101 + #if IS_ENABLED(CONFIG_X86_64) 102 + static bool 103 + mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv, 104 + u32 vector) 105 + { 106 + int i; 107 + 108 + for (i = 0; i < iv.vector_count; i++) { 109 + if (iv.vector[i] == vector) 110 + return true; 111 + } 112 + 113 + return false; 114 + } 115 + 116 + static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector) 117 + { 118 + union hv_vp_register_page_interrupt_vectors iv, new_iv; 119 + 120 + iv = vp->vp_register_page->interrupt_vectors; 121 + new_iv = iv; 122 + 123 + if (mshv_vp_irq_vector_injected(iv, vector)) 124 + return 0; 125 + 126 + if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT) 127 + return -ENOSPC; 128 + 129 + new_iv.vector[new_iv.vector_count++] = vector; 130 + 131 + if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, 132 + iv.as_uint64, new_iv.as_uint64) != iv.as_uint64) 133 + return -EAGAIN; 134 + 135 + return 0; 136 + } 137 + 138 + static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector) 139 + { 140 + int ret; 141 + 142 + do { 143 + ret = 
mshv_vp_irq_try_set_vector(vp, vector); 144 + } while (ret == -EAGAIN && !need_resched()); 145 + 146 + return ret; 147 + } 148 + 149 + /* 150 + * Try to raise irq for guest via shared vector array. hyp does the actual 151 + * inject of the interrupt. 152 + */ 153 + static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) 154 + { 155 + struct mshv_partition *partition = irqfd->irqfd_partn; 156 + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; 157 + struct mshv_vp *vp; 158 + 159 + if (!(ms_hyperv.ext_features & 160 + HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE)) 161 + return -EOPNOTSUPP; 162 + 163 + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 164 + return -EOPNOTSUPP; 165 + 166 + if (irq->lapic_control.logical_dest_mode) 167 + return -EOPNOTSUPP; 168 + 169 + vp = partition->pt_vp_array[irq->lapic_apic_id]; 170 + 171 + if (!vp->vp_register_page) 172 + return -EOPNOTSUPP; 173 + 174 + if (mshv_vp_irq_set_vector(vp, irq->lapic_vector)) 175 + return -EINVAL; 176 + 177 + if (vp->run.flags.root_sched_dispatched && 178 + vp->vp_register_page->interrupt_vectors.as_uint64) 179 + return -EBUSY; 180 + 181 + wake_up(&vp->run.vp_suspend_queue); 182 + 183 + return 0; 184 + } 185 + #else /* CONFIG_X86_64 */ 186 + static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) 187 + { 188 + return -EOPNOTSUPP; 189 + } 190 + #endif 191 + 192 + static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) 193 + { 194 + struct mshv_partition *partition = irqfd->irqfd_partn; 195 + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; 196 + unsigned int seq; 197 + int idx; 198 + 199 + WARN_ON(irqfd->irqfd_resampler && 200 + !irq->lapic_control.level_triggered); 201 + 202 + idx = srcu_read_lock(&partition->pt_irq_srcu); 203 + if (irqfd->irqfd_girq_ent.guest_irq_num) { 204 + if (!irqfd->irqfd_girq_ent.girq_entry_valid) { 205 + srcu_read_unlock(&partition->pt_irq_srcu, idx); 206 + return; 207 + } 208 + 209 + do { 210 + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); 211 + } 
while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); 212 + } 213 + 214 + hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id, 215 + irq->lapic_vector, irq->lapic_apic_id, 216 + irq->lapic_control); 217 + srcu_read_unlock(&partition->pt_irq_srcu, idx); 218 + } 219 + 220 + static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd) 221 + { 222 + struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler; 223 + struct mshv_partition *pt = rp->rsmplr_partn; 224 + 225 + mutex_lock(&pt->irqfds_resampler_lock); 226 + 227 + hlist_del_rcu(&irqfd->irqfd_resampler_hnode); 228 + synchronize_srcu(&pt->pt_irq_srcu); 229 + 230 + if (hlist_empty(&rp->rsmplr_irqfd_list)) { 231 + hlist_del(&rp->rsmplr_hnode); 232 + mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier); 233 + kfree(rp); 234 + } 235 + 236 + mutex_unlock(&pt->irqfds_resampler_lock); 237 + } 238 + 239 + /* 240 + * Race-free decouple logic (ordering is critical) 241 + */ 242 + static void mshv_irqfd_shutdown(struct work_struct *work) 243 + { 244 + struct mshv_irqfd *irqfd = 245 + container_of(work, struct mshv_irqfd, irqfd_shutdown); 246 + 247 + /* 248 + * Synchronize with the wait-queue and unhook ourselves to prevent 249 + * further events. 
250 + */ 251 + remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait); 252 + 253 + if (irqfd->irqfd_resampler) { 254 + mshv_irqfd_resampler_shutdown(irqfd); 255 + eventfd_ctx_put(irqfd->irqfd_resamplefd); 256 + } 257 + 258 + /* 259 + * It is now safe to release the object's resources 260 + */ 261 + eventfd_ctx_put(irqfd->irqfd_eventfd_ctx); 262 + kfree(irqfd); 263 + } 264 + 265 + /* assumes partition->pt_irqfds_lock is held */ 266 + static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd) 267 + { 268 + return !hlist_unhashed(&irqfd->irqfd_hnode); 269 + } 270 + 271 + /* 272 + * Mark the irqfd as inactive and schedule it for removal 273 + * 274 + * assumes partition->pt_irqfds_lock is held 275 + */ 276 + static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd) 277 + { 278 + if (!mshv_irqfd_is_active(irqfd)) 279 + return; 280 + 281 + hlist_del(&irqfd->irqfd_hnode); 282 + 283 + queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown); 284 + } 285 + 286 + /* 287 + * Called with wqh->lock held and interrupts disabled 288 + */ 289 + static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, 290 + int sync, void *key) 291 + { 292 + struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd, 293 + irqfd_wait); 294 + unsigned long flags = (unsigned long)key; 295 + int idx; 296 + unsigned int seq; 297 + struct mshv_partition *pt = irqfd->irqfd_partn; 298 + int ret = 0; 299 + 300 + if (flags & POLLIN) { 301 + u64 cnt; 302 + 303 + eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt); 304 + idx = srcu_read_lock(&pt->pt_irq_srcu); 305 + do { 306 + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); 307 + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); 308 + 309 + /* An event has been signaled, raise an interrupt */ 310 + ret = mshv_try_assert_irq_fast(irqfd); 311 + if (ret) 312 + mshv_assert_irq_slow(irqfd); 313 + 314 + srcu_read_unlock(&pt->pt_irq_srcu, idx); 315 + 316 + ret = 1; 317 + } 318 + 319 + if (flags & POLLHUP) { 320 + /* The eventfd is 
closing, detach from the partition */ 321 + unsigned long flags; 322 + 323 + spin_lock_irqsave(&pt->pt_irqfds_lock, flags); 324 + 325 + /* 326 + * We must check if someone deactivated the irqfd before 327 + * we could acquire the pt_irqfds_lock since the item is 328 + * deactivated from the mshv side before it is unhooked from 329 + * the wait-queue. If it is already deactivated, we can 330 + * simply return knowing the other side will cleanup for us. 331 + * We cannot race against the irqfd going away since the 332 + * other side is required to acquire wqh->lock, which we hold 333 + */ 334 + if (mshv_irqfd_is_active(irqfd)) 335 + mshv_irqfd_deactivate(irqfd); 336 + 337 + spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags); 338 + } 339 + 340 + return ret; 341 + } 342 + 343 + /* Must be called under pt_irqfds_lock */ 344 + static void mshv_irqfd_update(struct mshv_partition *pt, 345 + struct mshv_irqfd *irqfd) 346 + { 347 + write_seqcount_begin(&irqfd->irqfd_irqe_sc); 348 + irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, 349 + irqfd->irqfd_irqnum); 350 + mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq); 351 + write_seqcount_end(&irqfd->irqfd_irqe_sc); 352 + } 353 + 354 + void mshv_irqfd_routing_update(struct mshv_partition *pt) 355 + { 356 + struct mshv_irqfd *irqfd; 357 + 358 + spin_lock_irq(&pt->pt_irqfds_lock); 359 + hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode) 360 + mshv_irqfd_update(pt, irqfd); 361 + spin_unlock_irq(&pt->pt_irqfds_lock); 362 + } 363 + 364 + static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, 365 + poll_table *polltbl) 366 + { 367 + struct mshv_irqfd *irqfd = 368 + container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); 369 + 370 + irqfd->irqfd_wqh = wqh; 371 + add_wait_queue_priority(wqh, &irqfd->irqfd_wait); 372 + } 373 + 374 + static int mshv_irqfd_assign(struct mshv_partition *pt, 375 + struct mshv_user_irqfd *args) 376 + { 377 + struct eventfd_ctx *eventfd = NULL, *resamplefd = 
NULL; 378 + struct mshv_irqfd *irqfd, *tmp; 379 + unsigned int events; 380 + struct fd f; 381 + int ret; 382 + int idx; 383 + 384 + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); 385 + if (!irqfd) 386 + return -ENOMEM; 387 + 388 + irqfd->irqfd_partn = pt; 389 + irqfd->irqfd_irqnum = args->gsi; 390 + INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown); 391 + seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock); 392 + 393 + f = fdget(args->fd); 394 + if (!fd_file(f)) { 395 + ret = -EBADF; 396 + goto out; 397 + } 398 + 399 + eventfd = eventfd_ctx_fileget(fd_file(f)); 400 + if (IS_ERR(eventfd)) { 401 + ret = PTR_ERR(eventfd); 402 + goto fail; 403 + } 404 + 405 + irqfd->irqfd_eventfd_ctx = eventfd; 406 + 407 + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) { 408 + struct mshv_irqfd_resampler *rp; 409 + 410 + resamplefd = eventfd_ctx_fdget(args->resamplefd); 411 + if (IS_ERR(resamplefd)) { 412 + ret = PTR_ERR(resamplefd); 413 + goto fail; 414 + } 415 + 416 + irqfd->irqfd_resamplefd = resamplefd; 417 + 418 + mutex_lock(&pt->irqfds_resampler_lock); 419 + 420 + hlist_for_each_entry(rp, &pt->irqfds_resampler_list, 421 + rsmplr_hnode) { 422 + if (rp->rsmplr_notifier.irq_ack_gsi == 423 + irqfd->irqfd_irqnum) { 424 + irqfd->irqfd_resampler = rp; 425 + break; 426 + } 427 + } 428 + 429 + if (!irqfd->irqfd_resampler) { 430 + rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT); 431 + if (!rp) { 432 + ret = -ENOMEM; 433 + mutex_unlock(&pt->irqfds_resampler_lock); 434 + goto fail; 435 + } 436 + 437 + rp->rsmplr_partn = pt; 438 + INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list); 439 + rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum; 440 + rp->rsmplr_notifier.irq_acked = 441 + mshv_irqfd_resampler_ack; 442 + 443 + hlist_add_head(&rp->rsmplr_hnode, 444 + &pt->irqfds_resampler_list); 445 + mshv_register_irq_ack_notifier(pt, 446 + &rp->rsmplr_notifier); 447 + irqfd->irqfd_resampler = rp; 448 + } 449 + 450 + hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode, 451 + 
&irqfd->irqfd_resampler->rsmplr_irqfd_list); 452 + 453 + mutex_unlock(&pt->irqfds_resampler_lock); 454 + } 455 + 456 + /* 457 + * Install our own custom wake-up handling so we are notified via 458 + * a callback whenever someone signals the underlying eventfd 459 + */ 460 + init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup); 461 + init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); 462 + 463 + spin_lock_irq(&pt->pt_irqfds_lock); 464 + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && 465 + !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { 466 + /* 467 + * Resample Fd must be for level triggered interrupt 468 + * Otherwise return with failure 469 + */ 470 + spin_unlock_irq(&pt->pt_irqfds_lock); 471 + ret = -EINVAL; 472 + goto fail; 473 + } 474 + ret = 0; 475 + hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { 476 + if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) 477 + continue; 478 + /* This fd is used for another irq already. */ 479 + ret = -EBUSY; 480 + spin_unlock_irq(&pt->pt_irqfds_lock); 481 + goto fail; 482 + } 483 + 484 + idx = srcu_read_lock(&pt->pt_irq_srcu); 485 + mshv_irqfd_update(pt, irqfd); 486 + hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list); 487 + spin_unlock_irq(&pt->pt_irqfds_lock); 488 + 489 + /* 490 + * Check if there was an event already pending on the eventfd 491 + * before we registered, and trigger it as if we didn't miss it. 
492 + */ 493 + events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl); 494 + 495 + if (events & POLLIN) 496 + mshv_assert_irq_slow(irqfd); 497 + 498 + srcu_read_unlock(&pt->pt_irq_srcu, idx); 499 + /* 500 + * do not drop the file until the irqfd is fully initialized, otherwise 501 + * we might race against the POLLHUP 502 + */ 503 + fdput(f); 504 + 505 + return 0; 506 + 507 + fail: 508 + if (irqfd->irqfd_resampler) 509 + mshv_irqfd_resampler_shutdown(irqfd); 510 + 511 + if (resamplefd && !IS_ERR(resamplefd)) 512 + eventfd_ctx_put(resamplefd); 513 + 514 + if (eventfd && !IS_ERR(eventfd)) 515 + eventfd_ctx_put(eventfd); 516 + 517 + fdput(f); 518 + 519 + out: 520 + kfree(irqfd); 521 + return ret; 522 + } 523 + 524 + /* 525 + * shutdown any irqfd's that match fd+gsi 526 + */ 527 + static int mshv_irqfd_deassign(struct mshv_partition *pt, 528 + struct mshv_user_irqfd *args) 529 + { 530 + struct mshv_irqfd *irqfd; 531 + struct hlist_node *n; 532 + struct eventfd_ctx *eventfd; 533 + 534 + eventfd = eventfd_ctx_fdget(args->fd); 535 + if (IS_ERR(eventfd)) 536 + return PTR_ERR(eventfd); 537 + 538 + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, 539 + irqfd_hnode) { 540 + if (irqfd->irqfd_eventfd_ctx == eventfd && 541 + irqfd->irqfd_irqnum == args->gsi) 542 + 543 + mshv_irqfd_deactivate(irqfd); 544 + } 545 + 546 + eventfd_ctx_put(eventfd); 547 + 548 + /* 549 + * Block until we know all outstanding shutdown jobs have completed 550 + * so that we guarantee there will not be any more interrupts on this 551 + * gsi once this deassign function returns. 
552 + */ 553 + flush_workqueue(irqfd_cleanup_wq); 554 + 555 + return 0; 556 + } 557 + 558 + int mshv_set_unset_irqfd(struct mshv_partition *pt, 559 + struct mshv_user_irqfd *args) 560 + { 561 + if (args->flags & ~MSHV_IRQFD_FLAGS_MASK) 562 + return -EINVAL; 563 + 564 + if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN)) 565 + return mshv_irqfd_deassign(pt, args); 566 + 567 + return mshv_irqfd_assign(pt, args); 568 + } 569 + 570 + /* 571 + * This function is called as the mshv VM fd is being released. 572 + * Shutdown all irqfds that still remain open 573 + */ 574 + static void mshv_irqfd_release(struct mshv_partition *pt) 575 + { 576 + struct mshv_irqfd *irqfd; 577 + struct hlist_node *n; 578 + 579 + spin_lock_irq(&pt->pt_irqfds_lock); 580 + 581 + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode) 582 + mshv_irqfd_deactivate(irqfd); 583 + 584 + spin_unlock_irq(&pt->pt_irqfds_lock); 585 + 586 + /* 587 + * Block until we know all outstanding shutdown jobs have completed 588 + * since we do not take a mshv_partition* reference. 589 + */ 590 + flush_workqueue(irqfd_cleanup_wq); 591 + } 592 + 593 + int mshv_irqfd_wq_init(void) 594 + { 595 + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0); 596 + if (!irqfd_cleanup_wq) 597 + return -ENOMEM; 598 + 599 + return 0; 600 + } 601 + 602 + void mshv_irqfd_wq_cleanup(void) 603 + { 604 + destroy_workqueue(irqfd_cleanup_wq); 605 + } 606 + 607 + /* 608 + * -------------------------------------------------------------------- 609 + * ioeventfd: translate a MMIO memory write to an eventfd signal. 610 + * 611 + * userspace can register a MMIO address with an eventfd for receiving 612 + * notification when the memory has been touched. 
613 + * -------------------------------------------------------------------- 614 + */ 615 + 616 + static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id) 617 + { 618 + if (p->iovntfd_doorbell_id > 0) 619 + mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id); 620 + eventfd_ctx_put(p->iovntfd_eventfd); 621 + kfree(p); 622 + } 623 + 624 + /* MMIO writes trigger an event if the addr/val match */ 625 + static void ioeventfd_mmio_write(int doorbell_id, void *data) 626 + { 627 + struct mshv_partition *partition = (struct mshv_partition *)data; 628 + struct mshv_ioeventfd *p; 629 + 630 + rcu_read_lock(); 631 + hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode) 632 + if (p->iovntfd_doorbell_id == doorbell_id) { 633 + eventfd_signal(p->iovntfd_eventfd); 634 + break; 635 + } 636 + 637 + rcu_read_unlock(); 638 + } 639 + 640 + static bool ioeventfd_check_collision(struct mshv_partition *pt, 641 + struct mshv_ioeventfd *p) 642 + __must_hold(&pt->mutex) 643 + { 644 + struct mshv_ioeventfd *_p; 645 + 646 + hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode) 647 + if (_p->iovntfd_addr == p->iovntfd_addr && 648 + _p->iovntfd_length == p->iovntfd_length && 649 + (_p->iovntfd_wildcard || p->iovntfd_wildcard || 650 + _p->iovntfd_datamatch == p->iovntfd_datamatch)) 651 + return true; 652 + 653 + return false; 654 + } 655 + 656 + static int mshv_assign_ioeventfd(struct mshv_partition *pt, 657 + struct mshv_user_ioeventfd *args) 658 + __must_hold(&pt->mutex) 659 + { 660 + struct mshv_ioeventfd *p; 661 + struct eventfd_ctx *eventfd; 662 + u64 doorbell_flags = 0; 663 + int ret; 664 + 665 + /* This mutex is currently protecting ioeventfd.items list */ 666 + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); 667 + 668 + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) 669 + return -EOPNOTSUPP; 670 + 671 + /* must be natural-word sized */ 672 + switch (args->len) { 673 + case 0: 674 + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY; 675 
+ break; 676 + case 1: 677 + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE; 678 + break; 679 + case 2: 680 + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD; 681 + break; 682 + case 4: 683 + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD; 684 + break; 685 + case 8: 686 + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD; 687 + break; 688 + default: 689 + return -EINVAL; 690 + } 691 + 692 + /* check for range overflow */ 693 + if (args->addr + args->len < args->addr) 694 + return -EINVAL; 695 + 696 + /* check for extra flags that we don't understand */ 697 + if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) 698 + return -EINVAL; 699 + 700 + eventfd = eventfd_ctx_fdget(args->fd); 701 + if (IS_ERR(eventfd)) 702 + return PTR_ERR(eventfd); 703 + 704 + p = kzalloc(sizeof(*p), GFP_KERNEL); 705 + if (!p) { 706 + ret = -ENOMEM; 707 + goto fail; 708 + } 709 + 710 + p->iovntfd_addr = args->addr; 711 + p->iovntfd_length = args->len; 712 + p->iovntfd_eventfd = eventfd; 713 + 714 + /* The datamatch feature is optional, otherwise this is a wildcard */ 715 + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) { 716 + p->iovntfd_datamatch = args->datamatch; 717 + } else { 718 + p->iovntfd_wildcard = true; 719 + doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE; 720 + } 721 + 722 + if (ioeventfd_check_collision(pt, p)) { 723 + ret = -EEXIST; 724 + goto unlock_fail; 725 + } 726 + 727 + ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write, 728 + (void *)pt, p->iovntfd_addr, 729 + p->iovntfd_datamatch, doorbell_flags); 730 + if (ret < 0) 731 + goto unlock_fail; 732 + 733 + p->iovntfd_doorbell_id = ret; 734 + 735 + hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list); 736 + 737 + return 0; 738 + 739 + unlock_fail: 740 + kfree(p); 741 + 742 + fail: 743 + eventfd_ctx_put(eventfd); 744 + 745 + return ret; 746 + } 747 + 748 + static int mshv_deassign_ioeventfd(struct mshv_partition *pt, 749 + struct mshv_user_ioeventfd *args) 750 + __must_hold(&pt->mutex) 
751 + { 752 + struct mshv_ioeventfd *p; 753 + struct eventfd_ctx *eventfd; 754 + struct hlist_node *n; 755 + int ret = -ENOENT; 756 + 757 + /* This mutex is currently protecting ioeventfd.items list */ 758 + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); 759 + 760 + eventfd = eventfd_ctx_fdget(args->fd); 761 + if (IS_ERR(eventfd)) 762 + return PTR_ERR(eventfd); 763 + 764 + hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) { 765 + bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)); 766 + 767 + if (p->iovntfd_eventfd != eventfd || 768 + p->iovntfd_addr != args->addr || 769 + p->iovntfd_length != args->len || 770 + p->iovntfd_wildcard != wildcard) 771 + continue; 772 + 773 + if (!p->iovntfd_wildcard && 774 + p->iovntfd_datamatch != args->datamatch) 775 + continue; 776 + 777 + hlist_del_rcu(&p->iovntfd_hnode); 778 + synchronize_rcu(); 779 + ioeventfd_release(p, pt->pt_id); 780 + ret = 0; 781 + break; 782 + } 783 + 784 + eventfd_ctx_put(eventfd); 785 + 786 + return ret; 787 + } 788 + 789 + int mshv_set_unset_ioeventfd(struct mshv_partition *pt, 790 + struct mshv_user_ioeventfd *args) 791 + __must_hold(&pt->mutex) 792 + { 793 + if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) || 794 + mshv_field_nonzero(*args, rsvd)) 795 + return -EINVAL; 796 + 797 + /* PIO not yet implemented */ 798 + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) 799 + return -EOPNOTSUPP; 800 + 801 + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN)) 802 + return mshv_deassign_ioeventfd(pt, args); 803 + 804 + return mshv_assign_ioeventfd(pt, args); 805 + } 806 + 807 + void mshv_eventfd_init(struct mshv_partition *pt) 808 + { 809 + spin_lock_init(&pt->pt_irqfds_lock); 810 + INIT_HLIST_HEAD(&pt->pt_irqfds_list); 811 + 812 + INIT_HLIST_HEAD(&pt->irqfds_resampler_list); 813 + mutex_init(&pt->irqfds_resampler_lock); 814 + 815 + INIT_HLIST_HEAD(&pt->ioeventfds_list); 816 + } 817 + 818 + void mshv_eventfd_release(struct mshv_partition *pt) 819 + { 820 + struct hlist_head 
items; 821 + struct hlist_node *n; 822 + struct mshv_ioeventfd *p; 823 + 824 + hlist_move_list(&pt->ioeventfds_list, &items); 825 + synchronize_rcu(); 826 + 827 + hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) { 828 + hlist_del(&p->iovntfd_hnode); 829 + ioeventfd_release(p, pt->pt_id); 830 + } 831 + 832 + mshv_irqfd_release(pt); 833 + }
+71
drivers/hv/mshv_eventfd.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * irqfd: Allows an fd to be used to inject an interrupt to the guest.
 * ioeventfd: Allow an fd to be used to receive a signal from the guest.
 * All credit goes to kvm developers.
 */

#ifndef __LINUX_MSHV_EVENTFD_H
#define __LINUX_MSHV_EVENTFD_H

#include <linux/poll.h>

#include "mshv.h"
#include "mshv_root.h"

/*
 * struct to contain list of irqfds sharing an irq. Updates are protected by
 * the irqfds_resampler_lock mutex of the owning struct mshv_partition.
 */
struct mshv_irqfd_resampler {
	struct mshv_partition *rsmplr_partn;
	struct hlist_head rsmplr_irqfd_list;
	struct mshv_irq_ack_notifier rsmplr_notifier;
	struct hlist_node rsmplr_hnode;
};

/*
 * Per-irqfd state.  The irqfd implementation is not part of this header;
 * NOTE(review): field roles below are inferred from names and types only.
 */
struct mshv_irqfd {
	struct mshv_partition *irqfd_partn;
	struct eventfd_ctx *irqfd_eventfd_ctx;
	struct mshv_guest_irq_ent irqfd_girq_ent;
	seqcount_spinlock_t irqfd_irqe_sc;
	u32 irqfd_irqnum;
	struct mshv_lapic_irq irqfd_lapic_irq;
	struct hlist_node irqfd_hnode;
	poll_table irqfd_polltbl;
	wait_queue_head_t *irqfd_wqh;
	wait_queue_entry_t irqfd_wait;
	struct work_struct irqfd_shutdown;
	struct mshv_irqfd_resampler *irqfd_resampler;
	struct eventfd_ctx *irqfd_resamplefd;
	struct hlist_node irqfd_resampler_hnode;
};

/* Set up / tear down the irqfd and ioeventfd lists of a partition */
void mshv_eventfd_init(struct mshv_partition *partition);
void mshv_eventfd_release(struct mshv_partition *partition);

void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
				    struct mshv_irq_ack_notifier *mian);
void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
				      struct mshv_irq_ack_notifier *mian);
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi);

int mshv_set_unset_irqfd(struct mshv_partition *partition,
			 struct mshv_user_irqfd *args);

int mshv_irqfd_wq_init(void);
void mshv_irqfd_wq_cleanup(void);

/*
 * One registered ioeventfd; linked on mshv_partition::ioeventfds_list.
 */
struct mshv_ioeventfd {
	struct hlist_node iovntfd_hnode;	/* entry in the partition list */
	u64 iovntfd_addr;			/* address from the user request */
	int iovntfd_length;			/* access length from the request (0, 1, 2, 4 or 8) */
	struct eventfd_ctx *iovntfd_eventfd;	/* eventfd signalled by the doorbell callback */
	u64 iovntfd_datamatch;			/* value to match; only meaningful if !iovntfd_wildcard */
	int iovntfd_doorbell_id;		/* id returned by mshv_register_doorbell() */
	bool iovntfd_wildcard;			/* true when the request carried no DATAMATCH flag */
};

/*
 * Assign or deassign an ioeventfd, depending on the DEASSIGN bit in
 * args->flags.
 */
int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
			     struct mshv_user_ioeventfd *args);

#endif /* __LINUX_MSHV_EVENTFD_H */
+124
drivers/hv/mshv_irq.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2023, Microsoft Corporation. 4 + * 5 + * Authors: Microsoft Linux virtualization team 6 + */ 7 + 8 + #include <linux/kernel.h> 9 + #include <linux/module.h> 10 + #include <linux/slab.h> 11 + #include <asm/mshyperv.h> 12 + 13 + #include "mshv_eventfd.h" 14 + #include "mshv.h" 15 + #include "mshv_root.h" 16 + 17 + /* called from the ioctl code, user wants to update the guest irq table */ 18 + int mshv_update_routing_table(struct mshv_partition *partition, 19 + const struct mshv_user_irq_entry *ue, 20 + unsigned int numents) 21 + { 22 + struct mshv_girq_routing_table *new = NULL, *old; 23 + u32 i, nr_rt_entries = 0; 24 + int r = 0; 25 + 26 + if (numents == 0) 27 + goto swap_routes; 28 + 29 + for (i = 0; i < numents; i++) { 30 + if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS) 31 + return -EINVAL; 32 + 33 + if (ue[i].address_hi) 34 + return -EINVAL; 35 + 36 + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); 37 + } 38 + nr_rt_entries += 1; 39 + 40 + new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries), 41 + GFP_KERNEL_ACCOUNT); 42 + if (!new) 43 + return -ENOMEM; 44 + 45 + new->num_rt_entries = nr_rt_entries; 46 + for (i = 0; i < numents; i++) { 47 + struct mshv_guest_irq_ent *girq; 48 + 49 + girq = &new->mshv_girq_info_tbl[ue[i].gsi]; 50 + 51 + /* 52 + * Allow only one to one mapping between GSI and MSI routing. 
53 + */ 54 + if (girq->guest_irq_num != 0) { 55 + r = -EINVAL; 56 + goto out; 57 + } 58 + 59 + girq->guest_irq_num = ue[i].gsi; 60 + girq->girq_addr_lo = ue[i].address_lo; 61 + girq->girq_addr_hi = ue[i].address_hi; 62 + girq->girq_irq_data = ue[i].data; 63 + girq->girq_entry_valid = true; 64 + } 65 + 66 + swap_routes: 67 + mutex_lock(&partition->pt_irq_lock); 68 + old = rcu_dereference_protected(partition->pt_girq_tbl, 1); 69 + rcu_assign_pointer(partition->pt_girq_tbl, new); 70 + mshv_irqfd_routing_update(partition); 71 + mutex_unlock(&partition->pt_irq_lock); 72 + 73 + synchronize_srcu_expedited(&partition->pt_irq_srcu); 74 + new = old; 75 + 76 + out: 77 + kfree(new); 78 + 79 + return r; 80 + } 81 + 82 + /* vm is going away, kfree the irq routing table */ 83 + void mshv_free_routing_table(struct mshv_partition *partition) 84 + { 85 + struct mshv_girq_routing_table *rt = 86 + rcu_access_pointer(partition->pt_girq_tbl); 87 + 88 + kfree(rt); 89 + } 90 + 91 + struct mshv_guest_irq_ent 92 + mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum) 93 + { 94 + struct mshv_guest_irq_ent entry = { 0 }; 95 + struct mshv_girq_routing_table *girq_tbl; 96 + 97 + girq_tbl = srcu_dereference_check(partition->pt_girq_tbl, 98 + &partition->pt_irq_srcu, 99 + lockdep_is_held(&partition->pt_irq_lock)); 100 + if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) { 101 + /* 102 + * Premature register_irqfd, setting valid_entry = 0 103 + * would ignore this entry anyway 104 + */ 105 + entry.guest_irq_num = irqnum; 106 + return entry; 107 + } 108 + 109 + return girq_tbl->mshv_girq_info_tbl[irqnum]; 110 + } 111 + 112 + void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, 113 + struct mshv_lapic_irq *lirq) 114 + { 115 + memset(lirq, 0, sizeof(*lirq)); 116 + if (!ent || !ent->girq_entry_valid) 117 + return; 118 + 119 + lirq->lapic_vector = ent->girq_irq_data & 0xFF; 120 + lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; 121 + lirq->lapic_control.interrupt_type = 
(ent->girq_irq_data & 0x700) >> 8; 122 + lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; 123 + lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; 124 + }
+83
drivers/hv/mshv_portid_table.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/types.h> 3 + #include <linux/mm.h> 4 + #include <linux/slab.h> 5 + #include <linux/idr.h> 6 + #include <asm/mshyperv.h> 7 + 8 + #include "mshv.h" 9 + #include "mshv_root.h" 10 + 11 + /* 12 + * Ports and connections are hypervisor struct used for inter-partition 13 + * communication. Port represents the source and connection represents 14 + * the destination. Partitions are responsible for managing the port and 15 + * connection ids. 16 + * 17 + */ 18 + 19 + #define PORTID_MIN 1 20 + #define PORTID_MAX INT_MAX 21 + 22 + static DEFINE_IDR(port_table_idr); 23 + 24 + void 25 + mshv_port_table_fini(void) 26 + { 27 + struct port_table_info *port_info; 28 + unsigned long i, tmp; 29 + 30 + idr_lock(&port_table_idr); 31 + if (!idr_is_empty(&port_table_idr)) { 32 + idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) { 33 + port_info = idr_remove(&port_table_idr, i); 34 + kfree_rcu(port_info, portbl_rcu); 35 + } 36 + } 37 + idr_unlock(&port_table_idr); 38 + } 39 + 40 + int 41 + mshv_portid_alloc(struct port_table_info *info) 42 + { 43 + int ret = 0; 44 + 45 + idr_lock(&port_table_idr); 46 + ret = idr_alloc(&port_table_idr, info, PORTID_MIN, 47 + PORTID_MAX, GFP_KERNEL); 48 + idr_unlock(&port_table_idr); 49 + 50 + return ret; 51 + } 52 + 53 + void 54 + mshv_portid_free(int port_id) 55 + { 56 + struct port_table_info *info; 57 + 58 + idr_lock(&port_table_idr); 59 + info = idr_remove(&port_table_idr, port_id); 60 + WARN_ON(!info); 61 + idr_unlock(&port_table_idr); 62 + 63 + synchronize_rcu(); 64 + kfree(info); 65 + } 66 + 67 + int 68 + mshv_portid_lookup(int port_id, struct port_table_info *info) 69 + { 70 + struct port_table_info *_info; 71 + int ret = -ENOENT; 72 + 73 + rcu_read_lock(); 74 + _info = idr_find(&port_table_idr, port_id); 75 + rcu_read_unlock(); 76 + 77 + if (_info) { 78 + *info = *_info; 79 + ret = 0; 80 + } 81 + 82 + return ret; 83 + }
+311
drivers/hv/mshv_root.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2023, Microsoft Corporation.
 */

#ifndef _MSHV_ROOT_H_
#define _MSHV_ROOT_H_

#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/semaphore.h>
#include <linux/sched.h>
#include <linux/srcu.h>
#include <linux/wait.h>
#include <linux/hashtable.h>
#include <linux/dev_printk.h>
#include <linux/build_bug.h>
#include <uapi/linux/mshv.h>

/*
 * Hypervisor must be between these version numbers (inclusive)
 * to guarantee compatibility
 */
#define MSHV_HV_MIN_VERSION		(27744)
#define MSHV_HV_MAX_VERSION		(27751)

/* the uapi page size constant must agree with the hypervisor page size */
static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE);

/* size of mshv_partition::pt_vp_array */
#define MSHV_MAX_VPS	256

/* log2 of the number of buckets in mshv_root::pt_htable */
#define MSHV_PARTITIONS_HASH_BITS	9

/* number of hypervisor pages in 0x10000000 bytes (256 MiB) */
#define MSHV_PIN_PAGES_BATCH_SIZE	(0x10000000ULL / HV_HYP_PAGE_SIZE)

/* Per virtual processor state */
struct mshv_vp {
	u32 vp_index;
	struct mshv_partition *vp_partition;
	struct mutex vp_mutex;
	struct hv_vp_register_page *vp_register_page;
	struct hv_message *vp_intercept_msg_page;
	void *vp_ghcb_page;
	struct hv_stats_page *vp_stats_pages[2];
	struct {
		atomic64_t vp_signaled_count;
		struct {
			u64 intercept_suspend: 1;
			u64 root_sched_blocked: 1; /* root scheduler only */
			u64 root_sched_dispatched: 1; /* root scheduler only */
			u64 reserved: 61;
		} flags;
		unsigned int kicked_by_hv;
		wait_queue_head_t vp_suspend_queue;
	} run;
};

/*
 * vp_*() logging helpers: dev_*() on the partition's module device, with
 * every message prefixed by "p<partition id>vp<vp index>: ".
 */
#define vp_fmt(fmt) "p%lluvp%u: " fmt
#define vp_devprintk(level, v, fmt, ...) \
do { \
	const struct mshv_vp *__vp = (v); \
	const struct mshv_partition *__pt = __vp->vp_partition; \
	dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \
		    __vp->vp_index, ##__VA_ARGS__); \
} while (0)
#define vp_emerg(v, fmt, ...)	vp_devprintk(emerg, v, fmt, ##__VA_ARGS__)
#define vp_crit(v, fmt, ...)	vp_devprintk(crit, v, fmt, ##__VA_ARGS__)
#define vp_alert(v, fmt, ...)	vp_devprintk(alert, v, fmt, ##__VA_ARGS__)
#define vp_err(v, fmt, ...)	vp_devprintk(err, v, fmt, ##__VA_ARGS__)
#define vp_warn(v, fmt, ...)	vp_devprintk(warn, v, fmt, ##__VA_ARGS__)
#define vp_notice(v, fmt, ...)	vp_devprintk(notice, v, fmt, ##__VA_ARGS__)
#define vp_info(v, fmt, ...)	vp_devprintk(info, v, fmt, ##__VA_ARGS__)
#define vp_dbg(v, fmt, ...)	vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)

/* One guest memory region; pages[] is a flexible array of its pages */
struct mshv_mem_region {
	struct hlist_node hnode;
	u64 nr_pages;
	u64 start_gfn;
	u64 start_uaddr;
	u32 hv_map_flags;
	struct {
		u64 large_pages:  1; /* 2MiB */
		u64 range_pinned: 1;
		u64 reserved:	 62;
	} flags;
	struct mshv_partition *partition;
	struct page *pages[];
};

struct mshv_irq_ack_notifier {
	struct hlist_node link;
	unsigned int irq_ack_gsi;
	void (*irq_acked)(struct mshv_irq_ack_notifier *mian);
};

/* Per child partition state */
struct mshv_partition {
	struct device *pt_module_dev;	/* device used by the pt_*()/vp_*() log helpers */

	struct hlist_node pt_hnode;
	u64 pt_id;
	refcount_t pt_ref_count;
	struct mutex pt_mutex;		/* held while ioeventfds_list is updated */
	struct hlist_head pt_mem_regions; /* not ordered */

	u32 pt_vp_count;
	struct mshv_vp *pt_vp_array[MSHV_MAX_VPS];

	struct mutex pt_irq_lock;	/* serializes replacement of pt_girq_tbl */
	struct srcu_struct pt_irq_srcu;	/* synchronized before an old pt_girq_tbl is freed */
	struct hlist_head irq_ack_notifier_list;

	struct hlist_head pt_devices;

	/*
	 * MSHV does not support more than one async hypercall in flight
	 * for a single partition. Thus, it is okay to define per partition
	 * async hypercall status.
	 */
	struct completion async_hypercall;
	u64 async_hypercall_status;

	spinlock_t	  pt_irqfds_lock;
	struct hlist_head pt_irqfds_list;
	struct mutex	  irqfds_resampler_lock;
	struct hlist_head irqfds_resampler_list;

	/* struct mshv_ioeventfd entries; walked under RCU by the doorbell callback */
	struct hlist_head ioeventfds_list;

	struct mshv_girq_routing_table __rcu *pt_girq_tbl;
	u64 isolation_type;
	bool import_completed;
	bool pt_initialized;
};

/*
 * pt_*() logging helpers: dev_*() on the partition's module device, with
 * every message prefixed by "p<partition id>: ".
 */
#define pt_fmt(fmt) "p%llu: " fmt
#define pt_devprintk(level, p, fmt, ...) \
do { \
	const struct mshv_partition *__pt = (p); \
	dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \
		    ##__VA_ARGS__); \
} while (0)
#define pt_emerg(p, fmt, ...)	pt_devprintk(emerg, p, fmt, ##__VA_ARGS__)
#define pt_crit(p, fmt, ...)	pt_devprintk(crit, p, fmt, ##__VA_ARGS__)
#define pt_alert(p, fmt, ...)	pt_devprintk(alert, p, fmt, ##__VA_ARGS__)
#define pt_err(p, fmt, ...)	pt_devprintk(err, p, fmt, ##__VA_ARGS__)
#define pt_warn(p, fmt, ...)	pt_devprintk(warn, p, fmt, ##__VA_ARGS__)
#define pt_notice(p, fmt, ...)	pt_devprintk(notice, p, fmt, ##__VA_ARGS__)
#define pt_info(p, fmt, ...)	pt_devprintk(info, p, fmt, ##__VA_ARGS__)
#define pt_dbg(p, fmt, ...)	pt_devprintk(dbg, p, fmt, ##__VA_ARGS__)

/* Decoded interrupt description filled in by mshv_copy_girq_info() */
struct mshv_lapic_irq {
	u32 lapic_vector;
	u64 lapic_apic_id;
	union hv_interrupt_control lapic_control;
};

/* GSIs at or above this value are rejected by mshv_update_routing_table() */
#define MSHV_MAX_GUEST_IRQS 4096

/* representation of one guest irq entry, either msi or legacy */
struct mshv_guest_irq_ent {
	u32 girq_entry_valid;	/* vfio looks at this */
	u32 guest_irq_num;	/* a unique number for each irq */
	u32 girq_addr_lo;	/* guest irq msi address info */
	u32 girq_addr_hi;
	u32 girq_irq_data;	/* idt vector in some cases */
};

/*
 * Routing table indexed by GSI.  num_rt_entries is the number of slots in
 * mshv_girq_info_tbl[] (highest routed GSI + 1).
 */
struct mshv_girq_routing_table {
	u32 num_rt_entries;
	struct mshv_guest_irq_ent mshv_girq_info_tbl[];
};

struct hv_synic_pages {
	struct hv_message_page *synic_message_page;
	struct hv_synic_event_flags_page *synic_event_flags_page;
	struct hv_synic_event_ring_page *synic_event_ring_page;
};

struct mshv_root {
	struct hv_synic_pages __percpu *synic_pages;
	spinlock_t pt_ht_lock;
	DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
};

/*
 * Callback for doorbell events.
 * NOTE: This is called in interrupt context. Callback
 * should defer slow and sleeping logic to later.
 */
typedef void (*doorbell_cb_t) (int doorbell_id, void *);

/*
 * port table information
 */
struct port_table_info {
	struct rcu_head portbl_rcu;	/* for kfree_rcu() in mshv_port_table_fini() */
	enum hv_port_type hv_port_type;
	union {
		struct {
			u64 reserved[2];
		} hv_port_message;
		struct {
			u64 reserved[2];
		} hv_port_event;
		struct {
			u64 reserved[2];
		} hv_port_monitor;
		struct {
			doorbell_cb_t doorbell_cb;
			void *data;
		} hv_port_doorbell;
	};
};

/* guest irq routing table (mshv_irq.c) */
int mshv_update_routing_table(struct mshv_partition *partition,
			      const struct mshv_user_irq_entry *entries,
			      unsigned int numents);
void mshv_free_routing_table(struct mshv_partition *partition);

struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition,
					      u32 irq_num);

void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq,
			 struct mshv_lapic_irq *dest_irq);

void mshv_irqfd_routing_update(struct mshv_partition *partition);

/* port id table (mshv_portid_table.c) */
void mshv_port_table_fini(void);
int mshv_portid_alloc(struct port_table_info *info);
int mshv_portid_lookup(int port_id, struct port_table_info *info);
void mshv_portid_free(int port_id);

/* a non-negative return value of mshv_register_doorbell() is the doorbell id */
int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
			   void *data, u64 gpa, u64 val, u64 flags);
void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);

void mshv_isr(void);
int mshv_synic_init(unsigned int cpu);
int mshv_synic_cleanup(unsigned int cpu);

/* true if the partition's isolation type is SNP */
static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
{
	return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP;
}

struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
void mshv_partition_put(struct mshv_partition *partition);
struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU);

/* hypercalls */

int hv_call_withdraw_memory(u64 count, int node, u64 partition_id);
int hv_call_create_partition(u64 flags,
			     struct hv_partition_creation_properties creation_properties,
			     union hv_partition_isolation_properties isolation_properties,
			     u64 *partition_id);
int hv_call_initialize_partition(u64 partition_id);
int hv_call_finalize_partition(u64 partition_id);
int hv_call_delete_partition(u64 partition_id);
int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs);
int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
			  u32 flags, struct page **pages);
int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
			    u32 flags);
int hv_call_delete_vp(u64 partition_id, u32 vp_index);
int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
				     u64 dest_addr,
				     union hv_interrupt_control control);
int hv_call_clear_virtual_interrupt(u64 partition_id);
int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
				  union hv_gpa_page_access_state_flags state_flags,
				  int *written_total,
				  union hv_gpa_page_access_state *states);
int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
			 struct hv_vp_state_data state_data,
			 /* Choose between pages and ret_output */
			 u64 page_count, struct page **pages,
			 union hv_output_get_vp_state *ret_output);
int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
			 /* Choose between pages and bytes */
			 struct hv_vp_state_data state_data, u64 page_count,
			 struct page **pages, u32 num_bytes, u8 *bytes);
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
			      union hv_input_vtl input_vtl,
			      struct page **state_page);
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
				union hv_input_vtl input_vtl);
int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
			u64 connection_partition_id, struct hv_port_info *port_info,
			u8 port_vtl, u8 min_connection_vtl, int node);
int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id);
int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
			 u64 connection_partition_id,
			 union hv_connection_id connection_id,
			 struct hv_connection_info *connection_info,
			 u8 connection_vtl, int node);
int hv_call_disconnect_port(u64 connection_partition_id,
			    union hv_connection_id connection_id);
int hv_call_notify_port_ring_empty(u32 sint_index);
int hv_call_map_stat_page(enum hv_stats_object_type type,
			  const union hv_stats_object_identity *identity,
			  void **addr);
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
			    const union hv_stats_object_identity *identity);
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
				   u64 page_struct_count, u32 host_access,
				   u32 flags, u8 acquire);

extern struct mshv_root mshv_root;
extern enum hv_scheduler_type hv_scheduler_type;
extern u8 * __percpu *hv_synic_eventring_tail;

#endif /* _MSHV_ROOT_H_ */
+849
drivers/hv/mshv_root_hv_call.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2023, Microsoft Corporation. 4 + * 5 + * Hypercall helper functions used by the mshv_root module. 6 + * 7 + * Authors: Microsoft Linux virtualization team 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/mm.h> 12 + #include <asm/mshyperv.h> 13 + 14 + #include "mshv_root.h" 15 + 16 + /* Determined empirically */ 17 + #define HV_INIT_PARTITION_DEPOSIT_PAGES 208 18 + #define HV_MAP_GPA_DEPOSIT_PAGES 256 19 + #define HV_UMAP_GPA_PAGES 512 20 + 21 + #define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1))) 22 + 23 + #define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64)) 24 + #define HV_MAP_GPA_BATCH_SIZE \ 25 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \ 26 + / sizeof(u64)) 27 + #define HV_GET_VP_STATE_BATCH_SIZE \ 28 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \ 29 + / sizeof(u64)) 30 + #define HV_SET_VP_STATE_BATCH_SIZE \ 31 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \ 32 + / sizeof(u64)) 33 + #define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \ 34 + ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \ 35 + / sizeof(union hv_gpa_page_access_state)) 36 + #define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \ 37 + ((HV_HYP_PAGE_SIZE - \ 38 + sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \ 39 + sizeof(u64)) 40 + 41 + int hv_call_withdraw_memory(u64 count, int node, u64 partition_id) 42 + { 43 + struct hv_input_withdraw_memory *input_page; 44 + struct hv_output_withdraw_memory *output_page; 45 + struct page *page; 46 + u16 completed; 47 + unsigned long remaining = count; 48 + u64 status; 49 + int i; 50 + unsigned long flags; 51 + 52 + page = alloc_page(GFP_KERNEL); 53 + if (!page) 54 + return -ENOMEM; 55 + output_page = page_address(page); 56 + 57 + while (remaining) { 58 + local_irq_save(flags); 59 + 60 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 61 + 62 + 
memset(input_page, 0, sizeof(*input_page)); 63 + input_page->partition_id = partition_id; 64 + status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY, 65 + min(remaining, HV_WITHDRAW_BATCH_SIZE), 66 + 0, input_page, output_page); 67 + 68 + local_irq_restore(flags); 69 + 70 + completed = hv_repcomp(status); 71 + 72 + for (i = 0; i < completed; i++) 73 + __free_page(pfn_to_page(output_page->gpa_page_list[i])); 74 + 75 + if (!hv_result_success(status)) { 76 + if (hv_result(status) == HV_STATUS_NO_RESOURCES) 77 + status = HV_STATUS_SUCCESS; 78 + break; 79 + } 80 + 81 + remaining -= completed; 82 + } 83 + free_page((unsigned long)output_page); 84 + 85 + return hv_result_to_errno(status); 86 + } 87 + 88 + int hv_call_create_partition(u64 flags, 89 + struct hv_partition_creation_properties creation_properties, 90 + union hv_partition_isolation_properties isolation_properties, 91 + u64 *partition_id) 92 + { 93 + struct hv_input_create_partition *input; 94 + struct hv_output_create_partition *output; 95 + u64 status; 96 + int ret; 97 + unsigned long irq_flags; 98 + 99 + do { 100 + local_irq_save(irq_flags); 101 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 102 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 103 + 104 + memset(input, 0, sizeof(*input)); 105 + input->flags = flags; 106 + input->compatibility_version = HV_COMPATIBILITY_21_H2; 107 + 108 + memcpy(&input->partition_creation_properties, &creation_properties, 109 + sizeof(creation_properties)); 110 + 111 + memcpy(&input->isolation_properties, &isolation_properties, 112 + sizeof(isolation_properties)); 113 + 114 + status = hv_do_hypercall(HVCALL_CREATE_PARTITION, 115 + input, output); 116 + 117 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 118 + if (hv_result_success(status)) 119 + *partition_id = output->partition_id; 120 + local_irq_restore(irq_flags); 121 + ret = hv_result_to_errno(status); 122 + break; 123 + } 124 + local_irq_restore(irq_flags); 125 + ret = hv_call_deposit_pages(NUMA_NO_NODE, 126 
+ hv_current_partition_id, 1); 127 + } while (!ret); 128 + 129 + return ret; 130 + } 131 + 132 + int hv_call_initialize_partition(u64 partition_id) 133 + { 134 + struct hv_input_initialize_partition input; 135 + u64 status; 136 + int ret; 137 + 138 + input.partition_id = partition_id; 139 + 140 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 141 + HV_INIT_PARTITION_DEPOSIT_PAGES); 142 + if (ret) 143 + return ret; 144 + 145 + do { 146 + status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, 147 + *(u64 *)&input); 148 + 149 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 150 + ret = hv_result_to_errno(status); 151 + break; 152 + } 153 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); 154 + } while (!ret); 155 + 156 + return ret; 157 + } 158 + 159 + int hv_call_finalize_partition(u64 partition_id) 160 + { 161 + struct hv_input_finalize_partition input; 162 + u64 status; 163 + 164 + input.partition_id = partition_id; 165 + status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION, 166 + *(u64 *)&input); 167 + 168 + return hv_result_to_errno(status); 169 + } 170 + 171 + int hv_call_delete_partition(u64 partition_id) 172 + { 173 + struct hv_input_delete_partition input; 174 + u64 status; 175 + 176 + input.partition_id = partition_id; 177 + status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input); 178 + 179 + return hv_result_to_errno(status); 180 + } 181 + 182 + /* Ask the hypervisor to map guest ram pages or the guest mmio space */ 183 + static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, 184 + u32 flags, struct page **pages, u64 mmio_spa) 185 + { 186 + struct hv_input_map_gpa_pages *input_page; 187 + u64 status, *pfnlist; 188 + unsigned long irq_flags, large_shift = 0; 189 + int ret = 0, done = 0; 190 + u64 page_count = page_struct_count; 191 + 192 + if (page_count == 0 || (pages && mmio_spa)) 193 + return -EINVAL; 194 + 195 + if (flags & HV_MAP_GPA_LARGE_PAGE) { 196 + if (mmio_spa) 197 + 
return -EINVAL; 198 + 199 + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) 200 + return -EINVAL; 201 + 202 + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; 203 + page_count >>= large_shift; 204 + } 205 + 206 + while (done < page_count) { 207 + ulong i, completed, remain = page_count - done; 208 + int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE); 209 + 210 + local_irq_save(irq_flags); 211 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 212 + 213 + input_page->target_partition_id = partition_id; 214 + input_page->target_gpa_base = gfn + (done << large_shift); 215 + input_page->map_flags = flags; 216 + pfnlist = input_page->source_gpa_page_list; 217 + 218 + for (i = 0; i < rep_count; i++) 219 + if (flags & HV_MAP_GPA_NO_ACCESS) { 220 + pfnlist[i] = 0; 221 + } else if (pages) { 222 + u64 index = (done + i) << large_shift; 223 + 224 + if (index >= page_struct_count) { 225 + ret = -EINVAL; 226 + break; 227 + } 228 + pfnlist[i] = page_to_pfn(pages[index]); 229 + } else { 230 + pfnlist[i] = mmio_spa + done + i; 231 + } 232 + if (ret) 233 + break; 234 + 235 + status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0, 236 + input_page, NULL); 237 + local_irq_restore(irq_flags); 238 + 239 + completed = hv_repcomp(status); 240 + 241 + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { 242 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 243 + HV_MAP_GPA_DEPOSIT_PAGES); 244 + if (ret) 245 + break; 246 + 247 + } else if (!hv_result_success(status)) { 248 + ret = hv_result_to_errno(status); 249 + break; 250 + } 251 + 252 + done += completed; 253 + } 254 + 255 + if (ret && done) { 256 + u32 unmap_flags = 0; 257 + 258 + if (flags & HV_MAP_GPA_LARGE_PAGE) 259 + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; 260 + hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags); 261 + } 262 + 263 + return ret; 264 + } 265 + 266 + /* Ask the hypervisor to map guest ram pages */ 267 + int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, 
268 + u32 flags, struct page **pages) 269 + { 270 + return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count, 271 + flags, pages, 0); 272 + } 273 + 274 + /* Ask the hypervisor to map guest mmio space */ 275 + int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs) 276 + { 277 + int i; 278 + u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE | 279 + HV_MAP_GPA_NOT_CACHED; 280 + 281 + for (i = 0; i < numpgs; i++) 282 + if (page_is_ram(mmio_spa + i)) 283 + return -EINVAL; 284 + 285 + return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL, 286 + mmio_spa); 287 + } 288 + 289 + int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k, 290 + u32 flags) 291 + { 292 + struct hv_input_unmap_gpa_pages *input_page; 293 + u64 status, page_count = page_count_4k; 294 + unsigned long irq_flags, large_shift = 0; 295 + int ret = 0, done = 0; 296 + 297 + if (page_count == 0) 298 + return -EINVAL; 299 + 300 + if (flags & HV_UNMAP_GPA_LARGE_PAGE) { 301 + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) 302 + return -EINVAL; 303 + 304 + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; 305 + page_count >>= large_shift; 306 + } 307 + 308 + while (done < page_count) { 309 + ulong completed, remain = page_count - done; 310 + int rep_count = min(remain, HV_UMAP_GPA_PAGES); 311 + 312 + local_irq_save(irq_flags); 313 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 314 + 315 + input_page->target_partition_id = partition_id; 316 + input_page->target_gpa_base = gfn + (done << large_shift); 317 + input_page->unmap_flags = flags; 318 + status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count, 319 + 0, input_page, NULL); 320 + local_irq_restore(irq_flags); 321 + 322 + completed = hv_repcomp(status); 323 + if (!hv_result_success(status)) { 324 + ret = hv_result_to_errno(status); 325 + break; 326 + } 327 + 328 + done += completed; 329 + } 330 + 331 + return ret; 332 + } 333 + 334 + int hv_call_get_gpa_access_states(u64 
partition_id, u32 count, u64 gpa_base_pfn, 335 + union hv_gpa_page_access_state_flags state_flags, 336 + int *written_total, 337 + union hv_gpa_page_access_state *states) 338 + { 339 + struct hv_input_get_gpa_pages_access_state *input_page; 340 + union hv_gpa_page_access_state *output_page; 341 + int completed = 0; 342 + unsigned long remaining = count; 343 + int rep_count, i; 344 + u64 status = 0; 345 + unsigned long flags; 346 + 347 + *written_total = 0; 348 + while (remaining) { 349 + local_irq_save(flags); 350 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 351 + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); 352 + 353 + input_page->partition_id = partition_id; 354 + input_page->hv_gpa_page_number = gpa_base_pfn + *written_total; 355 + input_page->flags = state_flags; 356 + rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE); 357 + 358 + status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count, 359 + 0, input_page, output_page); 360 + if (!hv_result_success(status)) { 361 + local_irq_restore(flags); 362 + break; 363 + } 364 + completed = hv_repcomp(status); 365 + for (i = 0; i < completed; ++i) 366 + states[i].as_uint8 = output_page[i].as_uint8; 367 + 368 + local_irq_restore(flags); 369 + states += completed; 370 + *written_total += completed; 371 + remaining -= completed; 372 + } 373 + 374 + return hv_result_to_errno(status); 375 + } 376 + 377 + int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, 378 + u64 dest_addr, 379 + union hv_interrupt_control control) 380 + { 381 + struct hv_input_assert_virtual_interrupt *input; 382 + unsigned long flags; 383 + u64 status; 384 + 385 + local_irq_save(flags); 386 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 387 + memset(input, 0, sizeof(*input)); 388 + input->partition_id = partition_id; 389 + input->vector = vector; 390 + input->dest_addr = dest_addr; 391 + input->control = control; 392 + status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); 
393 + local_irq_restore(flags); 394 + 395 + return hv_result_to_errno(status); 396 + } 397 + 398 + int hv_call_delete_vp(u64 partition_id, u32 vp_index) 399 + { 400 + union hv_input_delete_vp input = {}; 401 + u64 status; 402 + 403 + input.partition_id = partition_id; 404 + input.vp_index = vp_index; 405 + 406 + status = hv_do_fast_hypercall16(HVCALL_DELETE_VP, 407 + input.as_uint64[0], input.as_uint64[1]); 408 + 409 + return hv_result_to_errno(status); 410 + } 411 + EXPORT_SYMBOL_GPL(hv_call_delete_vp); 412 + 413 + int hv_call_get_vp_state(u32 vp_index, u64 partition_id, 414 + struct hv_vp_state_data state_data, 415 + /* Choose between pages and ret_output */ 416 + u64 page_count, struct page **pages, 417 + union hv_output_get_vp_state *ret_output) 418 + { 419 + struct hv_input_get_vp_state *input; 420 + union hv_output_get_vp_state *output; 421 + u64 status; 422 + int i; 423 + u64 control; 424 + unsigned long flags; 425 + int ret = 0; 426 + 427 + if (page_count > HV_GET_VP_STATE_BATCH_SIZE) 428 + return -EINVAL; 429 + 430 + if (!page_count && !ret_output) 431 + return -EINVAL; 432 + 433 + do { 434 + local_irq_save(flags); 435 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 436 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 437 + memset(input, 0, sizeof(*input)); 438 + memset(output, 0, sizeof(*output)); 439 + 440 + input->partition_id = partition_id; 441 + input->vp_index = vp_index; 442 + input->state_data = state_data; 443 + for (i = 0; i < page_count; i++) 444 + input->output_data_pfns[i] = page_to_pfn(pages[i]); 445 + 446 + control = (HVCALL_GET_VP_STATE) | 447 + (page_count << HV_HYPERCALL_VARHEAD_OFFSET); 448 + 449 + status = hv_do_hypercall(control, input, output); 450 + 451 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 452 + if (hv_result_success(status) && ret_output) 453 + memcpy(ret_output, output, sizeof(*output)); 454 + 455 + local_irq_restore(flags); 456 + ret = hv_result_to_errno(status); 457 + break; 458 + } 459 + 
local_irq_restore(flags); 460 + 461 + ret = hv_call_deposit_pages(NUMA_NO_NODE, 462 + partition_id, 1); 463 + } while (!ret); 464 + 465 + return ret; 466 + } 467 + 468 + int hv_call_set_vp_state(u32 vp_index, u64 partition_id, 469 + /* Choose between pages and bytes */ 470 + struct hv_vp_state_data state_data, u64 page_count, 471 + struct page **pages, u32 num_bytes, u8 *bytes) 472 + { 473 + struct hv_input_set_vp_state *input; 474 + u64 status; 475 + int i; 476 + u64 control; 477 + unsigned long flags; 478 + int ret = 0; 479 + u16 varhead_sz; 480 + 481 + if (page_count > HV_SET_VP_STATE_BATCH_SIZE) 482 + return -EINVAL; 483 + if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE) 484 + return -EINVAL; 485 + 486 + if (num_bytes) 487 + /* round up to 8 and divide by 8 */ 488 + varhead_sz = (num_bytes + 7) >> 3; 489 + else if (page_count) 490 + varhead_sz = page_count; 491 + else 492 + return -EINVAL; 493 + 494 + do { 495 + local_irq_save(flags); 496 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 497 + memset(input, 0, sizeof(*input)); 498 + 499 + input->partition_id = partition_id; 500 + input->vp_index = vp_index; 501 + input->state_data = state_data; 502 + if (num_bytes) { 503 + memcpy((u8 *)input->data, bytes, num_bytes); 504 + } else { 505 + for (i = 0; i < page_count; i++) 506 + input->data[i].pfns = page_to_pfn(pages[i]); 507 + } 508 + 509 + control = (HVCALL_SET_VP_STATE) | 510 + (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET); 511 + 512 + status = hv_do_hypercall(control, input, NULL); 513 + 514 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 515 + local_irq_restore(flags); 516 + ret = hv_result_to_errno(status); 517 + break; 518 + } 519 + local_irq_restore(flags); 520 + 521 + ret = hv_call_deposit_pages(NUMA_NO_NODE, 522 + partition_id, 1); 523 + } while (!ret); 524 + 525 + return ret; 526 + } 527 + 528 + int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, 529 + union hv_input_vtl input_vtl, 530 + struct page **state_page) 531 + { 
532 + struct hv_input_map_vp_state_page *input; 533 + struct hv_output_map_vp_state_page *output; 534 + u64 status; 535 + int ret; 536 + unsigned long flags; 537 + 538 + do { 539 + local_irq_save(flags); 540 + 541 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 542 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 543 + 544 + input->partition_id = partition_id; 545 + input->vp_index = vp_index; 546 + input->type = type; 547 + input->input_vtl = input_vtl; 548 + 549 + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output); 550 + 551 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 552 + if (hv_result_success(status)) 553 + *state_page = pfn_to_page(output->map_location); 554 + local_irq_restore(flags); 555 + ret = hv_result_to_errno(status); 556 + break; 557 + } 558 + 559 + local_irq_restore(flags); 560 + 561 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); 562 + } while (!ret); 563 + 564 + return ret; 565 + } 566 + 567 + int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, 568 + union hv_input_vtl input_vtl) 569 + { 570 + unsigned long flags; 571 + u64 status; 572 + struct hv_input_unmap_vp_state_page *input; 573 + 574 + local_irq_save(flags); 575 + 576 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 577 + 578 + memset(input, 0, sizeof(*input)); 579 + 580 + input->partition_id = partition_id; 581 + input->vp_index = vp_index; 582 + input->type = type; 583 + input->input_vtl = input_vtl; 584 + 585 + status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL); 586 + 587 + local_irq_restore(flags); 588 + 589 + return hv_result_to_errno(status); 590 + } 591 + 592 + int 593 + hv_call_clear_virtual_interrupt(u64 partition_id) 594 + { 595 + int status; 596 + 597 + status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT, 598 + partition_id); 599 + 600 + return hv_result_to_errno(status); 601 + } 602 + 603 + int 604 + hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, 605 + u64 
connection_partition_id, 606 + struct hv_port_info *port_info, 607 + u8 port_vtl, u8 min_connection_vtl, int node) 608 + { 609 + struct hv_input_create_port *input; 610 + unsigned long flags; 611 + int ret = 0; 612 + int status; 613 + 614 + do { 615 + local_irq_save(flags); 616 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 617 + memset(input, 0, sizeof(*input)); 618 + 619 + input->port_partition_id = port_partition_id; 620 + input->port_id = port_id; 621 + input->connection_partition_id = connection_partition_id; 622 + input->port_info = *port_info; 623 + input->port_vtl = port_vtl; 624 + input->min_connection_vtl = min_connection_vtl; 625 + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); 626 + status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL); 627 + local_irq_restore(flags); 628 + if (hv_result_success(status)) 629 + break; 630 + 631 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 632 + ret = hv_result_to_errno(status); 633 + break; 634 + } 635 + ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1); 636 + 637 + } while (!ret); 638 + 639 + return ret; 640 + } 641 + 642 + int 643 + hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id) 644 + { 645 + union hv_input_delete_port input = { 0 }; 646 + int status; 647 + 648 + input.port_partition_id = port_partition_id; 649 + input.port_id = port_id; 650 + status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT, 651 + input.as_uint64[0], 652 + input.as_uint64[1]); 653 + 654 + return hv_result_to_errno(status); 655 + } 656 + 657 + int 658 + hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, 659 + u64 connection_partition_id, 660 + union hv_connection_id connection_id, 661 + struct hv_connection_info *connection_info, 662 + u8 connection_vtl, int node) 663 + { 664 + struct hv_input_connect_port *input; 665 + unsigned long flags; 666 + int ret = 0, status; 667 + 668 + do { 669 + local_irq_save(flags); 670 + input = 
*this_cpu_ptr(hyperv_pcpu_input_arg); 671 + memset(input, 0, sizeof(*input)); 672 + input->port_partition_id = port_partition_id; 673 + input->port_id = port_id; 674 + input->connection_partition_id = connection_partition_id; 675 + input->connection_id = connection_id; 676 + input->connection_info = *connection_info; 677 + input->connection_vtl = connection_vtl; 678 + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); 679 + status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL); 680 + 681 + local_irq_restore(flags); 682 + if (hv_result_success(status)) 683 + break; 684 + 685 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 686 + ret = hv_result_to_errno(status); 687 + break; 688 + } 689 + ret = hv_call_deposit_pages(NUMA_NO_NODE, 690 + connection_partition_id, 1); 691 + } while (!ret); 692 + 693 + return ret; 694 + } 695 + 696 + int 697 + hv_call_disconnect_port(u64 connection_partition_id, 698 + union hv_connection_id connection_id) 699 + { 700 + union hv_input_disconnect_port input = { 0 }; 701 + int status; 702 + 703 + input.connection_partition_id = connection_partition_id; 704 + input.connection_id = connection_id; 705 + input.is_doorbell = 1; 706 + status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT, 707 + input.as_uint64[0], 708 + input.as_uint64[1]); 709 + 710 + return hv_result_to_errno(status); 711 + } 712 + 713 + int 714 + hv_call_notify_port_ring_empty(u32 sint_index) 715 + { 716 + union hv_input_notify_port_ring_empty input = { 0 }; 717 + int status; 718 + 719 + input.sint_index = sint_index; 720 + status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY, 721 + input.as_uint64); 722 + 723 + return hv_result_to_errno(status); 724 + } 725 + 726 + int hv_call_map_stat_page(enum hv_stats_object_type type, 727 + const union hv_stats_object_identity *identity, 728 + void **addr) 729 + { 730 + unsigned long flags; 731 + struct hv_input_map_stats_page *input; 732 + struct hv_output_map_stats_page *output; 733 + u64 status, 
pfn; 734 + int ret = 0; 735 + 736 + do { 737 + local_irq_save(flags); 738 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 739 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 740 + 741 + memset(input, 0, sizeof(*input)); 742 + input->type = type; 743 + input->identity = *identity; 744 + 745 + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output); 746 + pfn = output->map_location; 747 + 748 + local_irq_restore(flags); 749 + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { 750 + ret = hv_result_to_errno(status); 751 + if (hv_result_success(status)) 752 + break; 753 + return ret; 754 + } 755 + 756 + ret = hv_call_deposit_pages(NUMA_NO_NODE, 757 + hv_current_partition_id, 1); 758 + if (ret) 759 + return ret; 760 + } while (!ret); 761 + 762 + *addr = page_address(pfn_to_page(pfn)); 763 + 764 + return ret; 765 + } 766 + 767 + int hv_call_unmap_stat_page(enum hv_stats_object_type type, 768 + const union hv_stats_object_identity *identity) 769 + { 770 + unsigned long flags; 771 + struct hv_input_unmap_stats_page *input; 772 + u64 status; 773 + 774 + local_irq_save(flags); 775 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 776 + 777 + memset(input, 0, sizeof(*input)); 778 + input->type = type; 779 + input->identity = *identity; 780 + 781 + status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL); 782 + local_irq_restore(flags); 783 + 784 + return hv_result_to_errno(status); 785 + } 786 + 787 + int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, 788 + u64 page_struct_count, u32 host_access, 789 + u32 flags, u8 acquire) 790 + { 791 + struct hv_input_modify_sparse_spa_page_host_access *input_page; 792 + u64 status; 793 + int done = 0; 794 + unsigned long irq_flags, large_shift = 0; 795 + u64 page_count = page_struct_count; 796 + u16 code = acquire ? 
HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS : 797 + HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS; 798 + 799 + if (page_count == 0) 800 + return -EINVAL; 801 + 802 + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) { 803 + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) 804 + return -EINVAL; 805 + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; 806 + page_count >>= large_shift; 807 + } 808 + 809 + while (done < page_count) { 810 + ulong i, completed, remain = page_count - done; 811 + int rep_count = min(remain, 812 + HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT); 813 + 814 + local_irq_save(irq_flags); 815 + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); 816 + 817 + memset(input_page, 0, sizeof(*input_page)); 818 + /* Only set the partition id if you are making the pages 819 + * exclusive 820 + */ 821 + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE) 822 + input_page->partition_id = partition_id; 823 + input_page->flags = flags; 824 + input_page->host_access = host_access; 825 + 826 + for (i = 0; i < rep_count; i++) { 827 + u64 index = (done + i) << large_shift; 828 + 829 + if (index >= page_struct_count) 830 + return -EINVAL; 831 + 832 + input_page->spa_page_list[i] = 833 + page_to_pfn(pages[index]); 834 + } 835 + 836 + status = hv_do_rep_hypercall(code, rep_count, 0, input_page, 837 + NULL); 838 + local_irq_restore(irq_flags); 839 + 840 + completed = hv_repcomp(status); 841 + 842 + if (!hv_result_success(status)) 843 + return hv_result_to_errno(status); 844 + 845 + done += completed; 846 + } 847 + 848 + return 0; 849 + }
+2307
drivers/hv/mshv_root_main.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2024, Microsoft Corporation. 4 + * 5 + * The main part of the mshv_root module, providing APIs to create 6 + * and manage guest partitions. 7 + * 8 + * Authors: Microsoft Linux virtualization team 9 + */ 10 + 11 + #include <linux/kernel.h> 12 + #include <linux/module.h> 13 + #include <linux/fs.h> 14 + #include <linux/miscdevice.h> 15 + #include <linux/slab.h> 16 + #include <linux/file.h> 17 + #include <linux/anon_inodes.h> 18 + #include <linux/mm.h> 19 + #include <linux/io.h> 20 + #include <linux/cpuhotplug.h> 21 + #include <linux/random.h> 22 + #include <asm/mshyperv.h> 23 + #include <linux/hyperv.h> 24 + #include <linux/notifier.h> 25 + #include <linux/reboot.h> 26 + #include <linux/kexec.h> 27 + #include <linux/page-flags.h> 28 + #include <linux/crash_dump.h> 29 + #include <linux/panic_notifier.h> 30 + #include <linux/vmalloc.h> 31 + 32 + #include "mshv_eventfd.h" 33 + #include "mshv.h" 34 + #include "mshv_root.h" 35 + 36 + MODULE_AUTHOR("Microsoft"); 37 + MODULE_LICENSE("GPL"); 38 + MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); 39 + 40 + /* TODO move this to mshyperv.h when needed outside driver */ 41 + static inline bool hv_parent_partition(void) 42 + { 43 + return hv_root_partition(); 44 + } 45 + 46 + /* TODO move this to another file when debugfs code is added */ 47 + enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ 48 + #if defined(CONFIG_X86) 49 + VpRootDispatchThreadBlocked = 201, 50 + #elif defined(CONFIG_ARM64) 51 + VpRootDispatchThreadBlocked = 94, 52 + #endif 53 + VpStatsMaxCounter 54 + }; 55 + 56 + struct hv_stats_page { 57 + union { 58 + u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */ 59 + u8 data[HV_HYP_PAGE_SIZE]; 60 + }; 61 + } __packed; 62 + 63 + struct mshv_root mshv_root; 64 + 65 + enum hv_scheduler_type hv_scheduler_type; 66 + 67 + /* Once we implement the fast extended hypercall ABI they can go away. 
*/ 68 + static void * __percpu *root_scheduler_input; 69 + static void * __percpu *root_scheduler_output; 70 + 71 + static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 72 + static int mshv_dev_open(struct inode *inode, struct file *filp); 73 + static int mshv_dev_release(struct inode *inode, struct file *filp); 74 + static int mshv_vp_release(struct inode *inode, struct file *filp); 75 + static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 76 + static int mshv_partition_release(struct inode *inode, struct file *filp); 77 + static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); 78 + static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); 79 + static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); 80 + static int mshv_init_async_handler(struct mshv_partition *partition); 81 + static void mshv_async_hvcall_handler(void *data, u64 *status); 82 + 83 + static const union hv_input_vtl input_vtl_zero; 84 + static const union hv_input_vtl input_vtl_normal = { 85 + .target_vtl = HV_NORMAL_VTL, 86 + .use_target_vtl = 1, 87 + }; 88 + 89 + static const struct vm_operations_struct mshv_vp_vm_ops = { 90 + .fault = mshv_vp_fault, 91 + }; 92 + 93 + static const struct file_operations mshv_vp_fops = { 94 + .owner = THIS_MODULE, 95 + .release = mshv_vp_release, 96 + .unlocked_ioctl = mshv_vp_ioctl, 97 + .llseek = noop_llseek, 98 + .mmap = mshv_vp_mmap, 99 + }; 100 + 101 + static const struct file_operations mshv_partition_fops = { 102 + .owner = THIS_MODULE, 103 + .release = mshv_partition_release, 104 + .unlocked_ioctl = mshv_partition_ioctl, 105 + .llseek = noop_llseek, 106 + }; 107 + 108 + static const struct file_operations mshv_dev_fops = { 109 + .owner = THIS_MODULE, 110 + .open = mshv_dev_open, 111 + .release = mshv_dev_release, 112 + .unlocked_ioctl = mshv_dev_ioctl, 113 + .llseek = noop_llseek, 114 + }; 115 + 116 + static struct miscdevice mshv_dev = { 117 + 
.minor = MISC_DYNAMIC_MINOR, 118 + .name = "mshv", 119 + .fops = &mshv_dev_fops, 120 + .mode = 0600, 121 + }; 122 + 123 + /* 124 + * Only allow hypercalls that have a u64 partition id as the first member of 125 + * the input structure. 126 + * These are sorted by value. 127 + */ 128 + static u16 mshv_passthru_hvcalls[] = { 129 + HVCALL_GET_PARTITION_PROPERTY, 130 + HVCALL_SET_PARTITION_PROPERTY, 131 + HVCALL_INSTALL_INTERCEPT, 132 + HVCALL_GET_VP_REGISTERS, 133 + HVCALL_SET_VP_REGISTERS, 134 + HVCALL_TRANSLATE_VIRTUAL_ADDRESS, 135 + HVCALL_CLEAR_VIRTUAL_INTERRUPT, 136 + HVCALL_REGISTER_INTERCEPT_RESULT, 137 + HVCALL_ASSERT_VIRTUAL_INTERRUPT, 138 + HVCALL_GET_GPA_PAGES_ACCESS_STATES, 139 + HVCALL_SIGNAL_EVENT_DIRECT, 140 + HVCALL_POST_MESSAGE_DIRECT, 141 + HVCALL_GET_VP_CPUID_VALUES, 142 + }; 143 + 144 + static bool mshv_hvcall_is_async(u16 code) 145 + { 146 + switch (code) { 147 + case HVCALL_SET_PARTITION_PROPERTY: 148 + return true; 149 + default: 150 + break; 151 + } 152 + return false; 153 + } 154 + 155 + static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, 156 + bool partition_locked, 157 + void __user *user_args) 158 + { 159 + u64 status; 160 + int ret = 0, i; 161 + bool is_async; 162 + struct mshv_root_hvcall args; 163 + struct page *page; 164 + unsigned int pages_order; 165 + void *input_pg = NULL; 166 + void *output_pg = NULL; 167 + 168 + if (copy_from_user(&args, user_args, sizeof(args))) 169 + return -EFAULT; 170 + 171 + if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || 172 + mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) 173 + return -EINVAL; 174 + 175 + if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) 176 + return -EINVAL; 177 + 178 + for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) 179 + if (args.code == mshv_passthru_hvcalls[i]) 180 + break; 181 + 182 + if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) 183 + return -EINVAL; 184 + 185 + is_async = mshv_hvcall_is_async(args.code); 
186 + if (is_async) { 187 + /* async hypercalls can only be called from partition fd */ 188 + if (!partition_locked) 189 + return -EINVAL; 190 + ret = mshv_init_async_handler(partition); 191 + if (ret) 192 + return ret; 193 + } 194 + 195 + pages_order = args.out_ptr ? 1 : 0; 196 + page = alloc_pages(GFP_KERNEL, pages_order); 197 + if (!page) 198 + return -ENOMEM; 199 + input_pg = page_address(page); 200 + 201 + if (args.out_ptr) 202 + output_pg = (char *)input_pg + PAGE_SIZE; 203 + else 204 + output_pg = NULL; 205 + 206 + if (copy_from_user(input_pg, (void __user *)args.in_ptr, 207 + args.in_sz)) { 208 + ret = -EFAULT; 209 + goto free_pages_out; 210 + } 211 + 212 + /* 213 + * NOTE: This only works because all the allowed hypercalls' input 214 + * structs begin with a u64 partition_id field. 215 + */ 216 + *(u64 *)input_pg = partition->pt_id; 217 + 218 + if (args.reps) 219 + status = hv_do_rep_hypercall(args.code, args.reps, 0, 220 + input_pg, output_pg); 221 + else 222 + status = hv_do_hypercall(args.code, input_pg, output_pg); 223 + 224 + if (hv_result(status) == HV_STATUS_CALL_PENDING) { 225 + if (is_async) { 226 + mshv_async_hvcall_handler(partition, &status); 227 + } else { /* Paranoia check. This shouldn't happen! */ 228 + ret = -EBADFD; 229 + goto free_pages_out; 230 + } 231 + } 232 + 233 + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { 234 + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); 235 + if (!ret) 236 + ret = -EAGAIN; 237 + } else if (!hv_result_success(status)) { 238 + ret = hv_result_to_errno(status); 239 + } 240 + 241 + /* 242 + * Always return the status and output data regardless of result. 243 + * The VMM may need it to determine how to proceed. E.g. the status may 244 + * contain the number of reps completed if a rep hypercall partially 245 + * succeeded. 246 + */ 247 + args.status = hv_result(status); 248 + args.reps = args.reps ? 
hv_repcomp(status) : 0; 249 + if (copy_to_user(user_args, &args, sizeof(args))) 250 + ret = -EFAULT; 251 + 252 + if (output_pg && 253 + copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) 254 + ret = -EFAULT; 255 + 256 + free_pages_out: 257 + free_pages((unsigned long)input_pg, pages_order); 258 + 259 + return ret; 260 + } 261 + 262 + static inline bool is_ghcb_mapping_available(void) 263 + { 264 + #if IS_ENABLED(CONFIG_X86_64) 265 + return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; 266 + #else 267 + return 0; 268 + #endif 269 + } 270 + 271 + static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, 272 + struct hv_register_assoc *registers) 273 + { 274 + return hv_call_get_vp_registers(vp_index, partition_id, 275 + count, input_vtl_zero, registers); 276 + } 277 + 278 + static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, 279 + struct hv_register_assoc *registers) 280 + { 281 + return hv_call_set_vp_registers(vp_index, partition_id, 282 + count, input_vtl_zero, registers); 283 + } 284 + 285 + /* 286 + * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by 287 + * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, 288 + * done by the hypervisor. 289 + * "Intercept" suspend leads to asynchronous message delivery to dom0 which 290 + * should be awaited to keep the VP loop consistent (i.e. no message pending 291 + * upon VP resume). 292 + * VP intercept suspend can't be done when the VP is explicitly suspended 293 + * already, and thus can be only two possible race scenarios: 294 + * 1. implicit suspend bit set -> explicit suspend bit set -> message sent 295 + * 2. implicit suspend bit set -> message sent -> explicit suspend bit set 296 + * Checking for implicit suspend bit set after explicit suspend request has 297 + * succeeded in either case allows us to reliably identify, if there is a 298 + * message to receive and deliver to VMM. 
299 + */ 300 + static int 301 + mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) 302 + { 303 + struct hv_register_assoc explicit_suspend = { 304 + .name = HV_REGISTER_EXPLICIT_SUSPEND 305 + }; 306 + struct hv_register_assoc intercept_suspend = { 307 + .name = HV_REGISTER_INTERCEPT_SUSPEND 308 + }; 309 + union hv_explicit_suspend_register *es = 310 + &explicit_suspend.value.explicit_suspend; 311 + union hv_intercept_suspend_register *is = 312 + &intercept_suspend.value.intercept_suspend; 313 + int ret; 314 + 315 + es->suspended = 1; 316 + 317 + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 318 + 1, &explicit_suspend); 319 + if (ret) { 320 + vp_err(vp, "Failed to explicitly suspend vCPU\n"); 321 + return ret; 322 + } 323 + 324 + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 325 + 1, &intercept_suspend); 326 + if (ret) { 327 + vp_err(vp, "Failed to get intercept suspend state\n"); 328 + return ret; 329 + } 330 + 331 + *message_in_flight = is->suspended; 332 + 333 + return 0; 334 + } 335 + 336 + /* 337 + * This function is used when VPs are scheduled by the hypervisor's 338 + * scheduler. 339 + * 340 + * Caller has to make sure the registers contain cleared 341 + * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers 342 + * exactly in this order (the hypervisor clears them sequentially) to avoid 343 + * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND 344 + * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the 345 + * opposite order. 
346 + */ 347 + static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) 348 + { 349 + long ret; 350 + struct hv_register_assoc suspend_regs[2] = { 351 + { .name = HV_REGISTER_INTERCEPT_SUSPEND }, 352 + { .name = HV_REGISTER_EXPLICIT_SUSPEND } 353 + }; 354 + size_t count = ARRAY_SIZE(suspend_regs); 355 + 356 + /* Resume VP execution */ 357 + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 358 + count, suspend_regs); 359 + if (ret) { 360 + vp_err(vp, "Failed to resume vp execution. %lx\n", ret); 361 + return ret; 362 + } 363 + 364 + ret = wait_event_interruptible(vp->run.vp_suspend_queue, 365 + vp->run.kicked_by_hv == 1); 366 + if (ret) { 367 + bool message_in_flight; 368 + 369 + /* 370 + * Otherwise the waiting was interrupted by a signal: suspend 371 + * the vCPU explicitly and copy message in flight (if any). 372 + */ 373 + ret = mshv_suspend_vp(vp, &message_in_flight); 374 + if (ret) 375 + return ret; 376 + 377 + /* Return if no message in flight */ 378 + if (!message_in_flight) 379 + return -EINTR; 380 + 381 + /* Wait for the message in flight. */ 382 + wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); 383 + } 384 + 385 + /* 386 + * Reset the flag to make the wait_event call above work 387 + * next time. 
388 + */ 389 + vp->run.kicked_by_hv = 0; 390 + 391 + return 0; 392 + } 393 + 394 + static int 395 + mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, 396 + struct hv_output_dispatch_vp *res) 397 + { 398 + struct hv_input_dispatch_vp *input; 399 + struct hv_output_dispatch_vp *output; 400 + u64 status; 401 + 402 + preempt_disable(); 403 + input = *this_cpu_ptr(root_scheduler_input); 404 + output = *this_cpu_ptr(root_scheduler_output); 405 + 406 + memset(input, 0, sizeof(*input)); 407 + memset(output, 0, sizeof(*output)); 408 + 409 + input->partition_id = vp->vp_partition->pt_id; 410 + input->vp_index = vp->vp_index; 411 + input->time_slice = 0; /* Run forever until something happens */ 412 + input->spec_ctrl = 0; /* TODO: set sensible flags */ 413 + input->flags = flags; 414 + 415 + vp->run.flags.root_sched_dispatched = 1; 416 + status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); 417 + vp->run.flags.root_sched_dispatched = 0; 418 + 419 + *res = *output; 420 + preempt_enable(); 421 + 422 + if (!hv_result_success(status)) 423 + vp_err(vp, "%s: status %s\n", __func__, 424 + hv_result_to_string(status)); 425 + 426 + return hv_result_to_errno(status); 427 + } 428 + 429 + static int 430 + mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) 431 + { 432 + struct hv_register_assoc explicit_suspend = { 433 + .name = HV_REGISTER_EXPLICIT_SUSPEND, 434 + .value.explicit_suspend.suspended = 0, 435 + }; 436 + int ret; 437 + 438 + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 439 + 1, &explicit_suspend); 440 + 441 + if (ret) 442 + vp_err(vp, "Failed to unsuspend\n"); 443 + 444 + return ret; 445 + } 446 + 447 + #if IS_ENABLED(CONFIG_X86_64) 448 + static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) 449 + { 450 + if (!vp->vp_register_page) 451 + return 0; 452 + return vp->vp_register_page->interrupt_vectors.as_uint64; 453 + } 454 + #else 455 + static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) 456 + { 457 + return 0; 458 + } 459 + #endif 460 + 461 
+ static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) 462 + { 463 + struct hv_stats_page **stats = vp->vp_stats_pages; 464 + u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; 465 + u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; 466 + 467 + if (self_vp_cntrs[VpRootDispatchThreadBlocked]) 468 + return self_vp_cntrs[VpRootDispatchThreadBlocked]; 469 + return parent_vp_cntrs[VpRootDispatchThreadBlocked]; 470 + } 471 + 472 + static int 473 + mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) 474 + { 475 + int ret; 476 + 477 + ret = wait_event_interruptible(vp->run.vp_suspend_queue, 478 + (vp->run.kicked_by_hv == 1 && 479 + !mshv_vp_dispatch_thread_blocked(vp)) || 480 + mshv_vp_interrupt_pending(vp)); 481 + if (ret) 482 + return -EINTR; 483 + 484 + vp->run.flags.root_sched_blocked = 0; 485 + vp->run.kicked_by_hv = 0; 486 + 487 + return 0; 488 + } 489 + 490 + static int mshv_pre_guest_mode_work(struct mshv_vp *vp) 491 + { 492 + const ulong work_flags = _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING | 493 + _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME; 494 + ulong th_flags; 495 + 496 + th_flags = read_thread_flags(); 497 + while (th_flags & work_flags) { 498 + int ret; 499 + 500 + /* nb: following will call schedule */ 501 + ret = mshv_do_pre_guest_mode_work(th_flags); 502 + 503 + if (ret) 504 + return ret; 505 + 506 + th_flags = read_thread_flags(); 507 + } 508 + 509 + return 0; 510 + } 511 + 512 + /* Must be called with interrupts enabled */ 513 + static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) 514 + { 515 + long ret; 516 + 517 + if (vp->run.flags.root_sched_blocked) { 518 + /* 519 + * Dispatch state of this VP is blocked. Need to wait 520 + * for the hypervisor to clear the blocked state before 521 + * dispatching it. 
522 + */ 523 + ret = mshv_vp_wait_for_hv_kick(vp); 524 + if (ret) 525 + return ret; 526 + } 527 + 528 + do { 529 + u32 flags = 0; 530 + struct hv_output_dispatch_vp output; 531 + 532 + ret = mshv_pre_guest_mode_work(vp); 533 + if (ret) 534 + break; 535 + 536 + if (vp->run.flags.intercept_suspend) 537 + flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; 538 + 539 + if (mshv_vp_interrupt_pending(vp)) 540 + flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; 541 + 542 + ret = mshv_vp_dispatch(vp, flags, &output); 543 + if (ret) 544 + break; 545 + 546 + vp->run.flags.intercept_suspend = 0; 547 + 548 + if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { 549 + if (output.dispatch_event == 550 + HV_VP_DISPATCH_EVENT_SUSPEND) { 551 + /* 552 + * TODO: remove the warning once VP canceling 553 + * is supported 554 + */ 555 + WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), 556 + "%s: vp#%d: unexpected explicit suspend\n", 557 + __func__, vp->vp_index); 558 + /* 559 + * Need to clear explicit suspend before 560 + * dispatching. 561 + * Explicit suspend is either: 562 + * - set right after the first VP dispatch or 563 + * - set explicitly via hypercall 564 + * Since the latter case is not yet supported, 565 + * simply clear it here. 
566 + */ 567 + ret = mshv_vp_clear_explicit_suspend(vp); 568 + if (ret) 569 + break; 570 + 571 + ret = mshv_vp_wait_for_hv_kick(vp); 572 + if (ret) 573 + break; 574 + } else { 575 + vp->run.flags.root_sched_blocked = 1; 576 + ret = mshv_vp_wait_for_hv_kick(vp); 577 + if (ret) 578 + break; 579 + } 580 + } else { 581 + /* HV_VP_DISPATCH_STATE_READY */ 582 + if (output.dispatch_event == 583 + HV_VP_DISPATCH_EVENT_INTERCEPT) 584 + vp->run.flags.intercept_suspend = 1; 585 + } 586 + } while (!vp->run.flags.intercept_suspend); 587 + 588 + return ret; 589 + } 590 + 591 + static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, 592 + "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); 593 + 594 + static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) 595 + { 596 + long rc; 597 + 598 + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 599 + rc = mshv_run_vp_with_root_scheduler(vp); 600 + else 601 + rc = mshv_run_vp_with_hyp_scheduler(vp); 602 + 603 + if (rc) 604 + return rc; 605 + 606 + if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, 607 + sizeof(struct hv_message))) 608 + rc = -EFAULT; 609 + 610 + return rc; 611 + } 612 + 613 + static int 614 + mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, 615 + struct hv_vp_state_data state_data, 616 + unsigned long user_pfn, size_t page_count, 617 + bool is_set) 618 + { 619 + int completed, ret = 0; 620 + unsigned long check; 621 + struct page **pages; 622 + 623 + if (page_count > INT_MAX) 624 + return -EINVAL; 625 + /* 626 + * Check the arithmetic for wraparound/overflow. 
627 + * The last page address in the buffer is: 628 + * (user_pfn + (page_count - 1)) * PAGE_SIZE 629 + */ 630 + if (check_add_overflow(user_pfn, (page_count - 1), &check)) 631 + return -EOVERFLOW; 632 + if (check_mul_overflow(check, PAGE_SIZE, &check)) 633 + return -EOVERFLOW; 634 + 635 + /* Pin user pages so hypervisor can copy directly to them */ 636 + pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); 637 + if (!pages) 638 + return -ENOMEM; 639 + 640 + for (completed = 0; completed < page_count; completed += ret) { 641 + unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; 642 + int remaining = page_count - completed; 643 + 644 + ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, 645 + &pages[completed]); 646 + if (ret < 0) { 647 + vp_err(vp, "%s: Failed to pin user pages error %i\n", 648 + __func__, ret); 649 + goto unpin_pages; 650 + } 651 + } 652 + 653 + if (is_set) 654 + ret = hv_call_set_vp_state(vp->vp_index, 655 + vp->vp_partition->pt_id, 656 + state_data, page_count, pages, 657 + 0, NULL); 658 + else 659 + ret = hv_call_get_vp_state(vp->vp_index, 660 + vp->vp_partition->pt_id, 661 + state_data, page_count, pages, 662 + NULL); 663 + 664 + unpin_pages: 665 + unpin_user_pages(pages, completed); 666 + kfree(pages); 667 + return ret; 668 + } 669 + 670 + static long 671 + mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, 672 + struct mshv_get_set_vp_state __user *user_args, 673 + bool is_set) 674 + { 675 + struct mshv_get_set_vp_state args; 676 + long ret = 0; 677 + union hv_output_get_vp_state vp_state; 678 + u32 data_sz; 679 + struct hv_vp_state_data state_data = {}; 680 + 681 + if (copy_from_user(&args, user_args, sizeof(args))) 682 + return -EFAULT; 683 + 684 + if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || 685 + !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || 686 + !PAGE_ALIGNED(args.buf_ptr)) 687 + return -EINVAL; 688 + 689 + if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) 690 + return 
-EFAULT; 691 + 692 + switch (args.type) { 693 + case MSHV_VP_STATE_LAPIC: 694 + state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; 695 + data_sz = HV_HYP_PAGE_SIZE; 696 + break; 697 + case MSHV_VP_STATE_XSAVE: 698 + { 699 + u64 data_sz_64; 700 + 701 + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, 702 + HV_PARTITION_PROPERTY_XSAVE_STATES, 703 + &state_data.xsave.states.as_uint64); 704 + if (ret) 705 + return ret; 706 + 707 + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, 708 + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, 709 + &data_sz_64); 710 + if (ret) 711 + return ret; 712 + 713 + data_sz = (u32)data_sz_64; 714 + state_data.xsave.flags = 0; 715 + /* Always request legacy states */ 716 + state_data.xsave.states.legacy_x87 = 1; 717 + state_data.xsave.states.legacy_sse = 1; 718 + state_data.type = HV_GET_SET_VP_STATE_XSAVE; 719 + break; 720 + } 721 + case MSHV_VP_STATE_SIMP: 722 + state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; 723 + data_sz = HV_HYP_PAGE_SIZE; 724 + break; 725 + case MSHV_VP_STATE_SIEFP: 726 + state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; 727 + data_sz = HV_HYP_PAGE_SIZE; 728 + break; 729 + case MSHV_VP_STATE_SYNTHETIC_TIMERS: 730 + state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; 731 + data_sz = sizeof(vp_state.synthetic_timers_state); 732 + break; 733 + default: 734 + return -EINVAL; 735 + } 736 + 737 + if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) 738 + return -EFAULT; 739 + 740 + if (data_sz > args.buf_sz) 741 + return -EINVAL; 742 + 743 + /* If the data is transmitted via pfns, delegate to helper */ 744 + if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { 745 + unsigned long user_pfn = PFN_DOWN(args.buf_ptr); 746 + size_t page_count = PFN_DOWN(args.buf_sz); 747 + 748 + return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, 749 + page_count, is_set); 750 + } 751 + 752 + /* Paranoia check - this shouldn't happen! 
*/ 753 + if (data_sz > sizeof(vp_state)) { 754 + vp_err(vp, "Invalid vp state data size!\n"); 755 + return -EINVAL; 756 + } 757 + 758 + if (is_set) { 759 + if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) 760 + return -EFAULT; 761 + 762 + return hv_call_set_vp_state(vp->vp_index, 763 + vp->vp_partition->pt_id, 764 + state_data, 0, NULL, 765 + sizeof(vp_state), (u8 *)&vp_state); 766 + } 767 + 768 + ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, 769 + state_data, 0, NULL, &vp_state); 770 + if (ret) 771 + return ret; 772 + 773 + if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) 774 + return -EFAULT; 775 + 776 + return 0; 777 + } 778 + 779 + static long 780 + mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 781 + { 782 + struct mshv_vp *vp = filp->private_data; 783 + long r = -ENOTTY; 784 + 785 + if (mutex_lock_killable(&vp->vp_mutex)) 786 + return -EINTR; 787 + 788 + switch (ioctl) { 789 + case MSHV_RUN_VP: 790 + r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); 791 + break; 792 + case MSHV_GET_VP_STATE: 793 + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); 794 + break; 795 + case MSHV_SET_VP_STATE: 796 + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); 797 + break; 798 + case MSHV_ROOT_HVCALL: 799 + r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, 800 + (void __user *)arg); 801 + break; 802 + default: 803 + vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); 804 + break; 805 + } 806 + mutex_unlock(&vp->vp_mutex); 807 + 808 + return r; 809 + } 810 + 811 + static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) 812 + { 813 + struct mshv_vp *vp = vmf->vma->vm_file->private_data; 814 + 815 + switch (vmf->vma->vm_pgoff) { 816 + case MSHV_VP_MMAP_OFFSET_REGISTERS: 817 + vmf->page = virt_to_page(vp->vp_register_page); 818 + break; 819 + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 820 + vmf->page = virt_to_page(vp->vp_intercept_msg_page); 821 + break; 822 + case 
MSHV_VP_MMAP_OFFSET_GHCB: 823 + vmf->page = virt_to_page(vp->vp_ghcb_page); 824 + break; 825 + default: 826 + return VM_FAULT_SIGBUS; 827 + } 828 + 829 + get_page(vmf->page); 830 + 831 + return 0; 832 + } 833 + 834 + static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) 835 + { 836 + struct mshv_vp *vp = file->private_data; 837 + 838 + switch (vma->vm_pgoff) { 839 + case MSHV_VP_MMAP_OFFSET_REGISTERS: 840 + if (!vp->vp_register_page) 841 + return -ENODEV; 842 + break; 843 + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: 844 + if (!vp->vp_intercept_msg_page) 845 + return -ENODEV; 846 + break; 847 + case MSHV_VP_MMAP_OFFSET_GHCB: 848 + if (!vp->vp_ghcb_page) 849 + return -ENODEV; 850 + break; 851 + default: 852 + return -EINVAL; 853 + } 854 + 855 + vma->vm_ops = &mshv_vp_vm_ops; 856 + return 0; 857 + } 858 + 859 + static int 860 + mshv_vp_release(struct inode *inode, struct file *filp) 861 + { 862 + struct mshv_vp *vp = filp->private_data; 863 + 864 + /* Rest of VP cleanup happens in destroy_partition() */ 865 + mshv_partition_put(vp->vp_partition); 866 + return 0; 867 + } 868 + 869 + static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) 870 + { 871 + union hv_stats_object_identity identity = { 872 + .vp.partition_id = partition_id, 873 + .vp.vp_index = vp_index, 874 + }; 875 + 876 + identity.vp.stats_area_type = HV_STATS_AREA_SELF; 877 + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); 878 + 879 + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 880 + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); 881 + } 882 + 883 + static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, 884 + void *stats_pages[]) 885 + { 886 + union hv_stats_object_identity identity = { 887 + .vp.partition_id = partition_id, 888 + .vp.vp_index = vp_index, 889 + }; 890 + int err; 891 + 892 + identity.vp.stats_area_type = HV_STATS_AREA_SELF; 893 + err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, 894 + &stats_pages[HV_STATS_AREA_SELF]); 895 + 
if (err) 896 + return err; 897 + 898 + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; 899 + err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, 900 + &stats_pages[HV_STATS_AREA_PARENT]); 901 + if (err) 902 + goto unmap_self; 903 + 904 + return 0; 905 + 906 + unmap_self: 907 + identity.vp.stats_area_type = HV_STATS_AREA_SELF; 908 + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); 909 + return err; 910 + } 911 + 912 + static long 913 + mshv_partition_ioctl_create_vp(struct mshv_partition *partition, 914 + void __user *arg) 915 + { 916 + struct mshv_create_vp args; 917 + struct mshv_vp *vp; 918 + struct page *intercept_message_page, *register_page, *ghcb_page; 919 + void *stats_pages[2]; 920 + long ret; 921 + 922 + if (copy_from_user(&args, arg, sizeof(args))) 923 + return -EFAULT; 924 + 925 + if (args.vp_index >= MSHV_MAX_VPS) 926 + return -EINVAL; 927 + 928 + if (partition->pt_vp_array[args.vp_index]) 929 + return -EEXIST; 930 + 931 + ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, 932 + 0 /* Only valid for root partition VPs */); 933 + if (ret) 934 + return ret; 935 + 936 + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, 937 + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 938 + input_vtl_zero, 939 + &intercept_message_page); 940 + if (ret) 941 + goto destroy_vp; 942 + 943 + if (!mshv_partition_encrypted(partition)) { 944 + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, 945 + HV_VP_STATE_PAGE_REGISTERS, 946 + input_vtl_zero, 947 + &register_page); 948 + if (ret) 949 + goto unmap_intercept_message_page; 950 + } 951 + 952 + if (mshv_partition_encrypted(partition) && 953 + is_ghcb_mapping_available()) { 954 + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, 955 + HV_VP_STATE_PAGE_GHCB, 956 + input_vtl_normal, 957 + &ghcb_page); 958 + if (ret) 959 + goto unmap_register_page; 960 + } 961 + 962 + if (hv_parent_partition()) { 963 + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, 964 
+ stats_pages); 965 + if (ret) 966 + goto unmap_ghcb_page; 967 + } 968 + 969 + vp = kzalloc(sizeof(*vp), GFP_KERNEL); 970 + if (!vp) 971 + goto unmap_stats_pages; 972 + 973 + vp->vp_partition = mshv_partition_get(partition); 974 + if (!vp->vp_partition) { 975 + ret = -EBADF; 976 + goto free_vp; 977 + } 978 + 979 + mutex_init(&vp->vp_mutex); 980 + init_waitqueue_head(&vp->run.vp_suspend_queue); 981 + atomic64_set(&vp->run.vp_signaled_count, 0); 982 + 983 + vp->vp_index = args.vp_index; 984 + vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); 985 + if (!mshv_partition_encrypted(partition)) 986 + vp->vp_register_page = page_to_virt(register_page); 987 + 988 + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) 989 + vp->vp_ghcb_page = page_to_virt(ghcb_page); 990 + 991 + if (hv_parent_partition()) 992 + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); 993 + 994 + /* 995 + * Keep anon_inode_getfd last: it installs fd in the file struct and 996 + * thus makes the state accessible in user space. 
997 + */ 998 + ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, 999 + O_RDWR | O_CLOEXEC); 1000 + if (ret < 0) 1001 + goto put_partition; 1002 + 1003 + /* already exclusive with the partition mutex for all ioctls */ 1004 + partition->pt_vp_count++; 1005 + partition->pt_vp_array[args.vp_index] = vp; 1006 + 1007 + return ret; 1008 + 1009 + put_partition: 1010 + mshv_partition_put(partition); 1011 + free_vp: 1012 + kfree(vp); 1013 + unmap_stats_pages: 1014 + if (hv_parent_partition()) 1015 + mshv_vp_stats_unmap(partition->pt_id, args.vp_index); 1016 + unmap_ghcb_page: 1017 + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { 1018 + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, 1019 + HV_VP_STATE_PAGE_GHCB, 1020 + input_vtl_normal); 1021 + } 1022 + unmap_register_page: 1023 + if (!mshv_partition_encrypted(partition)) { 1024 + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, 1025 + HV_VP_STATE_PAGE_REGISTERS, 1026 + input_vtl_zero); 1027 + } 1028 + unmap_intercept_message_page: 1029 + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, 1030 + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1031 + input_vtl_zero); 1032 + destroy_vp: 1033 + hv_call_delete_vp(partition->pt_id, args.vp_index); 1034 + return ret; 1035 + } 1036 + 1037 + static int mshv_init_async_handler(struct mshv_partition *partition) 1038 + { 1039 + if (completion_done(&partition->async_hypercall)) { 1040 + pt_err(partition, 1041 + "Cannot issue async hypercall while another one in progress!\n"); 1042 + return -EPERM; 1043 + } 1044 + 1045 + reinit_completion(&partition->async_hypercall); 1046 + return 0; 1047 + } 1048 + 1049 + static void mshv_async_hvcall_handler(void *data, u64 *status) 1050 + { 1051 + struct mshv_partition *partition = data; 1052 + 1053 + wait_for_completion(&partition->async_hypercall); 1054 + pt_dbg(partition, "Async hypercall completed!\n"); 1055 + 1056 + *status = partition->async_hypercall_status; 1057 + } 1058 + 1059 + static int 
1060 + mshv_partition_region_share(struct mshv_mem_region *region) 1061 + { 1062 + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; 1063 + 1064 + if (region->flags.large_pages) 1065 + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; 1066 + 1067 + return hv_call_modify_spa_host_access(region->partition->pt_id, 1068 + region->pages, region->nr_pages, 1069 + HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, 1070 + flags, true); 1071 + } 1072 + 1073 + static int 1074 + mshv_partition_region_unshare(struct mshv_mem_region *region) 1075 + { 1076 + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; 1077 + 1078 + if (region->flags.large_pages) 1079 + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; 1080 + 1081 + return hv_call_modify_spa_host_access(region->partition->pt_id, 1082 + region->pages, region->nr_pages, 1083 + 0, 1084 + flags, false); 1085 + } 1086 + 1087 + static int 1088 + mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, 1089 + u64 page_offset, u64 page_count) 1090 + { 1091 + if (page_offset + page_count > region->nr_pages) 1092 + return -EINVAL; 1093 + 1094 + if (region->flags.large_pages) 1095 + map_flags |= HV_MAP_GPA_LARGE_PAGE; 1096 + 1097 + /* ask the hypervisor to map guest ram */ 1098 + return hv_call_map_gpa_pages(region->partition->pt_id, 1099 + region->start_gfn + page_offset, 1100 + page_count, map_flags, 1101 + region->pages + page_offset); 1102 + } 1103 + 1104 + static int 1105 + mshv_region_map(struct mshv_mem_region *region) 1106 + { 1107 + u32 map_flags = region->hv_map_flags; 1108 + 1109 + return mshv_region_remap_pages(region, map_flags, 1110 + 0, region->nr_pages); 1111 + } 1112 + 1113 + static void 1114 + mshv_region_evict_pages(struct mshv_mem_region *region, 1115 + u64 page_offset, u64 page_count) 1116 + { 1117 + if (region->flags.range_pinned) 1118 + unpin_user_pages(region->pages + page_offset, page_count); 1119 + 1120 + memset(region->pages + page_offset, 0, 1121 + page_count * sizeof(struct page 
*)); 1122 + } 1123 + 1124 + static void 1125 + mshv_region_evict(struct mshv_mem_region *region) 1126 + { 1127 + mshv_region_evict_pages(region, 0, region->nr_pages); 1128 + } 1129 + 1130 + static int 1131 + mshv_region_populate_pages(struct mshv_mem_region *region, 1132 + u64 page_offset, u64 page_count) 1133 + { 1134 + u64 done_count, nr_pages; 1135 + struct page **pages; 1136 + __u64 userspace_addr; 1137 + int ret; 1138 + 1139 + if (page_offset + page_count > region->nr_pages) 1140 + return -EINVAL; 1141 + 1142 + for (done_count = 0; done_count < page_count; done_count += ret) { 1143 + pages = region->pages + page_offset + done_count; 1144 + userspace_addr = region->start_uaddr + 1145 + (page_offset + done_count) * 1146 + HV_HYP_PAGE_SIZE; 1147 + nr_pages = min(page_count - done_count, 1148 + MSHV_PIN_PAGES_BATCH_SIZE); 1149 + 1150 + /* 1151 + * Pinning assuming 4k pages works for large pages too. 1152 + * All page structs within the large page are returned. 1153 + * 1154 + * Pin requests are batched because pin_user_pages_fast 1155 + * with the FOLL_LONGTERM flag does a large temporary 1156 + * allocation of contiguous memory. 
1157 + */ 1158 + if (region->flags.range_pinned) 1159 + ret = pin_user_pages_fast(userspace_addr, 1160 + nr_pages, 1161 + FOLL_WRITE | FOLL_LONGTERM, 1162 + pages); 1163 + else 1164 + ret = -EOPNOTSUPP; 1165 + 1166 + if (ret < 0) 1167 + goto release_pages; 1168 + } 1169 + 1170 + if (PageHuge(region->pages[page_offset])) 1171 + region->flags.large_pages = true; 1172 + 1173 + return 0; 1174 + 1175 + release_pages: 1176 + mshv_region_evict_pages(region, page_offset, done_count); 1177 + return ret; 1178 + } 1179 + 1180 + static int 1181 + mshv_region_populate(struct mshv_mem_region *region) 1182 + { 1183 + return mshv_region_populate_pages(region, 0, region->nr_pages); 1184 + } 1185 + 1186 + static struct mshv_mem_region * 1187 + mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) 1188 + { 1189 + struct mshv_mem_region *region; 1190 + 1191 + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { 1192 + if (gfn >= region->start_gfn && 1193 + gfn < region->start_gfn + region->nr_pages) 1194 + return region; 1195 + } 1196 + 1197 + return NULL; 1198 + } 1199 + 1200 + static struct mshv_mem_region * 1201 + mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) 1202 + { 1203 + struct mshv_mem_region *region; 1204 + 1205 + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { 1206 + if (uaddr >= region->start_uaddr && 1207 + uaddr < region->start_uaddr + 1208 + (region->nr_pages << HV_HYP_PAGE_SHIFT)) 1209 + return region; 1210 + } 1211 + 1212 + return NULL; 1213 + } 1214 + 1215 + /* 1216 + * NB: caller checks and makes sure mem->size is page aligned 1217 + * Returns: 0 with regionpp updated on success, or -errno 1218 + */ 1219 + static int mshv_partition_create_region(struct mshv_partition *partition, 1220 + struct mshv_user_mem_region *mem, 1221 + struct mshv_mem_region **regionpp, 1222 + bool is_mmio) 1223 + { 1224 + struct mshv_mem_region *region; 1225 + u64 nr_pages = HVPFN_DOWN(mem->size); 1226 + 1227 + /* 
Reject overlapping regions */ 1228 + if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || 1229 + mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || 1230 + mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || 1231 + mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) 1232 + return -EEXIST; 1233 + 1234 + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); 1235 + if (!region) 1236 + return -ENOMEM; 1237 + 1238 + region->nr_pages = nr_pages; 1239 + region->start_gfn = mem->guest_pfn; 1240 + region->start_uaddr = mem->userspace_addr; 1241 + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; 1242 + if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) 1243 + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; 1244 + if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) 1245 + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; 1246 + 1247 + /* Note: large_pages flag populated when we pin the pages */ 1248 + if (!is_mmio) 1249 + region->flags.range_pinned = true; 1250 + 1251 + region->partition = partition; 1252 + 1253 + *regionpp = region; 1254 + 1255 + return 0; 1256 + } 1257 + 1258 + /* 1259 + * Map guest ram. if snp, make sure to release that from the host first 1260 + * Side Effects: In case of failure, pages are unpinned when feasible. 1261 + */ 1262 + static int 1263 + mshv_partition_mem_region_map(struct mshv_mem_region *region) 1264 + { 1265 + struct mshv_partition *partition = region->partition; 1266 + int ret; 1267 + 1268 + ret = mshv_region_populate(region); 1269 + if (ret) { 1270 + pt_err(partition, "Failed to populate memory region: %d\n", 1271 + ret); 1272 + goto err_out; 1273 + } 1274 + 1275 + /* 1276 + * For an SNP partition it is a requirement that for every memory region 1277 + * that we are going to map for this partition we should make sure that 1278 + * host access to that region is released. 
This is ensured by doing an 1279 + * additional hypercall which will update the SLAT to release host 1280 + * access to guest memory regions. 1281 + */ 1282 + if (mshv_partition_encrypted(partition)) { 1283 + ret = mshv_partition_region_unshare(region); 1284 + if (ret) { 1285 + pt_err(partition, 1286 + "Failed to unshare memory region (guest_pfn: %llu): %d\n", 1287 + region->start_gfn, ret); 1288 + goto evict_region; 1289 + } 1290 + } 1291 + 1292 + ret = mshv_region_map(region); 1293 + if (ret && mshv_partition_encrypted(partition)) { 1294 + int shrc; 1295 + 1296 + shrc = mshv_partition_region_share(region); 1297 + if (!shrc) 1298 + goto evict_region; 1299 + 1300 + pt_err(partition, 1301 + "Failed to share memory region (guest_pfn: %llu): %d\n", 1302 + region->start_gfn, shrc); 1303 + /* 1304 + * Don't unpin if marking shared failed because pages are no 1305 + * longer mapped in the host, ie root, anymore. 1306 + */ 1307 + goto err_out; 1308 + } 1309 + 1310 + return 0; 1311 + 1312 + evict_region: 1313 + mshv_region_evict(region); 1314 + err_out: 1315 + return ret; 1316 + } 1317 + 1318 + /* 1319 + * This maps two things: guest RAM and for pci passthru mmio space. 1320 + * 1321 + * mmio: 1322 + * - vfio overloads vm_pgoff to store the mmio start pfn/spa. 1323 + * - Two things need to happen for mapping mmio range: 1324 + * 1. mapped in the uaddr so VMM can access it. 1325 + * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. 1326 + * 1327 + * This function takes care of the second. The first one is managed by vfio, 1328 + * and hence is taken care of via vfio_pci_mmap_fault(). 
1329 + */ 1330 + static long 1331 + mshv_map_user_memory(struct mshv_partition *partition, 1332 + struct mshv_user_mem_region mem) 1333 + { 1334 + struct mshv_mem_region *region; 1335 + struct vm_area_struct *vma; 1336 + bool is_mmio; 1337 + ulong mmio_pfn; 1338 + long ret; 1339 + 1340 + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || 1341 + !access_ok((const void *)mem.userspace_addr, mem.size)) 1342 + return -EINVAL; 1343 + 1344 + mmap_read_lock(current->mm); 1345 + vma = vma_lookup(current->mm, mem.userspace_addr); 1346 + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; 1347 + mmio_pfn = is_mmio ? vma->vm_pgoff : 0; 1348 + mmap_read_unlock(current->mm); 1349 + 1350 + if (!vma) 1351 + return -EINVAL; 1352 + 1353 + ret = mshv_partition_create_region(partition, &mem, &region, 1354 + is_mmio); 1355 + if (ret) 1356 + return ret; 1357 + 1358 + if (is_mmio) 1359 + ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, 1360 + mmio_pfn, HVPFN_DOWN(mem.size)); 1361 + else 1362 + ret = mshv_partition_mem_region_map(region); 1363 + 1364 + if (ret) 1365 + goto errout; 1366 + 1367 + /* Install the new region */ 1368 + hlist_add_head(&region->hnode, &partition->pt_mem_regions); 1369 + 1370 + return 0; 1371 + 1372 + errout: 1373 + vfree(region); 1374 + return ret; 1375 + } 1376 + 1377 + /* Called for unmapping both the guest ram and the mmio space */ 1378 + static long 1379 + mshv_unmap_user_memory(struct mshv_partition *partition, 1380 + struct mshv_user_mem_region mem) 1381 + { 1382 + struct mshv_mem_region *region; 1383 + u32 unmap_flags = 0; 1384 + 1385 + if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) 1386 + return -EINVAL; 1387 + 1388 + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); 1389 + if (!region) 1390 + return -EINVAL; 1391 + 1392 + /* Paranoia check */ 1393 + if (region->start_uaddr != mem.userspace_addr || 1394 + region->start_gfn != mem.guest_pfn || 1395 + region->nr_pages != HVPFN_DOWN(mem.size)) 1396 + return -EINVAL; 1397 + 
1398 + hlist_del(&region->hnode); 1399 + 1400 + if (region->flags.large_pages) 1401 + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; 1402 + 1403 + /* ignore unmap failures and continue as process may be exiting */ 1404 + hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, 1405 + region->nr_pages, unmap_flags); 1406 + 1407 + mshv_region_evict(region); 1408 + 1409 + vfree(region); 1410 + return 0; 1411 + } 1412 + 1413 + static long 1414 + mshv_partition_ioctl_set_memory(struct mshv_partition *partition, 1415 + struct mshv_user_mem_region __user *user_mem) 1416 + { 1417 + struct mshv_user_mem_region mem; 1418 + 1419 + if (copy_from_user(&mem, user_mem, sizeof(mem))) 1420 + return -EFAULT; 1421 + 1422 + if (!mem.size || 1423 + !PAGE_ALIGNED(mem.size) || 1424 + !PAGE_ALIGNED(mem.userspace_addr) || 1425 + (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || 1426 + mshv_field_nonzero(mem, rsvd)) 1427 + return -EINVAL; 1428 + 1429 + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) 1430 + return mshv_unmap_user_memory(partition, mem); 1431 + 1432 + return mshv_map_user_memory(partition, mem); 1433 + } 1434 + 1435 + static long 1436 + mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, 1437 + void __user *user_args) 1438 + { 1439 + struct mshv_user_ioeventfd args; 1440 + 1441 + if (copy_from_user(&args, user_args, sizeof(args))) 1442 + return -EFAULT; 1443 + 1444 + return mshv_set_unset_ioeventfd(partition, &args); 1445 + } 1446 + 1447 + static long 1448 + mshv_partition_ioctl_irqfd(struct mshv_partition *partition, 1449 + void __user *user_args) 1450 + { 1451 + struct mshv_user_irqfd args; 1452 + 1453 + if (copy_from_user(&args, user_args, sizeof(args))) 1454 + return -EFAULT; 1455 + 1456 + return mshv_set_unset_irqfd(partition, &args); 1457 + } 1458 + 1459 + static long 1460 + mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, 1461 + void __user *user_args) 1462 + { 1463 + struct mshv_gpap_access_bitmap args; 1464 + union hv_gpa_page_access_state 
*states; 1465 + long ret, i; 1466 + union hv_gpa_page_access_state_flags hv_flags = {}; 1467 + u8 hv_type_mask; 1468 + ulong bitmap_buf_sz, states_buf_sz; 1469 + int written = 0; 1470 + 1471 + if (copy_from_user(&args, user_args, sizeof(args))) 1472 + return -EFAULT; 1473 + 1474 + if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || 1475 + args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || 1476 + mshv_field_nonzero(args, rsvd) || !args.page_count || 1477 + !args.bitmap_ptr) 1478 + return -EINVAL; 1479 + 1480 + if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) 1481 + return -E2BIG; 1482 + 1483 + /* Num bytes needed to store bitmap; one bit per page rounded up */ 1484 + bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); 1485 + 1486 + /* Sanity check */ 1487 + if (bitmap_buf_sz > states_buf_sz) 1488 + return -EBADFD; 1489 + 1490 + switch (args.access_type) { 1491 + case MSHV_GPAP_ACCESS_TYPE_ACCESSED: 1492 + hv_type_mask = 1; 1493 + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { 1494 + hv_flags.clear_accessed = 1; 1495 + /* not accessed implies not dirty */ 1496 + hv_flags.clear_dirty = 1; 1497 + } else { /* MSHV_GPAP_ACCESS_OP_SET */ 1498 + hv_flags.set_accessed = 1; 1499 + } 1500 + break; 1501 + case MSHV_GPAP_ACCESS_TYPE_DIRTY: 1502 + hv_type_mask = 2; 1503 + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { 1504 + hv_flags.clear_dirty = 1; 1505 + } else { /* MSHV_GPAP_ACCESS_OP_SET */ 1506 + hv_flags.set_dirty = 1; 1507 + /* dirty implies accessed */ 1508 + hv_flags.set_accessed = 1; 1509 + } 1510 + break; 1511 + } 1512 + 1513 + states = vzalloc(states_buf_sz); 1514 + if (!states) 1515 + return -ENOMEM; 1516 + 1517 + ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, 1518 + args.gpap_base, hv_flags, &written, 1519 + states); 1520 + if (ret) 1521 + goto free_return; 1522 + 1523 + /* 1524 + * Overwrite states buffer with bitmap - the bits in hv_type_mask 1525 + * correspond to bitfields in hv_gpa_page_access_state 1526 + 
*/ 1527 + for (i = 0; i < written; ++i) 1528 + __assign_bit(i, (ulong *)states, 1529 + states[i].as_uint8 & hv_type_mask); 1530 + 1531 + /* zero the unused bits in the last byte(s) of the returned bitmap */ 1532 + for (i = written; i < bitmap_buf_sz * 8; ++i) 1533 + __clear_bit(i, (ulong *)states); 1534 + 1535 + if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) 1536 + ret = -EFAULT; 1537 + 1538 + free_return: 1539 + vfree(states); 1540 + return ret; 1541 + } 1542 + 1543 + static long 1544 + mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, 1545 + void __user *user_args) 1546 + { 1547 + struct mshv_user_irq_entry *entries = NULL; 1548 + struct mshv_user_irq_table args; 1549 + long ret; 1550 + 1551 + if (copy_from_user(&args, user_args, sizeof(args))) 1552 + return -EFAULT; 1553 + 1554 + if (args.nr > MSHV_MAX_GUEST_IRQS || 1555 + mshv_field_nonzero(args, rsvd)) 1556 + return -EINVAL; 1557 + 1558 + if (args.nr) { 1559 + struct mshv_user_irq_table __user *urouting = user_args; 1560 + 1561 + entries = vmemdup_user(urouting->entries, 1562 + array_size(sizeof(*entries), 1563 + args.nr)); 1564 + if (IS_ERR(entries)) 1565 + return PTR_ERR(entries); 1566 + } 1567 + ret = mshv_update_routing_table(partition, entries, args.nr); 1568 + kvfree(entries); 1569 + 1570 + return ret; 1571 + } 1572 + 1573 + static long 1574 + mshv_partition_ioctl_initialize(struct mshv_partition *partition) 1575 + { 1576 + long ret; 1577 + 1578 + if (partition->pt_initialized) 1579 + return 0; 1580 + 1581 + ret = hv_call_initialize_partition(partition->pt_id); 1582 + if (ret) 1583 + goto withdraw_mem; 1584 + 1585 + partition->pt_initialized = true; 1586 + 1587 + return 0; 1588 + 1589 + withdraw_mem: 1590 + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); 1591 + 1592 + return ret; 1593 + } 1594 + 1595 + static long 1596 + mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 1597 + { 1598 + struct mshv_partition 
*partition = filp->private_data; 1599 + long ret; 1600 + void __user *uarg = (void __user *)arg; 1601 + 1602 + if (mutex_lock_killable(&partition->pt_mutex)) 1603 + return -EINTR; 1604 + 1605 + switch (ioctl) { 1606 + case MSHV_INITIALIZE_PARTITION: 1607 + ret = mshv_partition_ioctl_initialize(partition); 1608 + break; 1609 + case MSHV_SET_GUEST_MEMORY: 1610 + ret = mshv_partition_ioctl_set_memory(partition, uarg); 1611 + break; 1612 + case MSHV_CREATE_VP: 1613 + ret = mshv_partition_ioctl_create_vp(partition, uarg); 1614 + break; 1615 + case MSHV_IRQFD: 1616 + ret = mshv_partition_ioctl_irqfd(partition, uarg); 1617 + break; 1618 + case MSHV_IOEVENTFD: 1619 + ret = mshv_partition_ioctl_ioeventfd(partition, uarg); 1620 + break; 1621 + case MSHV_SET_MSI_ROUTING: 1622 + ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); 1623 + break; 1624 + case MSHV_GET_GPAP_ACCESS_BITMAP: 1625 + ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, 1626 + uarg); 1627 + break; 1628 + case MSHV_ROOT_HVCALL: 1629 + ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); 1630 + break; 1631 + default: 1632 + ret = -ENOTTY; 1633 + } 1634 + 1635 + mutex_unlock(&partition->pt_mutex); 1636 + return ret; 1637 + } 1638 + 1639 + static int 1640 + disable_vp_dispatch(struct mshv_vp *vp) 1641 + { 1642 + int ret; 1643 + struct hv_register_assoc dispatch_suspend = { 1644 + .name = HV_REGISTER_DISPATCH_SUSPEND, 1645 + .value.dispatch_suspend.suspended = 1, 1646 + }; 1647 + 1648 + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1649 + 1, &dispatch_suspend); 1650 + if (ret) 1651 + vp_err(vp, "failed to suspend\n"); 1652 + 1653 + return ret; 1654 + } 1655 + 1656 + static int 1657 + get_vp_signaled_count(struct mshv_vp *vp, u64 *count) 1658 + { 1659 + int ret; 1660 + struct hv_register_assoc root_signal_count = { 1661 + .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, 1662 + }; 1663 + 1664 + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, 1665 + 1, 
&root_signal_count); 1666 + 1667 + if (ret) { 1668 + vp_err(vp, "Failed to get root signal count"); 1669 + *count = 0; 1670 + return ret; 1671 + } 1672 + 1673 + *count = root_signal_count.value.reg64; 1674 + 1675 + return ret; 1676 + } 1677 + 1678 + static void 1679 + drain_vp_signals(struct mshv_vp *vp) 1680 + { 1681 + u64 hv_signal_count; 1682 + u64 vp_signal_count; 1683 + 1684 + get_vp_signaled_count(vp, &hv_signal_count); 1685 + 1686 + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); 1687 + 1688 + /* 1689 + * There should be at most 1 outstanding notification, but be extra 1690 + * careful anyway. 1691 + */ 1692 + while (hv_signal_count != vp_signal_count) { 1693 + WARN_ON(hv_signal_count - vp_signal_count != 1); 1694 + 1695 + if (wait_event_interruptible(vp->run.vp_suspend_queue, 1696 + vp->run.kicked_by_hv == 1)) 1697 + break; 1698 + vp->run.kicked_by_hv = 0; 1699 + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); 1700 + } 1701 + } 1702 + 1703 + static void drain_all_vps(const struct mshv_partition *partition) 1704 + { 1705 + int i; 1706 + struct mshv_vp *vp; 1707 + 1708 + /* 1709 + * VPs are reachable from ISR. It is safe to not take the partition 1710 + * lock because nobody else can enter this function and drop the 1711 + * partition from the list. 1712 + */ 1713 + for (i = 0; i < MSHV_MAX_VPS; i++) { 1714 + vp = partition->pt_vp_array[i]; 1715 + if (!vp) 1716 + continue; 1717 + /* 1718 + * Disable dispatching of the VP in the hypervisor. After this 1719 + * the hypervisor guarantees it won't generate any signals for 1720 + * the VP and the hypervisor's VP signal count won't change. 
1721 + */ 1722 + disable_vp_dispatch(vp); 1723 + drain_vp_signals(vp); 1724 + } 1725 + } 1726 + 1727 + static void 1728 + remove_partition(struct mshv_partition *partition) 1729 + { 1730 + spin_lock(&mshv_root.pt_ht_lock); 1731 + hlist_del_rcu(&partition->pt_hnode); 1732 + spin_unlock(&mshv_root.pt_ht_lock); 1733 + 1734 + synchronize_rcu(); 1735 + } 1736 + 1737 + /* 1738 + * Tear down a partition and remove it from the list. 1739 + * Partition's refcount must be 0 1740 + */ 1741 + static void destroy_partition(struct mshv_partition *partition) 1742 + { 1743 + struct mshv_vp *vp; 1744 + struct mshv_mem_region *region; 1745 + int i, ret; 1746 + struct hlist_node *n; 1747 + 1748 + if (refcount_read(&partition->pt_ref_count)) { 1749 + pt_err(partition, 1750 + "Attempt to destroy partition but refcount > 0\n"); 1751 + return; 1752 + } 1753 + 1754 + if (partition->pt_initialized) { 1755 + /* 1756 + * We only need to drain signals for root scheduler. This should be 1757 + * done before removing the partition from the partition list. 
1758 + */ 1759 + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 1760 + drain_all_vps(partition); 1761 + 1762 + /* Remove vps */ 1763 + for (i = 0; i < MSHV_MAX_VPS; ++i) { 1764 + vp = partition->pt_vp_array[i]; 1765 + if (!vp) 1766 + continue; 1767 + 1768 + if (hv_parent_partition()) 1769 + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); 1770 + 1771 + if (vp->vp_register_page) { 1772 + (void)hv_call_unmap_vp_state_page(partition->pt_id, 1773 + vp->vp_index, 1774 + HV_VP_STATE_PAGE_REGISTERS, 1775 + input_vtl_zero); 1776 + vp->vp_register_page = NULL; 1777 + } 1778 + 1779 + (void)hv_call_unmap_vp_state_page(partition->pt_id, 1780 + vp->vp_index, 1781 + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, 1782 + input_vtl_zero); 1783 + vp->vp_intercept_msg_page = NULL; 1784 + 1785 + if (vp->vp_ghcb_page) { 1786 + (void)hv_call_unmap_vp_state_page(partition->pt_id, 1787 + vp->vp_index, 1788 + HV_VP_STATE_PAGE_GHCB, 1789 + input_vtl_normal); 1790 + vp->vp_ghcb_page = NULL; 1791 + } 1792 + 1793 + kfree(vp); 1794 + 1795 + partition->pt_vp_array[i] = NULL; 1796 + } 1797 + 1798 + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ 1799 + hv_call_finalize_partition(partition->pt_id); 1800 + 1801 + partition->pt_initialized = false; 1802 + } 1803 + 1804 + remove_partition(partition); 1805 + 1806 + /* Remove regions, regain access to the memory and unpin the pages */ 1807 + hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, 1808 + hnode) { 1809 + hlist_del(&region->hnode); 1810 + 1811 + if (mshv_partition_encrypted(partition)) { 1812 + ret = mshv_partition_region_share(region); 1813 + if (ret) { 1814 + pt_err(partition, 1815 + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", 1816 + ret); 1817 + return; 1818 + } 1819 + } 1820 + 1821 + mshv_region_evict(region); 1822 + 1823 + vfree(region); 1824 + } 1825 + 1826 + /* Withdraw and free all pages we deposited */ 1827 + hv_call_withdraw_memory(U64_MAX, 
NUMA_NO_NODE, partition->pt_id); 1828 + hv_call_delete_partition(partition->pt_id); 1829 + 1830 + mshv_free_routing_table(partition); 1831 + kfree(partition); 1832 + } 1833 + 1834 + struct 1835 + mshv_partition *mshv_partition_get(struct mshv_partition *partition) 1836 + { 1837 + if (refcount_inc_not_zero(&partition->pt_ref_count)) 1838 + return partition; 1839 + return NULL; 1840 + } 1841 + 1842 + struct 1843 + mshv_partition *mshv_partition_find(u64 partition_id) 1844 + __must_hold(RCU) 1845 + { 1846 + struct mshv_partition *p; 1847 + 1848 + hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, 1849 + partition_id) 1850 + if (p->pt_id == partition_id) 1851 + return p; 1852 + 1853 + return NULL; 1854 + } 1855 + 1856 + void 1857 + mshv_partition_put(struct mshv_partition *partition) 1858 + { 1859 + if (refcount_dec_and_test(&partition->pt_ref_count)) 1860 + destroy_partition(partition); 1861 + } 1862 + 1863 + static int 1864 + mshv_partition_release(struct inode *inode, struct file *filp) 1865 + { 1866 + struct mshv_partition *partition = filp->private_data; 1867 + 1868 + mshv_eventfd_release(partition); 1869 + 1870 + cleanup_srcu_struct(&partition->pt_irq_srcu); 1871 + 1872 + mshv_partition_put(partition); 1873 + 1874 + return 0; 1875 + } 1876 + 1877 + static int 1878 + add_partition(struct mshv_partition *partition) 1879 + { 1880 + spin_lock(&mshv_root.pt_ht_lock); 1881 + 1882 + hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, 1883 + partition->pt_id); 1884 + 1885 + spin_unlock(&mshv_root.pt_ht_lock); 1886 + 1887 + return 0; 1888 + } 1889 + 1890 + static long 1891 + mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) 1892 + { 1893 + struct mshv_create_partition args; 1894 + u64 creation_flags; 1895 + struct hv_partition_creation_properties creation_properties = {}; 1896 + union hv_partition_isolation_properties isolation_properties = {}; 1897 + struct mshv_partition *partition; 1898 + struct file *file; 1899 + int fd; 1900 
+ long ret; 1901 + 1902 + if (copy_from_user(&args, user_arg, sizeof(args))) 1903 + return -EFAULT; 1904 + 1905 + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || 1906 + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) 1907 + return -EINVAL; 1908 + 1909 + /* Only support EXO partitions */ 1910 + creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | 1911 + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; 1912 + 1913 + if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) 1914 + creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; 1915 + if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) 1916 + creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; 1917 + if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) 1918 + creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; 1919 + 1920 + switch (args.pt_isolation) { 1921 + case MSHV_PT_ISOLATION_NONE: 1922 + isolation_properties.isolation_type = 1923 + HV_PARTITION_ISOLATION_TYPE_NONE; 1924 + break; 1925 + } 1926 + 1927 + partition = kzalloc(sizeof(*partition), GFP_KERNEL); 1928 + if (!partition) 1929 + return -ENOMEM; 1930 + 1931 + partition->pt_module_dev = module_dev; 1932 + partition->isolation_type = isolation_properties.isolation_type; 1933 + 1934 + refcount_set(&partition->pt_ref_count, 1); 1935 + 1936 + mutex_init(&partition->pt_mutex); 1937 + 1938 + mutex_init(&partition->pt_irq_lock); 1939 + 1940 + init_completion(&partition->async_hypercall); 1941 + 1942 + INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); 1943 + 1944 + INIT_HLIST_HEAD(&partition->pt_devices); 1945 + 1946 + INIT_HLIST_HEAD(&partition->pt_mem_regions); 1947 + 1948 + mshv_eventfd_init(partition); 1949 + 1950 + ret = init_srcu_struct(&partition->pt_irq_srcu); 1951 + if (ret) 1952 + goto free_partition; 1953 + 1954 + ret = hv_call_create_partition(creation_flags, 1955 + creation_properties, 1956 + isolation_properties, 1957 + &partition->pt_id); 1958 + if (ret) 1959 + goto cleanup_irq_srcu; 1960 + 1961 + ret = 
add_partition(partition); 1962 + if (ret) 1963 + goto delete_partition; 1964 + 1965 + ret = mshv_init_async_handler(partition); 1966 + if (ret) 1967 + goto remove_partition; 1968 + 1969 + fd = get_unused_fd_flags(O_CLOEXEC); 1970 + if (fd < 0) { 1971 + ret = fd; 1972 + goto remove_partition; 1973 + } 1974 + 1975 + file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, 1976 + partition, O_RDWR); 1977 + if (IS_ERR(file)) { 1978 + ret = PTR_ERR(file); 1979 + goto put_fd; 1980 + } 1981 + 1982 + fd_install(fd, file); 1983 + 1984 + return fd; 1985 + 1986 + put_fd: 1987 + put_unused_fd(fd); 1988 + remove_partition: 1989 + remove_partition(partition); 1990 + delete_partition: 1991 + hv_call_delete_partition(partition->pt_id); 1992 + cleanup_irq_srcu: 1993 + cleanup_srcu_struct(&partition->pt_irq_srcu); 1994 + free_partition: 1995 + kfree(partition); 1996 + 1997 + return ret; 1998 + } 1999 + 2000 + static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, 2001 + unsigned long arg) 2002 + { 2003 + struct miscdevice *misc = filp->private_data; 2004 + 2005 + switch (ioctl) { 2006 + case MSHV_CREATE_PARTITION: 2007 + return mshv_ioctl_create_partition((void __user *)arg, 2008 + misc->this_device); 2009 + } 2010 + 2011 + return -ENOTTY; 2012 + } 2013 + 2014 + static int 2015 + mshv_dev_open(struct inode *inode, struct file *filp) 2016 + { 2017 + return 0; 2018 + } 2019 + 2020 + static int 2021 + mshv_dev_release(struct inode *inode, struct file *filp) 2022 + { 2023 + return 0; 2024 + } 2025 + 2026 + static int mshv_cpuhp_online; 2027 + static int mshv_root_sched_online; 2028 + 2029 + static const char *scheduler_type_to_string(enum hv_scheduler_type type) 2030 + { 2031 + switch (type) { 2032 + case HV_SCHEDULER_TYPE_LP: 2033 + return "classic scheduler without SMT"; 2034 + case HV_SCHEDULER_TYPE_LP_SMT: 2035 + return "classic scheduler with SMT"; 2036 + case HV_SCHEDULER_TYPE_CORE_SMT: 2037 + return "core scheduler"; 2038 + case HV_SCHEDULER_TYPE_ROOT: 2039 + 
return "root scheduler"; 2040 + default: 2041 + return "unknown scheduler"; 2042 + }; 2043 + } 2044 + 2045 + /* TODO move this to hv_common.c when needed outside */ 2046 + static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) 2047 + { 2048 + struct hv_input_get_system_property *input; 2049 + struct hv_output_get_system_property *output; 2050 + unsigned long flags; 2051 + u64 status; 2052 + 2053 + local_irq_save(flags); 2054 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 2055 + output = *this_cpu_ptr(hyperv_pcpu_output_arg); 2056 + 2057 + memset(input, 0, sizeof(*input)); 2058 + memset(output, 0, sizeof(*output)); 2059 + input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; 2060 + 2061 + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); 2062 + if (!hv_result_success(status)) { 2063 + local_irq_restore(flags); 2064 + pr_err("%s: %s\n", __func__, hv_result_to_string(status)); 2065 + return hv_result_to_errno(status); 2066 + } 2067 + 2068 + *out = output->scheduler_type; 2069 + local_irq_restore(flags); 2070 + 2071 + return 0; 2072 + } 2073 + 2074 + /* Retrieve and stash the supported scheduler type */ 2075 + static int __init mshv_retrieve_scheduler_type(struct device *dev) 2076 + { 2077 + int ret; 2078 + 2079 + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); 2080 + if (ret) 2081 + return ret; 2082 + 2083 + dev_info(dev, "Hypervisor using %s\n", 2084 + scheduler_type_to_string(hv_scheduler_type)); 2085 + 2086 + switch (hv_scheduler_type) { 2087 + case HV_SCHEDULER_TYPE_CORE_SMT: 2088 + case HV_SCHEDULER_TYPE_LP_SMT: 2089 + case HV_SCHEDULER_TYPE_ROOT: 2090 + case HV_SCHEDULER_TYPE_LP: 2091 + /* Supported scheduler, nothing to do */ 2092 + break; 2093 + default: 2094 + dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", 2095 + hv_scheduler_type); 2096 + return -EOPNOTSUPP; 2097 + } 2098 + 2099 + return 0; 2100 + } 2101 + 2102 + static int mshv_root_scheduler_init(unsigned int cpu) 2103 + { 2104 + void **inputarg, 
**outputarg, *p; 2105 + 2106 + inputarg = (void **)this_cpu_ptr(root_scheduler_input); 2107 + outputarg = (void **)this_cpu_ptr(root_scheduler_output); 2108 + 2109 + /* Allocate two consecutive pages. One for input, one for output. */ 2110 + p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); 2111 + if (!p) 2112 + return -ENOMEM; 2113 + 2114 + *inputarg = p; 2115 + *outputarg = (char *)p + HV_HYP_PAGE_SIZE; 2116 + 2117 + return 0; 2118 + } 2119 + 2120 + static int mshv_root_scheduler_cleanup(unsigned int cpu) 2121 + { 2122 + void *p, **inputarg, **outputarg; 2123 + 2124 + inputarg = (void **)this_cpu_ptr(root_scheduler_input); 2125 + outputarg = (void **)this_cpu_ptr(root_scheduler_output); 2126 + 2127 + p = *inputarg; 2128 + 2129 + *inputarg = NULL; 2130 + *outputarg = NULL; 2131 + 2132 + kfree(p); 2133 + 2134 + return 0; 2135 + } 2136 + 2137 + /* Must be called after retrieving the scheduler type */ 2138 + static int 2139 + root_scheduler_init(struct device *dev) 2140 + { 2141 + int ret; 2142 + 2143 + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 2144 + return 0; 2145 + 2146 + root_scheduler_input = alloc_percpu(void *); 2147 + root_scheduler_output = alloc_percpu(void *); 2148 + 2149 + if (!root_scheduler_input || !root_scheduler_output) { 2150 + dev_err(dev, "Failed to allocate root scheduler buffers\n"); 2151 + ret = -ENOMEM; 2152 + goto out; 2153 + } 2154 + 2155 + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", 2156 + mshv_root_scheduler_init, 2157 + mshv_root_scheduler_cleanup); 2158 + 2159 + if (ret < 0) { 2160 + dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); 2161 + goto out; 2162 + } 2163 + 2164 + mshv_root_sched_online = ret; 2165 + 2166 + return 0; 2167 + 2168 + out: 2169 + free_percpu(root_scheduler_input); 2170 + free_percpu(root_scheduler_output); 2171 + return ret; 2172 + } 2173 + 2174 + static void 2175 + root_scheduler_deinit(void) 2176 + { 2177 + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) 2178 + return; 2179 + 
2180 + cpuhp_remove_state(mshv_root_sched_online); 2181 + free_percpu(root_scheduler_input); 2182 + free_percpu(root_scheduler_output); 2183 + } 2184 + 2185 + static int mshv_reboot_notify(struct notifier_block *nb, 2186 + unsigned long code, void *unused) 2187 + { 2188 + cpuhp_remove_state(mshv_cpuhp_online); 2189 + return 0; 2190 + } 2191 + 2192 + struct notifier_block mshv_reboot_nb = { 2193 + .notifier_call = mshv_reboot_notify, 2194 + }; 2195 + 2196 + static void mshv_root_partition_exit(void) 2197 + { 2198 + unregister_reboot_notifier(&mshv_reboot_nb); 2199 + root_scheduler_deinit(); 2200 + } 2201 + 2202 + static int __init mshv_root_partition_init(struct device *dev) 2203 + { 2204 + int err; 2205 + 2206 + if (mshv_retrieve_scheduler_type(dev)) 2207 + return -ENODEV; 2208 + 2209 + err = root_scheduler_init(dev); 2210 + if (err) 2211 + return err; 2212 + 2213 + err = register_reboot_notifier(&mshv_reboot_nb); 2214 + if (err) 2215 + goto root_sched_deinit; 2216 + 2217 + return 0; 2218 + 2219 + root_sched_deinit: 2220 + root_scheduler_deinit(); 2221 + return err; 2222 + } 2223 + 2224 + static int __init mshv_parent_partition_init(void) 2225 + { 2226 + int ret; 2227 + struct device *dev; 2228 + union hv_hypervisor_version_info version_info; 2229 + 2230 + if (!hv_root_partition() || is_kdump_kernel()) 2231 + return -ENODEV; 2232 + 2233 + if (hv_get_hypervisor_version(&version_info)) 2234 + return -ENODEV; 2235 + 2236 + ret = misc_register(&mshv_dev); 2237 + if (ret) 2238 + return ret; 2239 + 2240 + dev = mshv_dev.this_device; 2241 + 2242 + if (version_info.build_number < MSHV_HV_MIN_VERSION || 2243 + version_info.build_number > MSHV_HV_MAX_VERSION) { 2244 + dev_err(dev, "Running on unvalidated Hyper-V version\n"); 2245 + dev_err(dev, "Versions: current: %u min: %u max: %u\n", 2246 + version_info.build_number, MSHV_HV_MIN_VERSION, 2247 + MSHV_HV_MAX_VERSION); 2248 + } 2249 + 2250 + mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); 2251 + if 
(!mshv_root.synic_pages) { 2252 + dev_err(dev, "Failed to allocate percpu synic page\n"); 2253 + ret = -ENOMEM; 2254 + goto device_deregister; 2255 + } 2256 + 2257 + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", 2258 + mshv_synic_init, 2259 + mshv_synic_cleanup); 2260 + if (ret < 0) { 2261 + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); 2262 + goto free_synic_pages; 2263 + } 2264 + 2265 + mshv_cpuhp_online = ret; 2266 + 2267 + ret = mshv_root_partition_init(dev); 2268 + if (ret) 2269 + goto remove_cpu_state; 2270 + 2271 + ret = mshv_irqfd_wq_init(); 2272 + if (ret) 2273 + goto exit_partition; 2274 + 2275 + spin_lock_init(&mshv_root.pt_ht_lock); 2276 + hash_init(mshv_root.pt_htable); 2277 + 2278 + hv_setup_mshv_handler(mshv_isr); 2279 + 2280 + return 0; 2281 + 2282 + exit_partition: 2283 + if (hv_root_partition()) 2284 + mshv_root_partition_exit(); 2285 + remove_cpu_state: 2286 + cpuhp_remove_state(mshv_cpuhp_online); 2287 + free_synic_pages: 2288 + free_percpu(mshv_root.synic_pages); 2289 + device_deregister: 2290 + misc_deregister(&mshv_dev); 2291 + return ret; 2292 + } 2293 + 2294 + static void __exit mshv_parent_partition_exit(void) 2295 + { 2296 + hv_setup_mshv_handler(NULL); 2297 + mshv_port_table_fini(); 2298 + misc_deregister(&mshv_dev); 2299 + mshv_irqfd_wq_cleanup(); 2300 + if (hv_root_partition()) 2301 + mshv_root_partition_exit(); 2302 + cpuhp_remove_state(mshv_cpuhp_online); 2303 + free_percpu(mshv_root.synic_pages); 2304 + } 2305 + 2306 + module_init(mshv_parent_partition_init); 2307 + module_exit(mshv_parent_partition_exit);
+665
drivers/hv/mshv_synic.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2023, Microsoft Corporation. 4 + * 5 + * mshv_root module's main interrupt handler and associated functionality. 6 + * 7 + * Authors: Microsoft Linux virtualization team 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/slab.h> 12 + #include <linux/mm.h> 13 + #include <linux/io.h> 14 + #include <linux/random.h> 15 + #include <asm/mshyperv.h> 16 + 17 + #include "mshv_eventfd.h" 18 + #include "mshv.h" 19 + 20 + static u32 synic_event_ring_get_queued_port(u32 sint_index) 21 + { 22 + struct hv_synic_event_ring_page **event_ring_page; 23 + volatile struct hv_synic_event_ring *ring; 24 + struct hv_synic_pages *spages; 25 + u8 **synic_eventring_tail; 26 + u32 message; 27 + u8 tail; 28 + 29 + spages = this_cpu_ptr(mshv_root.synic_pages); 30 + event_ring_page = &spages->synic_event_ring_page; 31 + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); 32 + 33 + if (unlikely(!*synic_eventring_tail)) { 34 + pr_debug("Missing synic event ring tail!\n"); 35 + return 0; 36 + } 37 + tail = (*synic_eventring_tail)[sint_index]; 38 + 39 + if (unlikely(!*event_ring_page)) { 40 + pr_debug("Missing synic event ring page!\n"); 41 + return 0; 42 + } 43 + 44 + ring = &(*event_ring_page)->sint_event_ring[sint_index]; 45 + 46 + /* 47 + * Get the message. 48 + */ 49 + message = ring->data[tail]; 50 + 51 + if (!message) { 52 + if (ring->ring_full) { 53 + /* 54 + * Ring is marked full, but we would have consumed all 55 + * the messages. Notify the hypervisor that ring is now 56 + * empty and check again. 57 + */ 58 + ring->ring_full = 0; 59 + hv_call_notify_port_ring_empty(sint_index); 60 + message = ring->data[tail]; 61 + } 62 + 63 + if (!message) { 64 + ring->signal_masked = 0; 65 + /* 66 + * Unmask the signal and sync with hypervisor 67 + * before one last check for any message. 68 + */ 69 + mb(); 70 + message = ring->data[tail]; 71 + 72 + /* 73 + * Ok, lets bail out. 
74 + */ 75 + if (!message) 76 + return 0; 77 + } 78 + 79 + ring->signal_masked = 1; 80 + } 81 + 82 + /* 83 + * Clear the message in the ring buffer. 84 + */ 85 + ring->data[tail] = 0; 86 + 87 + if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT) 88 + tail = 0; 89 + 90 + (*synic_eventring_tail)[sint_index] = tail; 91 + 92 + return message; 93 + } 94 + 95 + static bool 96 + mshv_doorbell_isr(struct hv_message *msg) 97 + { 98 + struct hv_notification_message_payload *notification; 99 + u32 port; 100 + 101 + if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT) 102 + return false; 103 + 104 + notification = (struct hv_notification_message_payload *)msg->u.payload; 105 + if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX) 106 + return false; 107 + 108 + while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) { 109 + struct port_table_info ptinfo = { 0 }; 110 + 111 + if (mshv_portid_lookup(port, &ptinfo)) { 112 + pr_debug("Failed to get port info from port_table!\n"); 113 + continue; 114 + } 115 + 116 + if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) { 117 + pr_debug("Not a doorbell port!, port: %d, port_type: %d\n", 118 + port, ptinfo.hv_port_type); 119 + continue; 120 + } 121 + 122 + /* Invoke the callback */ 123 + ptinfo.hv_port_doorbell.doorbell_cb(port, 124 + ptinfo.hv_port_doorbell.data); 125 + } 126 + 127 + return true; 128 + } 129 + 130 + static bool mshv_async_call_completion_isr(struct hv_message *msg) 131 + { 132 + bool handled = false; 133 + struct hv_async_completion_message_payload *async_msg; 134 + struct mshv_partition *partition; 135 + u64 partition_id; 136 + 137 + if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION) 138 + goto out; 139 + 140 + async_msg = 141 + (struct hv_async_completion_message_payload *)msg->u.payload; 142 + 143 + partition_id = async_msg->partition_id; 144 + 145 + /* 146 + * Hold this lock for the rest of the isr, because the partition could 147 + * be released anytime. 148 + * e.g. 
the MSHV_RUN_VP thread could wake on another cpu; it could 149 + * release the partition unless we hold this! 150 + */ 151 + rcu_read_lock(); 152 + 153 + partition = mshv_partition_find(partition_id); 154 + 155 + if (unlikely(!partition)) { 156 + pr_debug("failed to find partition %llu\n", partition_id); 157 + goto unlock_out; 158 + } 159 + 160 + partition->async_hypercall_status = async_msg->status; 161 + complete(&partition->async_hypercall); 162 + 163 + handled = true; 164 + 165 + unlock_out: 166 + rcu_read_unlock(); 167 + out: 168 + return handled; 169 + } 170 + 171 + static void kick_vp(struct mshv_vp *vp) 172 + { 173 + atomic64_inc(&vp->run.vp_signaled_count); 174 + vp->run.kicked_by_hv = 1; 175 + wake_up(&vp->run.vp_suspend_queue); 176 + } 177 + 178 + static void 179 + handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg) 180 + { 181 + int bank_idx, vps_signaled = 0, bank_mask_size; 182 + struct mshv_partition *partition; 183 + const struct hv_vpset *vpset; 184 + const u64 *bank_contents; 185 + u64 partition_id = msg->partition_id; 186 + 187 + if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) { 188 + pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K"); 189 + return; 190 + } 191 + 192 + if (msg->vp_count == 0) { 193 + pr_debug("scheduler message with no VP specified"); 194 + return; 195 + } 196 + 197 + rcu_read_lock(); 198 + 199 + partition = mshv_partition_find(partition_id); 200 + if (unlikely(!partition)) { 201 + pr_debug("failed to find partition %llu\n", partition_id); 202 + goto unlock_out; 203 + } 204 + 205 + vpset = &msg->vp_bitset.bitset; 206 + 207 + bank_idx = -1; 208 + bank_contents = vpset->bank_contents; 209 + bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE; 210 + 211 + while (true) { 212 + int vp_bank_idx = -1; 213 + int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE; 214 + int vp_index; 215 + 216 + bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask, 217 
+ bank_mask_size, bank_idx + 1); 218 + if (bank_idx == bank_mask_size) 219 + break; 220 + 221 + while (true) { 222 + struct mshv_vp *vp; 223 + 224 + vp_bank_idx = find_next_bit((unsigned long *)bank_contents, 225 + vp_bank_size, vp_bank_idx + 1); 226 + if (vp_bank_idx == vp_bank_size) 227 + break; 228 + 229 + vp_index = (bank_idx * vp_bank_size) + vp_bank_idx; 230 + 231 + /* This shouldn't happen, but just in case. */ 232 + if (unlikely(vp_index >= MSHV_MAX_VPS)) { 233 + pr_debug("VP index %u out of bounds\n", 234 + vp_index); 235 + goto unlock_out; 236 + } 237 + 238 + vp = partition->pt_vp_array[vp_index]; 239 + if (unlikely(!vp)) { 240 + pr_debug("failed to find VP %u\n", vp_index); 241 + goto unlock_out; 242 + } 243 + 244 + kick_vp(vp); 245 + vps_signaled++; 246 + } 247 + 248 + bank_contents++; 249 + } 250 + 251 + unlock_out: 252 + rcu_read_unlock(); 253 + 254 + if (vps_signaled != msg->vp_count) 255 + pr_debug("asked to signal %u VPs but only did %u\n", 256 + msg->vp_count, vps_signaled); 257 + } 258 + 259 + static void 260 + handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg) 261 + { 262 + struct mshv_partition *partition = NULL; 263 + struct mshv_vp *vp; 264 + int idx; 265 + 266 + rcu_read_lock(); 267 + 268 + for (idx = 0; idx < msg->vp_count; idx++) { 269 + u64 partition_id = msg->partition_ids[idx]; 270 + u32 vp_index = msg->vp_indexes[idx]; 271 + 272 + if (idx == 0 || partition->pt_id != partition_id) { 273 + partition = mshv_partition_find(partition_id); 274 + if (unlikely(!partition)) { 275 + pr_debug("failed to find partition %llu\n", 276 + partition_id); 277 + break; 278 + } 279 + } 280 + 281 + /* This shouldn't happen, but just in case. 
*/ 282 + if (unlikely(vp_index >= MSHV_MAX_VPS)) { 283 + pr_debug("VP index %u out of bounds\n", vp_index); 284 + break; 285 + } 286 + 287 + vp = partition->pt_vp_array[vp_index]; 288 + if (!vp) { 289 + pr_debug("failed to find VP %u\n", vp_index); 290 + break; 291 + } 292 + 293 + kick_vp(vp); 294 + } 295 + 296 + rcu_read_unlock(); 297 + } 298 + 299 + static bool 300 + mshv_scheduler_isr(struct hv_message *msg) 301 + { 302 + if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET && 303 + msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR) 304 + return false; 305 + 306 + if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET) 307 + handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *) 308 + msg->u.payload); 309 + else 310 + handle_pair_message((struct hv_vp_signal_pair_scheduler_message *) 311 + msg->u.payload); 312 + 313 + return true; 314 + } 315 + 316 + static bool 317 + mshv_intercept_isr(struct hv_message *msg) 318 + { 319 + struct mshv_partition *partition; 320 + bool handled = false; 321 + struct mshv_vp *vp; 322 + u64 partition_id; 323 + u32 vp_index; 324 + 325 + partition_id = msg->header.sender; 326 + 327 + rcu_read_lock(); 328 + 329 + partition = mshv_partition_find(partition_id); 330 + if (unlikely(!partition)) { 331 + pr_debug("failed to find partition %llu\n", 332 + partition_id); 333 + goto unlock_out; 334 + } 335 + 336 + if (msg->header.message_type == HVMSG_X64_APIC_EOI) { 337 + /* 338 + * Check if this gsi is registered in the 339 + * ack_notifier list and invoke the callback 340 + * if registered. 341 + */ 342 + 343 + /* 344 + * If there is a notifier, the ack callback is supposed 345 + * to handle the VMEXIT. So we need not pass this message 346 + * to vcpu thread. 
347 + */ 348 + struct hv_x64_apic_eoi_message *eoi_msg = 349 + (struct hv_x64_apic_eoi_message *)&msg->u.payload[0]; 350 + 351 + if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) { 352 + handled = true; 353 + goto unlock_out; 354 + } 355 + } 356 + 357 + /* 358 + * We should get an opaque intercept message here for all intercept 359 + * messages, since we're using the mapped VP intercept message page. 360 + * 361 + * The intercept message will have been placed in intercept message 362 + * page at this point. 363 + * 364 + * Make sure the message type matches our expectation. 365 + */ 366 + if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) { 367 + pr_debug("wrong message type %d", msg->header.message_type); 368 + goto unlock_out; 369 + } 370 + 371 + /* 372 + * Since we directly index the vp, and it has to exist for us to be here 373 + * (because the vp is only deleted when the partition is), no additional 374 + * locking is needed here 375 + */ 376 + vp_index = 377 + ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index; 378 + vp = partition->pt_vp_array[vp_index]; 379 + if (unlikely(!vp)) { 380 + pr_debug("failed to find VP %u\n", vp_index); 381 + goto unlock_out; 382 + } 383 + 384 + kick_vp(vp); 385 + 386 + handled = true; 387 + 388 + unlock_out: 389 + rcu_read_unlock(); 390 + 391 + return handled; 392 + } 393 + 394 + void mshv_isr(void) 395 + { 396 + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); 397 + struct hv_message_page **msg_page = &spages->synic_message_page; 398 + struct hv_message *msg; 399 + bool handled; 400 + 401 + if (unlikely(!(*msg_page))) { 402 + pr_debug("Missing synic page!\n"); 403 + return; 404 + } 405 + 406 + msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]); 407 + 408 + /* 409 + * If the type isn't set, there isn't really a message; 410 + * it may be some other hyperv interrupt 411 + */ 412 + if (msg->header.message_type == HVMSG_NONE) 413 + return; 414 + 415 + handled = 
mshv_doorbell_isr(msg); 416 + 417 + if (!handled) 418 + handled = mshv_scheduler_isr(msg); 419 + 420 + if (!handled) 421 + handled = mshv_async_call_completion_isr(msg); 422 + 423 + if (!handled) 424 + handled = mshv_intercept_isr(msg); 425 + 426 + if (handled) { 427 + /* 428 + * Acknowledge message with hypervisor if another message is 429 + * pending. 430 + */ 431 + msg->header.message_type = HVMSG_NONE; 432 + /* 433 + * Ensure the write is complete so the hypervisor will deliver 434 + * the next message if available. 435 + */ 436 + mb(); 437 + if (msg->header.message_flags.msg_pending) 438 + hv_set_non_nested_msr(HV_MSR_EOM, 0); 439 + 440 + #ifdef HYPERVISOR_CALLBACK_VECTOR 441 + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR); 442 + #endif 443 + } else { 444 + pr_warn_once("%s: unknown message type 0x%x\n", __func__, 445 + msg->header.message_type); 446 + } 447 + } 448 + 449 + int mshv_synic_init(unsigned int cpu) 450 + { 451 + union hv_synic_simp simp; 452 + union hv_synic_siefp siefp; 453 + union hv_synic_sirbp sirbp; 454 + #ifdef HYPERVISOR_CALLBACK_VECTOR 455 + union hv_synic_sint sint; 456 + #endif 457 + union hv_synic_scontrol sctrl; 458 + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); 459 + struct hv_message_page **msg_page = &spages->synic_message_page; 460 + struct hv_synic_event_flags_page **event_flags_page = 461 + &spages->synic_event_flags_page; 462 + struct hv_synic_event_ring_page **event_ring_page = 463 + &spages->synic_event_ring_page; 464 + 465 + /* Setup the Synic's message page */ 466 + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); 467 + simp.simp_enabled = true; 468 + *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, 469 + HV_HYP_PAGE_SIZE, 470 + MEMREMAP_WB); 471 + 472 + if (!(*msg_page)) 473 + return -EFAULT; 474 + 475 + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); 476 + 477 + /* Setup the Synic's event flags page */ 478 + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); 479 + 
siefp.siefp_enabled = true; 480 + *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT, 481 + PAGE_SIZE, MEMREMAP_WB); 482 + 483 + if (!(*event_flags_page)) 484 + goto cleanup; 485 + 486 + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); 487 + 488 + /* Setup the Synic's event ring page */ 489 + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); 490 + sirbp.sirbp_enabled = true; 491 + *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT, 492 + PAGE_SIZE, MEMREMAP_WB); 493 + 494 + if (!(*event_ring_page)) 495 + goto cleanup; 496 + 497 + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); 498 + 499 + #ifdef HYPERVISOR_CALLBACK_VECTOR 500 + /* Enable intercepts */ 501 + sint.as_uint64 = 0; 502 + sint.vector = HYPERVISOR_CALLBACK_VECTOR; 503 + sint.masked = false; 504 + sint.auto_eoi = hv_recommend_using_aeoi(); 505 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, 506 + sint.as_uint64); 507 + 508 + /* Doorbell SINT */ 509 + sint.as_uint64 = 0; 510 + sint.vector = HYPERVISOR_CALLBACK_VECTOR; 511 + sint.masked = false; 512 + sint.as_intercept = 1; 513 + sint.auto_eoi = hv_recommend_using_aeoi(); 514 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, 515 + sint.as_uint64); 516 + #endif 517 + 518 + /* Enable global synic bit */ 519 + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); 520 + sctrl.enable = 1; 521 + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); 522 + 523 + return 0; 524 + 525 + cleanup: 526 + if (*event_ring_page) { 527 + sirbp.sirbp_enabled = false; 528 + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); 529 + memunmap(*event_ring_page); 530 + } 531 + if (*event_flags_page) { 532 + siefp.siefp_enabled = false; 533 + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); 534 + memunmap(*event_flags_page); 535 + } 536 + if (*msg_page) { 537 + simp.simp_enabled = false; 538 + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); 539 + memunmap(*msg_page); 540 + } 541 + 
542 + return -EFAULT; 543 + } 544 + 545 + int mshv_synic_cleanup(unsigned int cpu) 546 + { 547 + union hv_synic_sint sint; 548 + union hv_synic_simp simp; 549 + union hv_synic_siefp siefp; 550 + union hv_synic_sirbp sirbp; 551 + union hv_synic_scontrol sctrl; 552 + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); 553 + struct hv_message_page **msg_page = &spages->synic_message_page; 554 + struct hv_synic_event_flags_page **event_flags_page = 555 + &spages->synic_event_flags_page; 556 + struct hv_synic_event_ring_page **event_ring_page = 557 + &spages->synic_event_ring_page; 558 + 559 + /* Disable the interrupt */ 560 + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX); 561 + sint.masked = true; 562 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, 563 + sint.as_uint64); 564 + 565 + /* Disable Doorbell SINT */ 566 + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX); 567 + sint.masked = true; 568 + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, 569 + sint.as_uint64); 570 + 571 + /* Disable Synic's event ring page */ 572 + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); 573 + sirbp.sirbp_enabled = false; 574 + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); 575 + memunmap(*event_ring_page); 576 + 577 + /* Disable Synic's event flags page */ 578 + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); 579 + siefp.siefp_enabled = false; 580 + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); 581 + memunmap(*event_flags_page); 582 + 583 + /* Disable Synic's message page */ 584 + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); 585 + simp.simp_enabled = false; 586 + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); 587 + memunmap(*msg_page); 588 + 589 + /* Disable global synic bit */ 590 + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); 591 + sctrl.enable = 0; 592 + hv_set_non_nested_msr(HV_MSR_SCONTROL, 
sctrl.as_uint64); 593 + 594 + return 0; 595 + } 596 + 597 + int 598 + mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data, 599 + u64 gpa, u64 val, u64 flags) 600 + { 601 + struct hv_connection_info connection_info = { 0 }; 602 + union hv_connection_id connection_id = { 0 }; 603 + struct port_table_info *port_table_info; 604 + struct hv_port_info port_info = { 0 }; 605 + union hv_port_id port_id = { 0 }; 606 + int ret; 607 + 608 + port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL); 609 + if (!port_table_info) 610 + return -ENOMEM; 611 + 612 + port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL; 613 + port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb; 614 + port_table_info->hv_port_doorbell.data = data; 615 + ret = mshv_portid_alloc(port_table_info); 616 + if (ret < 0) { 617 + kfree(port_table_info); 618 + return ret; 619 + } 620 + 621 + port_id.u.id = ret; 622 + port_info.port_type = HV_PORT_TYPE_DOORBELL; 623 + port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX; 624 + port_info.doorbell_port_info.target_vp = HV_ANY_VP; 625 + ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id, 626 + &port_info, 627 + 0, 0, NUMA_NO_NODE); 628 + 629 + if (ret < 0) { 630 + mshv_portid_free(port_id.u.id); 631 + return ret; 632 + } 633 + 634 + connection_id.u.id = port_id.u.id; 635 + connection_info.port_type = HV_PORT_TYPE_DOORBELL; 636 + connection_info.doorbell_connection_info.gpa = gpa; 637 + connection_info.doorbell_connection_info.trigger_value = val; 638 + connection_info.doorbell_connection_info.flags = flags; 639 + 640 + ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id, 641 + connection_id, &connection_info, 0, NUMA_NO_NODE); 642 + if (ret < 0) { 643 + hv_call_delete_port(hv_current_partition_id, port_id); 644 + mshv_portid_free(port_id.u.id); 645 + return ret; 646 + } 647 + 648 + // lets use the port_id as the doorbell_id 649 + return port_id.u.id; 650 + } 651 
+ 652 + void 653 + mshv_unregister_doorbell(u64 partition_id, int doorbell_portid) 654 + { 655 + union hv_port_id port_id = { 0 }; 656 + union hv_connection_id connection_id = { 0 }; 657 + 658 + connection_id.u.id = doorbell_portid; 659 + hv_call_disconnect_port(partition_id, connection_id); 660 + 661 + port_id.u.id = doorbell_portid; 662 + hv_call_delete_port(hv_current_partition_id, port_id); 663 + 664 + mshv_portid_free(doorbell_portid); 665 + }
+291
include/uapi/linux/mshv.h
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * Userspace interfaces for /dev/mshv* devices and derived fds
 *
 * The file is organised in sections, one per device or derived file
 * descriptor. Each section first declares its data structures and then, at
 * its end, the IOCTLs that use them. Keeping the IOCTLs grouped by device/fd
 * lets new ones be appended with a monotonically increasing number.
 */
#ifndef _UAPI_LINUX_MSHV_H
#define _UAPI_LINUX_MSHV_H

#include <linux/types.h>

#define MSHV_IOCTL	0xB8

/*
 *******************************************
 * Entry point to main VMM APIs: /dev/mshv *
 *******************************************
 */

/* Bit positions for mshv_create_partition.pt_flags */
enum {
	MSHV_PT_BIT_LAPIC		= 0,
	MSHV_PT_BIT_X2APIC		= 1,
	MSHV_PT_BIT_GPA_SUPER_PAGES	= 2,
	MSHV_PT_BIT_COUNT		= 3,
};

#define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1)

/* Values for mshv_create_partition.pt_isolation */
enum {
	MSHV_PT_ISOLATION_NONE		= 0,
	MSHV_PT_ISOLATION_COUNT		= 1,
};

/**
 * struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION
 * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_*
 * @pt_isolation: MSHV_PT_ISOLATION_*
 *
 * The ioctl returns a file descriptor that serves as a handle to a guest
 * partition. The partition is not yet initialized in the hypervisor at that
 * point. Some operations, e.g. setting so-called "early" partition
 * properties, must be done while it is in this state. Afterwards the
 * partition can be initialized with MSHV_INITIALIZE_PARTITION.
 */
struct mshv_create_partition {
	__u64 pt_flags;
	__u64 pt_isolation;
};

/* /dev/mshv */
#define MSHV_CREATE_PARTITION	_IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition)

/*
 ************************
 * Child partition APIs *
 ************************
 */

/* Argument for MSHV_CREATE_VP */
struct mshv_create_vp {
	__u32 vp_index;
};

/* Bit positions for mshv_user_mem_region.flags */
enum {
	MSHV_SET_MEM_BIT_WRITABLE	= 0,
	MSHV_SET_MEM_BIT_EXECUTABLE	= 1,
	MSHV_SET_MEM_BIT_UNMAP		= 2,
	MSHV_SET_MEM_BIT_COUNT		= 3
};

#define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1)

/* The hypervisor's "native" page size */
#define MSHV_HV_PAGE_SIZE	0x1000

/**
 * struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY
 * @size: Size of the memory region (bytes). Must be aligned to
 *        MSHV_HV_PAGE_SIZE
 * @guest_pfn: Base guest page number to map
 * @userspace_addr: Base address of userspace memory. Must be aligned to
 *                  MSHV_HV_PAGE_SIZE
 * @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP)
 *         is set, ignore other bits.
 * @rsvd: MBZ
 *
 * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA).
 * Mappings can't overlap in GPA space or userspace.
 * To unmap, these fields must match an existing mapping.
 */
struct mshv_user_mem_region {
	__u64 size;
	__u64 guest_pfn;
	__u64 userspace_addr;
	__u8 flags;
	__u8 rsvd[7];
};

/* Bit positions covered by MSHV_IRQFD_FLAGS_MASK */
enum {
	MSHV_IRQFD_BIT_DEASSIGN		= 0,
	MSHV_IRQFD_BIT_RESAMPLE		= 1,
	MSHV_IRQFD_BIT_COUNT		= 2,
};

#define MSHV_IRQFD_FLAGS_MASK	((1 << MSHV_IRQFD_BIT_COUNT) - 1)

/* Argument for MSHV_IRQFD */
struct mshv_user_irqfd {
	__s32 fd;
	__s32 resamplefd;
	__u32 gsi;
	__u32 flags;
};

/* Bit positions covered by MSHV_IOEVENTFD_FLAGS_MASK */
enum {
	MSHV_IOEVENTFD_BIT_DATAMATCH	= 0,
	MSHV_IOEVENTFD_BIT_PIO		= 1,
	MSHV_IOEVENTFD_BIT_DEASSIGN	= 2,
	MSHV_IOEVENTFD_BIT_COUNT	= 3,
};

#define MSHV_IOEVENTFD_FLAGS_MASK	((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1)

/* Argument for MSHV_IOEVENTFD */
struct mshv_user_ioeventfd {
	__u64 datamatch;
	__u64 addr;	   /* legal pio/mmio address */
	__u32 len;	   /* 1, 2, 4, or 8 bytes */
	__s32 fd;
	__u32 flags;
	__u8  rsvd[4];
};

/* One element of mshv_user_irq_table.entries */
struct mshv_user_irq_entry {
	__u32 gsi;
	__u32 address_lo;
	__u32 address_hi;
	__u32 data;
};

/* Argument for MSHV_SET_MSI_ROUTING; header followed by the entries */
struct mshv_user_irq_table {
	__u32 nr;
	__u32 rsvd; /* MBZ */
	struct mshv_user_irq_entry entries[];
};

/* Values for mshv_gpap_access_bitmap.access_type */
enum {
	MSHV_GPAP_ACCESS_TYPE_ACCESSED	= 0,
	MSHV_GPAP_ACCESS_TYPE_DIRTY	= 1,
	MSHV_GPAP_ACCESS_TYPE_COUNT	= 2	/* Count of enum members */
};

/* Values for mshv_gpap_access_bitmap.access_op */
enum {
	MSHV_GPAP_ACCESS_OP_NOOP	= 0,
	MSHV_GPAP_ACCESS_OP_CLEAR	= 1,
	MSHV_GPAP_ACCESS_OP_SET		= 2,
	MSHV_GPAP_ACCESS_OP_COUNT	= 3	/* Count of enum members */
};

/**
 * struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP
 * @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the
 *               bitmap
 * @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all
 *             the access states in the range, after retrieving the current
 *             states.
 * @rsvd: MBZ
 * @page_count: Number of pages
 * @gpap_base: Base gpa page number
 * @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes
 *
 * Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest
 * memory, and optionally clear or set the bits.
 */
struct mshv_gpap_access_bitmap {
	__u8 access_type;
	__u8 access_op;
	__u8 rsvd[6];
	__u64 page_count;
	__u64 gpap_base;
	__u64 bitmap_ptr;
};

/**
 * struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL
 * @code: Hypercall code (HVCALL_*)
 * @reps: in: Rep count ('repcount')
 *	  out: Reps completed ('repcomp'). MBZ unless rep hvcall
 * @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE
 * @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0
 * @status: in: MBZ
 *	    out: HV_STATUS_* from hypercall
 * @rsvd: MBZ
 * @in_ptr: Input data buffer (struct hv_input_*). If used with partition or
 *	    vp fd, partition id field is populated by kernel.
 * @out_ptr: Output data buffer (optional)
 */
struct mshv_root_hvcall {
	__u16 code;
	__u16 reps;
	__u16 in_sz;
	__u16 out_sz;
	__u16 status;
	__u8 rsvd[6];
	__u64 in_ptr;
	__u64 out_ptr;
};

/* Partition fds created with MSHV_CREATE_PARTITION */
#define MSHV_INITIALIZE_PARTITION	_IO(MSHV_IOCTL, 0x00)
#define MSHV_CREATE_VP			_IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp)
#define MSHV_SET_GUEST_MEMORY		_IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region)
#define MSHV_IRQFD			_IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd)
#define MSHV_IOEVENTFD			_IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd)
#define MSHV_SET_MSI_ROUTING		_IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table)
#define MSHV_GET_GPAP_ACCESS_BITMAP	_IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap)
/* Generic hypercall */
#define MSHV_ROOT_HVCALL		_IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)

/*
 ********************************
 * VP APIs for child partitions *
 ********************************
 */

#define MSHV_RUN_VP_BUF_SZ 256

/*
 * VP state pages may be mapped to userspace via mmap().
 * The state page is selected through the mmap offset: pass the wanted
 * MSHV_VP_MMAP_OFFSET_ value multiplied by the system page size.
 * e.g.
 * long page_size = sysconf(_SC_PAGE_SIZE);
 * void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE,
 *                       MAP_SHARED, vp_fd,
 *                       MSHV_VP_MMAP_OFFSET_REGISTERS * page_size);
 */
enum {
	MSHV_VP_MMAP_OFFSET_REGISTERS		= 0,
	MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE	= 1,
	MSHV_VP_MMAP_OFFSET_GHCB		= 2,
	MSHV_VP_MMAP_OFFSET_COUNT		= 3
};

/**
 * struct mshv_run_vp - argument for MSHV_RUN_VP
 * @msg_buf: On success, the intercept message is copied here. It can be
 *	     interpreted using the relevant hypervisor definitions.
 */
struct mshv_run_vp {
	__u8 msg_buf[MSHV_RUN_VP_BUF_SZ];
};

/* Values for mshv_get_set_vp_state.type */
enum {
	MSHV_VP_STATE_LAPIC		= 0,	/* Local interrupt controller state (either arch) */
	MSHV_VP_STATE_XSAVE		= 1,	/* XSAVE data in compacted form (x86_64) */
	MSHV_VP_STATE_SIMP		= 2,
	MSHV_VP_STATE_SIEFP		= 3,
	MSHV_VP_STATE_SYNTHETIC_TIMERS	= 4,
	MSHV_VP_STATE_COUNT		= 5,
};

/**
 * struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE
 * @type: MSHV_VP_STATE_*
 * @rsvd: MBZ
 * @buf_sz: in: 4k page-aligned size of buffer
 *          out: Actual size of data (on EINVAL, check this to see if buffer
 *               was too small)
 * @buf_ptr: 4k page-aligned data buffer
 */
struct mshv_get_set_vp_state {
	__u8 type;
	__u8 rsvd[3];
	__u32 buf_sz;
	__u64 buf_ptr;
};

/* VP fds created with MSHV_CREATE_VP */
#define MSHV_RUN_VP			_IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp)
#define MSHV_GET_VP_STATE		_IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state)
#define MSHV_SET_VP_STATE		_IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state)
/*
 * Generic hypercall
 * Defined above in partition IOCTLs, avoid redefining it here
 * #define MSHV_ROOT_HVCALL			_IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
 */

#endif