// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQCHIP

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
	return true;
}

static void
irqfd_inject(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	if (!irqfd->resampler) {
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
			    false);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
			    false);
	} else
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    irqfd->gsi, 1, false);
}

static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
{
	struct kvm_kernel_irqfd *irqfd;

	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
		eventfd_signal(irqfd->resamplefd);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	struct kvm *kvm;
	int idx;

	resampler = container_of(kian,
				 struct kvm_kernel_irqfd_resampler, notifier);
	kvm = resampler->kvm;

	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		    resampler->notifier.gsi, 0, false);

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_resampler_notify(resampler);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
	struct kvm *kvm = resampler->kvm;

	mutex_lock(&kvm->irqfds.resampler_lock);

	list_del_rcu(&irqfd->resampler_link);

	if (list_empty(&resampler->list)) {
		list_del_rcu(&resampler->link);
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
		/*
		 * synchronize_srcu_expedited(&kvm->irq_srcu) already called
		 * in kvm_unregister_irq_ack_notifier().
		 */
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    resampler->notifier.gsi, 0, false);
		kfree(resampler);
	} else {
		synchronize_srcu_expedited(&kvm->irq_srcu);
	}

	mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, shutdown);
	struct kvm *kvm = irqfd->kvm;
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path. */
	synchronize_srcu_expedited(&kvm->irq_srcu);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed
	 */
	flush_work(&irqfd->inject);

	if (irqfd->resampler) {
		irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}


static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
	/*
	 * Assert that either irqfds.lock or SRCU is held, as irqfds.lock must
	 * be held to prevent false positives (on the irqfd being active), and
	 * while false negatives are impossible as irqfds are never added back
	 * to the list once they're deactivated, the caller must at least hold
	 * SRCU to guard against routing changes if the irqfd is deactivated.
	 */
	lockdep_assert_once(lockdep_is_held(&irqfd->kvm->irqfds.lock) ||
			    srcu_read_lock_held(&irqfd->kvm->irq_srcu));

	return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 */
static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
	lockdep_assert_held(&irqfd->kvm->irqfds.lock);

	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
				struct kvm_kernel_irq_routing_entry *irq,
				struct kvm *kvm, int irq_source_id,
				int level,
				bool line_status)
{
	return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(wait, struct kvm_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);
	struct kvm_kernel_irq_routing_entry irq;
	struct kvm *kvm = irqfd->kvm;
	unsigned seq;
	int idx;
	int ret = 0;

	if (flags & EPOLLIN) {
		/*
		 * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
		 * as KVM holds irqfds.lock when registering the irqfd with the
		 * eventfd.
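		 * Taking irqfds.lock here, under wqh->lock, would invert the
		 * locking order relative to kvm_irqfd_register(), which takes
		 * irqfds.lock before adding the irqfd to the waitqueue.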
		 */
		u64 cnt;
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);

		idx = srcu_read_lock(&kvm->irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
			irq = irqfd->irq_entry;
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));

		/*
		 * An event has been signaled, inject an interrupt unless the
		 * irqfd is being deassigned (isn't active), in which case the
		 * routing information may be stale (once the irqfd is removed
		 * from the list, it will stop receiving routing updates).
		 */
		if (unlikely(!irqfd_is_active(irqfd)) ||
		    kvm_arch_set_irq_inatomic(&irq, kvm,
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
					      false) == -EWOULDBLOCK)
			schedule_work(&irqfd->inject);
		srcu_read_unlock(&kvm->irq_srcu, idx);
		ret = 1;
	}

	if (flags & EPOLLHUP) {
		/* The eventfd is closing, detach from KVM */
		unsigned long iflags;

		/*
		 * Taking irqfds.lock is safe here, as KVM holds a reference to
		 * the eventfd when registering the irqfd, i.e. this path can't
		 * be reached while kvm_irqfd_add() is running.
		 */
		spin_lock_irqsave(&kvm->irqfds.lock, iflags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
	}

	return ret;
}

static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int n_entries;

	lockdep_assert_held(&kvm->irqfds.lock);

	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

	write_seqcount_begin(&irqfd->irq_entry_sc);

	e = entries;
	if (n_entries == 1)
		irqfd->irq_entry = *e;
	else
		irqfd->irq_entry.type = 0;

	write_seqcount_end(&irqfd->irq_entry_sc);
}

struct kvm_irqfd_pt {
	struct kvm_kernel_irqfd *irqfd;
	struct kvm *kvm;
	poll_table pt;
	int ret;
};

static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
			       poll_table *pt)
{
	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
	struct kvm_kernel_irqfd *irqfd = p->irqfd;
	struct kvm *kvm = p->kvm;

	/*
	 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
	 * and irqfds.items.  It does NOT protect registering with the eventfd.
	 */
	spin_lock_irq(&kvm->irqfds.lock);

	/*
	 * Initialize the routing information prior to adding the irqfd to the
	 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
	 * irqfd is registered.
	 */
	irqfd_update(kvm, irqfd);

	/*
	 * Add the irqfd as a priority waiter on the eventfd, with a custom
	 * wake-up handler, so that KVM *and only KVM* is notified whenever the
	 * underlying eventfd is signaled.
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);

	/*
	 * Temporarily lie to lockdep about holding irqfds.lock to avoid a
	 * false positive regarding potential deadlock with irqfd_wakeup()
	 * (see irqfd_wakeup() for details).
	 *
	 * Adding to the wait queue will fail if there is already a priority
	 * waiter, i.e. if the eventfd is associated with another irqfd (in any
	 * VM).  Note, kvm_irqfd_deassign() waits for all in-flight shutdown
	 * jobs to complete, i.e. ensures the irqfd has been removed from the
	 * eventfd's waitqueue before returning to userspace.
	 */
	spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
	p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
	spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
	if (p->ret)
		goto out;

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

out:
	spin_unlock_irq(&kvm->irqfds.lock);
}

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
				struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
				struct irq_bypass_consumer *cons)
{
}

void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
					  struct kvm_kernel_irq_routing_entry *old,
					  struct kvm_kernel_irq_routing_entry *new)
{

}
#endif

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd;
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	struct kvm_irqfd_pt irqfd_pt;
	int ret;
	__poll_t events;
	int idx;

	if (!kvm_arch_intc_initialized(kvm))
		return -EAGAIN;

	if (!kvm_arch_irqfd_allowed(kvm, args))
		return -EINVAL;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = args->gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

	CLASS(fd, f)(args->fd);
	if (fd_empty(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto out;
	}

	irqfd->eventfd = eventfd;

	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
		struct kvm_kernel_irqfd_resampler *resampler;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->resamplefd = resamplefd;
		INIT_LIST_HEAD(&irqfd->resampler_link);

		mutex_lock(&kvm->irqfds.resampler_lock);

		list_for_each_entry(resampler,
				    &kvm->irqfds.resampler_list, link) {
			if (resampler->notifier.gsi == irqfd->gsi) {
				irqfd->resampler = resampler;
				break;
			}
		}

		if (!irqfd->resampler) {
			resampler = kzalloc(sizeof(*resampler),
					    GFP_KERNEL_ACCOUNT);
			if (!resampler) {
				ret = -ENOMEM;
				mutex_unlock(&kvm->irqfds.resampler_lock);
				goto fail;
			}

			resampler->kvm = kvm;
			INIT_LIST_HEAD(&resampler->list);
			resampler->notifier.gsi = irqfd->gsi;
			resampler->notifier.irq_acked = irqfd_resampler_ack;
			INIT_LIST_HEAD(&resampler->link);

			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
			kvm_register_irq_ack_notifier(kvm,
						      &resampler->notifier);
			irqfd->resampler = resampler;
		}

		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
		synchronize_srcu_expedited(&kvm->irq_srcu);

		mutex_unlock(&kvm->irqfds.resampler_lock);
	}

	/*
	 * Set the irqfd routing and add it to KVM's list before registering
	 * the irqfd with the eventfd, so that the routing information is valid
	 * and stays valid, e.g. if there are GSI routing changes, prior to
	 * making the irqfd visible, i.e. before it might be signaled.
	 *
	 * Note, holding SRCU ensures a stable read of routing information, and
	 * also prevents irqfd_shutdown() from freeing the irqfd before it's
	 * fully initialized.
	 */
	idx = srcu_read_lock(&kvm->irq_srcu);

	/*
	 * Register the irqfd with the eventfd by polling on the eventfd, and
	 * simultaneously add the irqfd to KVM's list.  If there was an event
	 * pending on the eventfd prior to registering, manually trigger IRQ
	 * injection.
	 */
	irqfd_pt.irqfd = irqfd;
	irqfd_pt.kvm = kvm;
	init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);

	events = vfs_poll(fd_file(f), &irqfd_pt.pt);

	ret = irqfd_pt.ret;
	if (ret)
		goto fail_poll;

	if (events & EPOLLIN)
		schedule_work(&irqfd->inject);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
	if (kvm_arch_has_irq_bypass()) {
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
		if (ret)
			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
				irqfd->eventfd, ret);
	}
#endif

	srcu_read_unlock(&kvm->irq_srcu, idx);
	return 0;

fail_poll:
	srcu_read_unlock(&kvm->irq_srcu, idx);
fail:
	if (irqfd->resampler)
		irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

out:
	kfree(irqfd);
	return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	struct kvm_irq_ack_notifier *kian;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
					  link, srcu_read_lock_held(&kvm->irq_srcu))
			if (kian->gsi == gsi) {
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}

	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier);

void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
	struct kvm_irq_ack_notifier *kian;

	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
				  link, srcu_read_lock_held(&kvm->irq_srcu))
		if (kian->gsi == gsi)
			kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi, idx;

	trace_kvm_ack_irq(irqchip, pin);

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		kvm_notify_acked_gsi(kvm, gsi);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
{
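	/*
	 * Readers walk irq_ack_notifier_list under kvm->irq_srcu, so the
	 * synchronize_srcu_expedited() below guarantees no reader still sees
	 * @kian once this function returns.
	 */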
	mutex_lock(&kvm->irq_lock);
	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
	synchronize_srcu_expedited(&kvm->irq_srcu);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

/*
 * shutdown any irqfd's that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi)
			irqfd_deactivate(irqfd);
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
		return -EINVAL;

	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, args);

	return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released.  Shutdown all
 * irqfds that still remain open
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
		/* Under irqfds.lock, so can read irq_entry safely */
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

		irqfd_update(kvm, irqfd);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
		if (irqfd->producer)
			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
#endif
	}

	spin_unlock_irq(&kvm->irqfds.lock);
}

bool kvm_notify_irqfd_resampler(struct kvm *kvm,
				unsigned int irqchip,
				unsigned int pin)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1) {
		list_for_each_entry_srcu(resampler,
					 &kvm->irqfds.resampler_list, link,
					 srcu_read_lock_held(&kvm->irq_srcu)) {
			if (resampler->notifier.gsi == gsi) {
				irqfd_resampler_notify(resampler);
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}
		}
	}
	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances.
 * We need our own isolated queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void kvm_irqfd_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

struct _ioeventfd {
	struct list_head list;
	u64 addr;
	int length;
	struct eventfd_ctx *eventfd;
	u64 datamatch;
	struct kvm_io_device dev;
	u8 bus_idx;
	bool wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (addr != p->addr)
		/* address must be precise for a hit */
		return false;

	if (!p->length)
		/* length = 0 means only look at the address, so always a hit */
		return true;

	if (len != p->length)
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
		int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down.
 * We do not need to worry about locking; just nuke anything we have as
 * quickly as possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->bus_idx == p->bus_idx &&
		    _p->addr == p->addr &&
		    (!_p->length || !p->length ||
		     (_p->length == p->length &&
		      (_p->wildcard || p->wildcard ||
		       _p->datamatch == p->datamatch))))
			return true;

	return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
	if (flags & KVM_IOEVENTFD_FLAG_PIO)
		return KVM_PIO_BUS;
	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
		return KVM_VIRTIO_CCW_NOTIFY_BUS;
	return KVM_MMIO_BUS;
}

static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
				    enum kvm_bus bus_idx,
				    struct kvm_ioeventfd *args)
{

	struct eventfd_ctx *eventfd;
	struct _ioeventfd *p;
	int ret;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr = args->addr;
	p->bus_idx = bus_idx;
	p->length = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	mutex_lock(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
				      &p->dev);
	if (ret < 0)
		goto unlock_fail;

	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
	list_add_tail(&p->list, &kvm->ioeventfds);

	mutex_unlock(&kvm->slots_lock);

	return 0;

unlock_fail:
	mutex_unlock(&kvm->slots_lock);
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
			   struct kvm_ioeventfd *args)
{
	struct _ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct kvm_io_bus *bus;
	int ret = -ENOENT;
	bool wildcard;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

	mutex_lock(&kvm->slots_lock);

	list_for_each_entry(p, &kvm->ioeventfds, list) {
		if (p->bus_idx != bus_idx ||
		    p->eventfd != eventfd ||
		    p->addr != args->addr ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
		bus = kvm_get_bus(kvm, bus_idx);
		if (bus)
			bus->ioeventfd_count--;
		ret = 0;
		break;
	}

	mutex_unlock(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

	if (!args->len && bus_idx == KVM_MMIO_BUS)
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

	return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx;
	int ret;

	bus_idx = ioeventfd_bus_from_flags(args->flags);
	/* must be natural-word sized, or 0 to ignore length */
	switch (args->len) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	/* ioeventfd with no length can't be combined with DATAMATCH */
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
		return -EINVAL;

	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
	if (ret)
		goto fail;

	/* When length is ignored, MMIO is also put on a separate bus, for
	 * faster lookups.
	 */
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
		if (ret < 0)
			goto fast_fail;
	}

	return 0;

fast_fail:
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
	return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}

void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
	mutex_init(&kvm->irqfds.resampler_lock);
#endif
	INIT_LIST_HEAD(&kvm->ioeventfds);
}
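
/*
 * Illustrative userspace usage of the ioctls implemented above -- a minimal
 * sketch, not part of the kernel sources.  It assumes @vm_fd was obtained via
 * KVM_CREATE_VM and that GSI 5 and MMIO address 0xd0000000 (both arbitrary
 * example values) are wired up appropriately in the VM:
 *
 *	#include <sys/eventfd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/kvm.h>
 *
 *	// Writes to irq_efd now inject GSI 5 (handled by kvm_irqfd_assign()).
 *	int irq_efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
 *	struct kvm_irqfd irqfd = { .fd = irq_efd, .gsi = 5 };
 *	ioctl(vm_fd, KVM_IRQFD, &irqfd);
 *
 *	// Without KVM_IOEVENTFD_FLAG_DATAMATCH this is a wildcard match: any
 *	// 4-byte guest write to 0xd0000000 signals io_efd instead of exiting
 *	// to userspace (handled by kvm_assign_ioeventfd()).
 *	int io_efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
 *	struct kvm_ioeventfd ioeventfd = {
 *		.addr = 0xd0000000,
 *		.len  = 4,
 *		.fd   = io_efd,
 *	};
 *	ioctl(vm_fd, KVM_IOEVENTFD, &ioeventfd);
 */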