Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs: split generic and aio kiocb

Most callers in the kernel want to perform synchronous file I/O, but
still have to bloat the stack with a full struct kiocb. Split out
the parts needed in filesystem code from those in the aio code, and
only allocate those needed to pass down arguments on the stack. The
aio code embeds the generic iocb in the one it allocates and can
easily get back to it by using container_of.

Also add a ->ki_complete method to struct kiocb, this is used to call
into the aio code and thus removes the dependency on aio for filesystems
implementing asynchronous operations. It will also allow other callers
to substitute their own completion callback.

We also add a new ->ki_flags field to work around the nasty layering
violation recently introduced in commit 5e33f6 ("usb: gadget: ffs: add
eventfd notification about ffs events").

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Christoph Hellwig and committed by
Al Viro
04b2fa9f 599bd19b

+81 -77
+3 -2
drivers/usb/gadget/function/f_fs.c
··· 655 655 unuse_mm(io_data->mm); 656 656 } 657 657 658 - aio_complete(io_data->kiocb, ret, ret); 658 + io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); 659 659 660 - if (io_data->ffs->ffs_eventfd && !io_data->kiocb->ki_eventfd) 660 + if (io_data->ffs->ffs_eventfd && 661 + !(io_data->kiocb->ki_flags & IOCB_EVENTFD)) 661 662 eventfd_signal(io_data->ffs->ffs_eventfd, 1); 662 663 663 664 usb_ep_free_request(io_data->ep, io_data->req);
+3 -2
drivers/usb/gadget/legacy/inode.c
··· 469 469 ret = -EFAULT; 470 470 471 471 /* completing the iocb can drop the ctx and mm, don't touch mm after */ 472 - aio_complete(iocb, ret, ret); 472 + iocb->ki_complete(iocb, ret, ret); 473 473 474 474 kfree(priv->buf); 475 475 kfree(priv->to_free); ··· 497 497 kfree(priv); 498 498 iocb->private = NULL; 499 499 /* aio_complete() reports bytes-transferred _and_ faults */ 500 - aio_complete(iocb, req->actual ? req->actual : req->status, 500 + 501 + iocb->ki_complete(iocb, req->actual ? req->actual : req->status, 501 502 req->status); 502 503 } else { 503 504 /* ep_copy_to_user() won't report both; we hide some faults */
+65 -29
fs/aio.c
··· 151 151 unsigned id; 152 152 }; 153 153 154 + /* 155 + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either 156 + * cancelled or completed (this makes a certain amount of sense because 157 + * successful cancellation - io_cancel() - does deliver the completion to 158 + * userspace). 159 + * 160 + * And since most things don't implement kiocb cancellation and we'd really like 161 + * kiocb completion to be lockless when possible, we use ki_cancel to 162 + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED 163 + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). 164 + */ 165 + #define KIOCB_CANCELLED ((void *) (~0ULL)) 166 + 167 + struct aio_kiocb { 168 + struct kiocb common; 169 + 170 + struct kioctx *ki_ctx; 171 + kiocb_cancel_fn *ki_cancel; 172 + 173 + struct iocb __user *ki_user_iocb; /* user's aiocb */ 174 + __u64 ki_user_data; /* user's data for completion */ 175 + 176 + struct list_head ki_list; /* the aio core uses this 177 + * for cancellation */ 178 + 179 + /* 180 + * If the aio_resfd field of the userspace iocb is not zero, 181 + * this is the underlying eventfd context to deliver events to. 
182 + */ 183 + struct eventfd_ctx *ki_eventfd; 184 + }; 185 + 154 186 /*------ sysctl variables----*/ 155 187 static DEFINE_SPINLOCK(aio_nr_lock); 156 188 unsigned long aio_nr; /* current system wide number of aio requests */ ··· 252 220 if (IS_ERR(aio_mnt)) 253 221 panic("Failed to create aio fs mount."); 254 222 255 - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 223 + kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 256 224 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 257 225 258 226 pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); ··· 512 480 #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) 513 481 #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) 514 482 515 - void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) 483 + void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) 516 484 { 485 + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); 517 486 struct kioctx *ctx = req->ki_ctx; 518 487 unsigned long flags; 519 488 ··· 529 496 } 530 497 EXPORT_SYMBOL(kiocb_set_cancel_fn); 531 498 532 - static int kiocb_cancel(struct kiocb *kiocb) 499 + static int kiocb_cancel(struct aio_kiocb *kiocb) 533 500 { 534 501 kiocb_cancel_fn *old, *cancel; 535 502 ··· 547 514 cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); 548 515 } while (cancel != old); 549 516 550 - return cancel(kiocb); 517 + return cancel(&kiocb->common); 551 518 } 552 519 553 520 static void free_ioctx(struct work_struct *work) ··· 583 550 static void free_ioctx_users(struct percpu_ref *ref) 584 551 { 585 552 struct kioctx *ctx = container_of(ref, struct kioctx, users); 586 - struct kiocb *req; 553 + struct aio_kiocb *req; 587 554 588 555 spin_lock_irq(&ctx->ctx_lock); 589 556 590 557 while (!list_empty(&ctx->active_reqs)) { 591 558 req = list_first_entry(&ctx->active_reqs, 592 - struct kiocb, ki_list); 559 + 
struct aio_kiocb, ki_list); 593 560 594 561 list_del_init(&req->ki_list); 595 562 kiocb_cancel(req); ··· 965 932 * Allocate a slot for an aio request. 966 933 * Returns NULL if no requests are free. 967 934 */ 968 - static inline struct kiocb *aio_get_req(struct kioctx *ctx) 935 + static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) 969 936 { 970 - struct kiocb *req; 937 + struct aio_kiocb *req; 971 938 972 939 if (!get_reqs_available(ctx)) { 973 940 user_refill_reqs_available(ctx); ··· 988 955 return NULL; 989 956 } 990 957 991 - static void kiocb_free(struct kiocb *req) 958 + static void kiocb_free(struct aio_kiocb *req) 992 959 { 993 - if (req->ki_filp) 994 - fput(req->ki_filp); 960 + if (req->common.ki_filp) 961 + fput(req->common.ki_filp); 995 962 if (req->ki_eventfd != NULL) 996 963 eventfd_ctx_put(req->ki_eventfd); 997 964 kmem_cache_free(kiocb_cachep, req); ··· 1027 994 /* aio_complete 1028 995 * Called when the io request on the given iocb is complete. 1029 996 */ 1030 - void aio_complete(struct kiocb *iocb, long res, long res2) 997 + static void aio_complete(struct kiocb *kiocb, long res, long res2) 1031 998 { 999 + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); 1032 1000 struct kioctx *ctx = iocb->ki_ctx; 1033 1001 struct aio_ring *ring; 1034 1002 struct io_event *ev_page, *event; ··· 1043 1009 * ref, no other paths have a way to get another ref 1044 1010 * - the sync task helpfully left a reference to itself in the iocb 1045 1011 */ 1046 - BUG_ON(is_sync_kiocb(iocb)); 1012 + BUG_ON(is_sync_kiocb(kiocb)); 1047 1013 1048 1014 if (iocb->ki_list.next) { 1049 1015 unsigned long flags; ··· 1069 1035 ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); 1070 1036 event = ev_page + pos % AIO_EVENTS_PER_PAGE; 1071 1037 1072 - event->obj = (u64)(unsigned long)iocb->ki_obj.user; 1038 + event->obj = (u64)(unsigned long)iocb->ki_user_iocb; 1073 1039 event->data = iocb->ki_user_data; 1074 1040 event->res = res; 1075 
1041 event->res2 = res2; ··· 1078 1044 flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); 1079 1045 1080 1046 pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", 1081 - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, 1047 + ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, 1082 1048 res, res2); 1083 1049 1084 1050 /* after flagging the request as done, we ··· 1125 1091 1126 1092 percpu_ref_put(&ctx->reqs); 1127 1093 } 1128 - EXPORT_SYMBOL(aio_complete); 1129 1094 1130 1095 /* aio_read_events_ring 1131 1096 * Pull an event off of the ioctx's event ring. Returns the number of ··· 1513 1480 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1514 1481 struct iocb *iocb, bool compat) 1515 1482 { 1516 - struct kiocb *req; 1483 + struct aio_kiocb *req; 1517 1484 ssize_t ret; 1518 1485 1519 1486 /* enforce forwards compatibility on users */ ··· 1536 1503 if (unlikely(!req)) 1537 1504 return -EAGAIN; 1538 1505 1539 - req->ki_filp = fget(iocb->aio_fildes); 1540 - if (unlikely(!req->ki_filp)) { 1506 + req->common.ki_filp = fget(iocb->aio_fildes); 1507 + if (unlikely(!req->common.ki_filp)) { 1541 1508 ret = -EBADF; 1542 1509 goto out_put_req; 1543 1510 } 1511 + req->common.ki_pos = iocb->aio_offset; 1512 + req->common.ki_complete = aio_complete; 1513 + req->common.ki_flags = 0; 1544 1514 1545 1515 if (iocb->aio_flags & IOCB_FLAG_RESFD) { 1546 1516 /* ··· 1558 1522 req->ki_eventfd = NULL; 1559 1523 goto out_put_req; 1560 1524 } 1525 + 1526 + req->common.ki_flags |= IOCB_EVENTFD; 1561 1527 } 1562 1528 1563 1529 ret = put_user(KIOCB_KEY, &user_iocb->aio_key); ··· 1568 1530 goto out_put_req; 1569 1531 } 1570 1532 1571 - req->ki_obj.user = user_iocb; 1533 + req->ki_user_iocb = user_iocb; 1572 1534 req->ki_user_data = iocb->aio_data; 1573 - req->ki_pos = iocb->aio_offset; 1574 1535 1575 - ret = aio_run_iocb(req, iocb->aio_lio_opcode, 1536 + ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode, 1576 1537 (char __user *)(unsigned 
long)iocb->aio_buf, 1577 1538 iocb->aio_nbytes, 1578 1539 compat); ··· 1660 1623 /* lookup_kiocb 1661 1624 * Finds a given iocb for cancellation. 1662 1625 */ 1663 - static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, 1664 - u32 key) 1626 + static struct aio_kiocb * 1627 + lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) 1665 1628 { 1666 - struct list_head *pos; 1629 + struct aio_kiocb *kiocb; 1667 1630 1668 1631 assert_spin_locked(&ctx->ctx_lock); 1669 1632 ··· 1671 1634 return NULL; 1672 1635 1673 1636 /* TODO: use a hash or array, this sucks. */ 1674 - list_for_each(pos, &ctx->active_reqs) { 1675 - struct kiocb *kiocb = list_kiocb(pos); 1676 - if (kiocb->ki_obj.user == iocb) 1637 + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { 1638 + if (kiocb->ki_user_iocb == iocb) 1677 1639 return kiocb; 1678 1640 } 1679 1641 return NULL; ··· 1692 1656 struct io_event __user *, result) 1693 1657 { 1694 1658 struct kioctx *ctx; 1695 - struct kiocb *kiocb; 1659 + struct aio_kiocb *kiocb; 1696 1660 u32 key; 1697 1661 int ret; 1698 1662
+2 -2
fs/direct-io.c
··· 265 265 ret = err; 266 266 } 267 267 268 - aio_complete(dio->iocb, ret, 0); 268 + dio->iocb->ki_complete(dio->iocb, ret, 0); 269 269 } 270 270 271 271 kmem_cache_free(dio_cache, dio); ··· 1056 1056 * operation. AIO can if it was a broken operation described above or 1057 1057 * in fact if all the bios race to complete before we get here. In 1058 1058 * that case dio_complete() translates the EIOCBQUEUED into the proper 1059 - * return code that the caller will hand to aio_complete(). 1059 + * return code that the caller will hand to ->complete(). 1060 1060 * 1061 1061 * This is managed by the bio_lock instead of being an atomic_t so that 1062 1062 * completion paths can drop their ref and use the remaining count to
+1 -1
fs/fuse/file.c
··· 584 584 spin_unlock(&fc->lock); 585 585 } 586 586 587 - aio_complete(io->iocb, res, 0); 587 + io->iocb->ki_complete(io->iocb, res, 0); 588 588 kfree(io); 589 589 } 590 590 }
+1 -1
fs/nfs/direct.c
··· 393 393 long res = (long) dreq->error; 394 394 if (!res) 395 395 res = (long) dreq->count; 396 - aio_complete(dreq->iocb, res, 0); 396 + dreq->iocb->ki_complete(dreq->iocb, res, 0); 397 397 } 398 398 399 399 complete_all(&dreq->completion);
+6 -40
include/linux/aio.h
··· 14 14 15 15 #define KIOCB_KEY 0 16 16 17 - /* 18 - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either 19 - * cancelled or completed (this makes a certain amount of sense because 20 - * successful cancellation - io_cancel() - does deliver the completion to 21 - * userspace). 22 - * 23 - * And since most things don't implement kiocb cancellation and we'd really like 24 - * kiocb completion to be lockless when possible, we use ki_cancel to 25 - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED 26 - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). 27 - */ 28 - #define KIOCB_CANCELLED ((void *) (~0ULL)) 29 - 30 17 typedef int (kiocb_cancel_fn)(struct kiocb *); 18 + 19 + #define IOCB_EVENTFD (1 << 0) 31 20 32 21 struct kiocb { 33 22 struct file *ki_filp; 34 - struct kioctx *ki_ctx; /* NULL for sync ops */ 35 - kiocb_cancel_fn *ki_cancel; 36 - void *private; 37 - 38 - union { 39 - void __user *user; 40 - } ki_obj; 41 - 42 - __u64 ki_user_data; /* user's data for completion */ 43 23 loff_t ki_pos; 44 - 45 - struct list_head ki_list; /* the aio core uses this 46 - * for cancellation */ 47 - 48 - /* 49 - * If the aio_resfd field of the userspace iocb is not zero, 50 - * this is the underlying eventfd context to deliver events to. 
51 - */ 52 - struct eventfd_ctx *ki_eventfd; 24 + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); 25 + void *private; 26 + int ki_flags; 53 27 }; 54 28 55 29 static inline bool is_sync_kiocb(struct kiocb *kiocb) 56 30 { 57 - return kiocb->ki_ctx == NULL; 31 + return kiocb->ki_complete == NULL; 58 32 } 59 33 60 34 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) 61 35 { 62 36 *kiocb = (struct kiocb) { 63 - .ki_ctx = NULL, 64 37 .ki_filp = filp, 65 38 }; 66 39 } 67 40 68 41 /* prototypes */ 69 42 #ifdef CONFIG_AIO 70 - extern void aio_complete(struct kiocb *iocb, long res, long res2); 71 43 struct mm_struct; 72 44 extern void exit_aio(struct mm_struct *mm); 73 45 extern long do_io_submit(aio_context_t ctx_id, long nr, 74 46 struct iocb __user *__user *iocbpp, bool compat); 75 47 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); 76 48 #else 77 - static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } 78 49 struct mm_struct; 79 50 static inline void exit_aio(struct mm_struct *mm) { } 80 51 static inline long do_io_submit(aio_context_t ctx_id, long nr, ··· 54 83 static inline void kiocb_set_cancel_fn(struct kiocb *req, 55 84 kiocb_cancel_fn *cancel) { } 56 85 #endif /* CONFIG_AIO */ 57 - 58 - static inline struct kiocb *list_kiocb(struct list_head *h) 59 - { 60 - return list_entry(h, struct kiocb, ki_list); 61 - } 62 86 63 87 /* for sysctl: */ 64 88 extern unsigned long aio_nr;