Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.3/io_uring-20190711' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
"This contains:

- Support for recvmsg/sendmsg as first class opcodes.

I don't envision going much further down this path, as there are
plans in progress to support potentially any system call in an
async fashion through io_uring. But I think it does make sense to
have certain core ops available directly, especially those that can
support a "try this non-blocking" flag/mode. (me)

- Handle generic short reads automatically.

This can happen fairly easily if parts of the buffered read are
cached. Since the application needs to issue another request for
the remainder, just do this internally and save kernel/user
roundtrip while providing a nicer more robust API. (me)

- Support for linked SQEs.

This allows SQEs to depend on each other, enabling an application
to e.g. queue a read-from-this-file, write-to-that-file pair. (me)

- Fix race in stopping SQ thread (Jackie)"

* tag 'for-5.3/io_uring-20190711' of git://git.kernel.dk/linux-block:
io_uring: fix io_sq_thread_stop running in front of io_sq_thread
io_uring: add support for recvmsg()
io_uring: add support for sendmsg()
io_uring: add support for sqe links
io_uring: punt short reads to async context
uio: make import_iovec()/compat_import_iovec() return bytes on success

+330 -76
+5 -4
fs/aio.c
··· 1479 1479 return 0; 1480 1480 } 1481 1481 1482 - static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, 1483 - bool vectored, bool compat, struct iov_iter *iter) 1482 + static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, 1483 + struct iovec **iovec, bool vectored, bool compat, 1484 + struct iov_iter *iter) 1484 1485 { 1485 1486 void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; 1486 1487 size_t len = iocb->aio_nbytes; ··· 1538 1537 return -EINVAL; 1539 1538 1540 1539 ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); 1541 - if (ret) 1540 + if (ret < 0) 1542 1541 return ret; 1543 1542 ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); 1544 1543 if (!ret) ··· 1566 1565 return -EINVAL; 1567 1566 1568 1567 ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); 1569 - if (ret) 1568 + if (ret < 0) 1570 1569 return ret; 1571 1570 ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); 1572 1571 if (!ret) {
+281 -57
fs/io_uring.c
··· 231 231 struct task_struct *sqo_thread; /* if using sq thread polling */ 232 232 struct mm_struct *sqo_mm; 233 233 wait_queue_head_t sqo_wait; 234 + struct completion sqo_thread_started; 234 235 235 236 struct { 236 237 /* CQ ring */ ··· 323 322 324 323 struct io_ring_ctx *ctx; 325 324 struct list_head list; 325 + struct list_head link_list; 326 326 unsigned int flags; 327 327 refcount_t refs; 328 328 #define REQ_F_NOWAIT 1 /* must not punt to workers */ ··· 332 330 #define REQ_F_SEQ_PREV 8 /* sequential with previous */ 333 331 #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ 334 332 #define REQ_F_IO_DRAINED 32 /* drain done */ 333 + #define REQ_F_LINK 64 /* linked sqes */ 334 + #define REQ_F_FAIL_LINK 128 /* fail rest of links */ 335 335 u64 user_data; 336 - u32 error; /* iopoll result from callback */ 336 + u32 result; 337 337 u32 sequence; 338 338 339 339 struct work_struct work; ··· 407 403 ctx->flags = p->flags; 408 404 init_waitqueue_head(&ctx->cq_wait); 409 405 init_completion(&ctx->ctx_done); 406 + init_completion(&ctx->sqo_thread_started); 410 407 mutex_init(&ctx->uring_lock); 411 408 init_waitqueue_head(&ctx->wait); 412 409 for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) { ··· 589 584 req->flags = 0; 590 585 /* one is dropped after submission, the other at completion */ 591 586 refcount_set(&req->refs, 2); 587 + req->result = 0; 592 588 return req; 593 589 out: 594 590 io_ring_drop_ctx_refs(ctx, 1); ··· 605 599 } 606 600 } 607 601 608 - static void io_free_req(struct io_kiocb *req) 602 + static void __io_free_req(struct io_kiocb *req) 609 603 { 610 604 if (req->file && !(req->flags & REQ_F_FIXED_FILE)) 611 605 fput(req->file); 612 606 io_ring_drop_ctx_refs(req->ctx, 1); 613 607 kmem_cache_free(req_cachep, req); 608 + } 609 + 610 + static void io_req_link_next(struct io_kiocb *req) 611 + { 612 + struct io_kiocb *nxt; 613 + 614 + /* 615 + * The list should never be empty when we are called here. 
But could 616 + * potentially happen if the chain is messed up, check to be on the 617 + * safe side. 618 + */ 619 + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); 620 + if (nxt) { 621 + list_del(&nxt->list); 622 + if (!list_empty(&req->link_list)) { 623 + INIT_LIST_HEAD(&nxt->link_list); 624 + list_splice(&req->link_list, &nxt->link_list); 625 + nxt->flags |= REQ_F_LINK; 626 + } 627 + 628 + INIT_WORK(&nxt->work, io_sq_wq_submit_work); 629 + queue_work(req->ctx->sqo_wq, &nxt->work); 630 + } 631 + } 632 + 633 + /* 634 + * Called if REQ_F_LINK is set, and we fail the head request 635 + */ 636 + static void io_fail_links(struct io_kiocb *req) 637 + { 638 + struct io_kiocb *link; 639 + 640 + while (!list_empty(&req->link_list)) { 641 + link = list_first_entry(&req->link_list, struct io_kiocb, list); 642 + list_del(&link->list); 643 + 644 + io_cqring_add_event(req->ctx, link->user_data, -ECANCELED); 645 + __io_free_req(link); 646 + } 647 + } 648 + 649 + static void io_free_req(struct io_kiocb *req) 650 + { 651 + /* 652 + * If LINK is set, we have dependent requests in this chain. If we 653 + * didn't fail this request, queue the first one up, moving any other 654 + * dependencies to the next request. In case of failure, fail the rest 655 + * of the chain. 656 + */ 657 + if (req->flags & REQ_F_LINK) { 658 + if (req->flags & REQ_F_FAIL_LINK) 659 + io_fail_links(req); 660 + else 661 + io_req_link_next(req); 662 + } 663 + 664 + __io_free_req(req); 614 665 } 615 666 616 667 static void io_put_req(struct io_kiocb *req) ··· 691 628 req = list_first_entry(done, struct io_kiocb, list); 692 629 list_del(&req->list); 693 630 694 - io_cqring_fill_event(ctx, req->user_data, req->error); 631 + io_cqring_fill_event(ctx, req->user_data, req->result); 695 632 (*nr_events)++; 696 633 697 634 if (refcount_dec_and_test(&req->refs)) { 698 635 /* If we're not using fixed files, we have to pair the 699 636 * completion part with the file put. 
Use regular 700 637 * completions for those, only batch free for fixed 701 - * file. 638 + * file and non-linked commands. 702 639 */ 703 - if (req->flags & REQ_F_FIXED_FILE) { 640 + if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == 641 + REQ_F_FIXED_FILE) { 704 642 reqs[to_free++] = req; 705 643 if (to_free == ARRAY_SIZE(reqs)) 706 644 io_free_req_many(ctx, reqs, &to_free); ··· 840 776 841 777 kiocb_end_write(kiocb); 842 778 779 + if ((req->flags & REQ_F_LINK) && res != req->result) 780 + req->flags |= REQ_F_FAIL_LINK; 843 781 io_cqring_add_event(req->ctx, req->user_data, res); 844 782 io_put_req(req); 845 783 } ··· 852 786 853 787 kiocb_end_write(kiocb); 854 788 855 - req->error = res; 789 + if ((req->flags & REQ_F_LINK) && res != req->result) 790 + req->flags |= REQ_F_FAIL_LINK; 791 + req->result = res; 856 792 if (res != -EAGAIN) 857 793 req->flags |= REQ_F_IOPOLL_COMPLETED; 858 794 } ··· 997 929 !kiocb->ki_filp->f_op->iopoll) 998 930 return -EOPNOTSUPP; 999 931 1000 - req->error = 0; 1001 932 kiocb->ki_flags |= IOCB_HIPRI; 1002 933 kiocb->ki_complete = io_complete_rw_iopoll; 1003 934 } else { ··· 1068 1001 return 0; 1069 1002 } 1070 1003 1071 - static int io_import_iovec(struct io_ring_ctx *ctx, int rw, 1072 - const struct sqe_submit *s, struct iovec **iovec, 1073 - struct iov_iter *iter) 1004 + static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, 1005 + const struct sqe_submit *s, struct iovec **iovec, 1006 + struct iov_iter *iter) 1074 1007 { 1075 1008 const struct io_uring_sqe *sqe = s->sqe; 1076 1009 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); ··· 1088 1021 opcode = READ_ONCE(sqe->opcode); 1089 1022 if (opcode == IORING_OP_READ_FIXED || 1090 1023 opcode == IORING_OP_WRITE_FIXED) { 1091 - int ret = io_import_fixed(ctx, rw, sqe, iter); 1024 + ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); 1092 1025 *iovec = NULL; 1093 1026 return ret; 1094 1027 } ··· 1154 1087 struct iov_iter iter; 1155 1088 struct file *file; 1156 1089 size_t 
iov_count; 1157 - int ret; 1090 + ssize_t read_size, ret; 1158 1091 1159 1092 ret = io_prep_rw(req, s, force_nonblock); 1160 1093 if (ret) ··· 1167 1100 return -EINVAL; 1168 1101 1169 1102 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); 1170 - if (ret) 1103 + if (ret < 0) 1171 1104 return ret; 1105 + 1106 + read_size = ret; 1107 + if (req->flags & REQ_F_LINK) 1108 + req->result = read_size; 1172 1109 1173 1110 iov_count = iov_iter_count(&iter); 1174 1111 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); 1175 1112 if (!ret) { 1176 1113 ssize_t ret2; 1177 1114 1178 - /* Catch -EAGAIN return for forced non-blocking submission */ 1179 1115 ret2 = call_read_iter(file, kiocb, &iter); 1116 + /* 1117 + * In case of a short read, punt to async. This can happen 1118 + * if we have data partially cached. Alternatively we can 1119 + * return the short read, in which case the application will 1120 + * need to issue another SQE and wait for it. That SQE will 1121 + * need async punt anyway, so it's more efficient to do it 1122 + * here. 1123 + */ 1124 + if (force_nonblock && ret2 > 0 && ret2 < read_size) 1125 + ret2 = -EAGAIN; 1126 + /* Catch -EAGAIN return for forced non-blocking submission */ 1180 1127 if (!force_nonblock || ret2 != -EAGAIN) { 1181 1128 io_rw_done(kiocb, ret2); 1182 1129 } else { ··· 1215 1134 struct iov_iter iter; 1216 1135 struct file *file; 1217 1136 size_t iov_count; 1218 - int ret; 1137 + ssize_t ret; 1219 1138 1220 1139 ret = io_prep_rw(req, s, force_nonblock); 1221 1140 if (ret) ··· 1228 1147 return -EINVAL; 1229 1148 1230 1149 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); 1231 - if (ret) 1150 + if (ret < 0) 1232 1151 return ret; 1152 + 1153 + if (req->flags & REQ_F_LINK) 1154 + req->result = ret; 1233 1155 1234 1156 iov_count = iov_iter_count(&iter); 1235 1157 ··· 1337 1253 end > 0 ? 
end : LLONG_MAX, 1338 1254 fsync_flags & IORING_FSYNC_DATASYNC); 1339 1255 1256 + if (ret < 0 && (req->flags & REQ_F_LINK)) 1257 + req->flags |= REQ_F_FAIL_LINK; 1340 1258 io_cqring_add_event(req->ctx, sqe->user_data, ret); 1341 1259 io_put_req(req); 1342 1260 return 0; ··· 1383 1297 1384 1298 ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); 1385 1299 1300 + if (ret < 0 && (req->flags & REQ_F_LINK)) 1301 + req->flags |= REQ_F_FAIL_LINK; 1386 1302 io_cqring_add_event(req->ctx, sqe->user_data, ret); 1387 1303 io_put_req(req); 1388 1304 return 0; 1305 + } 1306 + 1307 + #if defined(CONFIG_NET) 1308 + static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1309 + bool force_nonblock, 1310 + long (*fn)(struct socket *, struct user_msghdr __user *, 1311 + unsigned int)) 1312 + { 1313 + struct socket *sock; 1314 + int ret; 1315 + 1316 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 1317 + return -EINVAL; 1318 + 1319 + sock = sock_from_file(req->file, &ret); 1320 + if (sock) { 1321 + struct user_msghdr __user *msg; 1322 + unsigned flags; 1323 + 1324 + flags = READ_ONCE(sqe->msg_flags); 1325 + if (flags & MSG_DONTWAIT) 1326 + req->flags |= REQ_F_NOWAIT; 1327 + else if (force_nonblock) 1328 + flags |= MSG_DONTWAIT; 1329 + 1330 + msg = (struct user_msghdr __user *) (unsigned long) 1331 + READ_ONCE(sqe->addr); 1332 + 1333 + ret = fn(sock, msg, flags); 1334 + if (force_nonblock && ret == -EAGAIN) 1335 + return ret; 1336 + } 1337 + 1338 + io_cqring_add_event(req->ctx, sqe->user_data, ret); 1339 + io_put_req(req); 1340 + return 0; 1341 + } 1342 + #endif 1343 + 1344 + static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1345 + bool force_nonblock) 1346 + { 1347 + #if defined(CONFIG_NET) 1348 + return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock); 1349 + #else 1350 + return -EOPNOTSUPP; 1351 + #endif 1352 + } 1353 + 1354 + static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 
1355 + bool force_nonblock) 1356 + { 1357 + #if defined(CONFIG_NET) 1358 + return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock); 1359 + #else 1360 + return -EOPNOTSUPP; 1361 + #endif 1389 1362 } 1390 1363 1391 1364 static void io_poll_remove_one(struct io_kiocb *req) ··· 1694 1549 { 1695 1550 int ret, opcode; 1696 1551 1552 + req->user_data = READ_ONCE(s->sqe->user_data); 1553 + 1697 1554 if (unlikely(s->index >= ctx->sq_entries)) 1698 1555 return -EINVAL; 1699 - req->user_data = READ_ONCE(s->sqe->user_data); 1700 1556 1701 1557 opcode = READ_ONCE(s->sqe->opcode); 1702 1558 switch (opcode) { ··· 1732 1586 case IORING_OP_SYNC_FILE_RANGE: 1733 1587 ret = io_sync_file_range(req, s->sqe, force_nonblock); 1734 1588 break; 1589 + case IORING_OP_SENDMSG: 1590 + ret = io_sendmsg(req, s->sqe, force_nonblock); 1591 + break; 1592 + case IORING_OP_RECVMSG: 1593 + ret = io_recvmsg(req, s->sqe, force_nonblock); 1594 + break; 1735 1595 default: 1736 1596 ret = -EINVAL; 1737 1597 break; ··· 1747 1595 return ret; 1748 1596 1749 1597 if (ctx->flags & IORING_SETUP_IOPOLL) { 1750 - if (req->error == -EAGAIN) 1598 + if (req->result == -EAGAIN) 1751 1599 return -EAGAIN; 1752 1600 1753 1601 /* workqueue context doesn't hold uring_lock, grab it now */ ··· 1971 1819 return 0; 1972 1820 } 1973 1821 1974 - static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, 1975 - struct io_submit_state *state) 1822 + static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 1823 + struct sqe_submit *s) 1976 1824 { 1977 - struct io_kiocb *req; 1978 1825 int ret; 1979 - 1980 - /* enforce forwards compatibility on users */ 1981 - if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN))) 1982 - return -EINVAL; 1983 - 1984 - req = io_get_req(ctx, state); 1985 - if (unlikely(!req)) 1986 - return -EAGAIN; 1987 - 1988 - ret = io_req_set_file(ctx, s, state, req); 1989 - if (unlikely(ret)) 1990 - goto out; 1991 - 1992 - ret = io_req_defer(ctx, req, s->sqe); 
1993 - if (ret) { 1994 - if (ret == -EIOCBQUEUED) 1995 - ret = 0; 1996 - return ret; 1997 - } 1998 1826 1999 1827 ret = __io_submit_sqe(ctx, req, s, true); 2000 1828 if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { ··· 1998 1866 1999 1867 /* 2000 1868 * Queued up for async execution, worker will release 2001 - * submit reference when the iocb is actually 2002 - * submitted. 1869 + * submit reference when the iocb is actually submitted. 2003 1870 */ 2004 1871 return 0; 2005 1872 } 2006 1873 } 2007 1874 2008 - out: 2009 1875 /* drop submission reference */ 2010 1876 io_put_req(req); 2011 1877 2012 1878 /* and drop final reference, if we failed */ 2013 - if (ret) 1879 + if (ret) { 1880 + io_cqring_add_event(ctx, req->user_data, ret); 1881 + if (req->flags & REQ_F_LINK) 1882 + req->flags |= REQ_F_FAIL_LINK; 2014 1883 io_put_req(req); 1884 + } 2015 1885 2016 1886 return ret; 1887 + } 1888 + 1889 + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) 1890 + 1891 + static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, 1892 + struct io_submit_state *state, struct io_kiocb **link) 1893 + { 1894 + struct io_uring_sqe *sqe_copy; 1895 + struct io_kiocb *req; 1896 + int ret; 1897 + 1898 + /* enforce forwards compatibility on users */ 1899 + if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { 1900 + ret = -EINVAL; 1901 + goto err; 1902 + } 1903 + 1904 + req = io_get_req(ctx, state); 1905 + if (unlikely(!req)) { 1906 + ret = -EAGAIN; 1907 + goto err; 1908 + } 1909 + 1910 + ret = io_req_set_file(ctx, s, state, req); 1911 + if (unlikely(ret)) { 1912 + err_req: 1913 + io_free_req(req); 1914 + err: 1915 + io_cqring_add_event(ctx, s->sqe->user_data, ret); 1916 + return; 1917 + } 1918 + 1919 + ret = io_req_defer(ctx, req, s->sqe); 1920 + if (ret) { 1921 + if (ret != -EIOCBQUEUED) 1922 + goto err_req; 1923 + return; 1924 + } 1925 + 1926 + /* 1927 + * If we already have a head request, queue this one for async 1928 + * submittal once the head 
completes. If we don't have a head but 1929 + * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be 1930 + * submitted sync once the chain is complete. If none of those 1931 + * conditions are true (normal request), then just queue it. 1932 + */ 1933 + if (*link) { 1934 + struct io_kiocb *prev = *link; 1935 + 1936 + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); 1937 + if (!sqe_copy) { 1938 + ret = -EAGAIN; 1939 + goto err_req; 1940 + } 1941 + 1942 + s->sqe = sqe_copy; 1943 + memcpy(&req->submit, s, sizeof(*s)); 1944 + list_add_tail(&req->list, &prev->link_list); 1945 + } else if (s->sqe->flags & IOSQE_IO_LINK) { 1946 + req->flags |= REQ_F_LINK; 1947 + 1948 + memcpy(&req->submit, s, sizeof(*s)); 1949 + INIT_LIST_HEAD(&req->link_list); 1950 + *link = req; 1951 + } else { 1952 + io_queue_sqe(ctx, req, s); 1953 + } 2017 1954 } 2018 1955 2019 1956 /* ··· 2167 1966 unsigned int nr, bool has_user, bool mm_fault) 2168 1967 { 2169 1968 struct io_submit_state state, *statep = NULL; 2170 - int ret, i, submitted = 0; 1969 + struct io_kiocb *link = NULL; 1970 + bool prev_was_link = false; 1971 + int i, submitted = 0; 2171 1972 2172 1973 if (nr > IO_PLUG_THRESHOLD) { 2173 1974 io_submit_state_start(&state, ctx, nr); ··· 2177 1974 } 2178 1975 2179 1976 for (i = 0; i < nr; i++) { 1977 + /* 1978 + * If previous wasn't linked and we have a linked command, 1979 + * that's the end of the chain. Submit the previous link. 
1980 + */ 1981 + if (!prev_was_link && link) { 1982 + io_queue_sqe(ctx, link, &link->submit); 1983 + link = NULL; 1984 + } 1985 + prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; 1986 + 2180 1987 if (unlikely(mm_fault)) { 2181 - ret = -EFAULT; 1988 + io_cqring_add_event(ctx, sqes[i].sqe->user_data, 1989 + -EFAULT); 2182 1990 } else { 2183 1991 sqes[i].has_user = has_user; 2184 1992 sqes[i].needs_lock = true; 2185 1993 sqes[i].needs_fixed_file = true; 2186 - ret = io_submit_sqe(ctx, &sqes[i], statep); 2187 - } 2188 - if (!ret) { 1994 + io_submit_sqe(ctx, &sqes[i], statep, &link); 2189 1995 submitted++; 2190 - continue; 2191 1996 } 2192 - 2193 - io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret); 2194 1997 } 2195 1998 1999 + if (link) 2000 + io_queue_sqe(ctx, link, &link->submit); 2196 2001 if (statep) 2197 2002 io_submit_state_end(&state); 2198 2003 ··· 2216 2005 DEFINE_WAIT(wait); 2217 2006 unsigned inflight; 2218 2007 unsigned long timeout; 2008 + 2009 + complete(&ctx->sqo_thread_started); 2219 2010 2220 2011 old_fs = get_fs(); 2221 2012 set_fs(USER_DS); ··· 2343 2130 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) 2344 2131 { 2345 2132 struct io_submit_state state, *statep = NULL; 2133 + struct io_kiocb *link = NULL; 2134 + bool prev_was_link = false; 2346 2135 int i, submit = 0; 2347 2136 2348 2137 if (to_submit > IO_PLUG_THRESHOLD) { ··· 2354 2139 2355 2140 for (i = 0; i < to_submit; i++) { 2356 2141 struct sqe_submit s; 2357 - int ret; 2358 2142 2359 2143 if (!io_get_sqring(ctx, &s)) 2360 2144 break; 2145 + 2146 + /* 2147 + * If previous wasn't linked and we have a linked command, 2148 + * that's the end of the chain. Submit the previous link. 
2149 + */ 2150 + if (!prev_was_link && link) { 2151 + io_queue_sqe(ctx, link, &link->submit); 2152 + link = NULL; 2153 + } 2154 + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; 2361 2155 2362 2156 s.has_user = true; 2363 2157 s.needs_lock = false; 2364 2158 s.needs_fixed_file = false; 2365 2159 submit++; 2366 - 2367 - ret = io_submit_sqe(ctx, &s, statep); 2368 - if (ret) 2369 - io_cqring_add_event(ctx, s.sqe->user_data, ret); 2160 + io_submit_sqe(ctx, &s, statep, &link); 2370 2161 } 2371 2162 io_commit_sqring(ctx); 2372 2163 2164 + if (link) 2165 + io_queue_sqe(ctx, link, &link->submit); 2373 2166 if (statep) 2374 2167 io_submit_state_end(statep); 2375 2168 ··· 2463 2240 static void io_sq_thread_stop(struct io_ring_ctx *ctx) 2464 2241 { 2465 2242 if (ctx->sqo_thread) { 2243 + wait_for_completion(&ctx->sqo_thread_started); 2466 2244 /* 2467 2245 * The park is a bit of a work-around, without it we get 2468 2246 * warning spews on shutdown with SQPOLL set and affinity
+4 -4
fs/splice.c
··· 1356 1356 struct iovec iovstack[UIO_FASTIOV]; 1357 1357 struct iovec *iov = iovstack; 1358 1358 struct iov_iter iter; 1359 - long error; 1359 + ssize_t error; 1360 1360 struct fd f; 1361 1361 int type; 1362 1362 ··· 1367 1367 1368 1368 error = import_iovec(type, uiov, nr_segs, 1369 1369 ARRAY_SIZE(iovstack), &iov, &iter); 1370 - if (!error) { 1370 + if (error >= 0) { 1371 1371 error = do_vmsplice(f.file, &iter, flags); 1372 1372 kfree(iov); 1373 1373 } ··· 1382 1382 struct iovec iovstack[UIO_FASTIOV]; 1383 1383 struct iovec *iov = iovstack; 1384 1384 struct iov_iter iter; 1385 - long error; 1385 + ssize_t error; 1386 1386 struct fd f; 1387 1387 int type; 1388 1388 ··· 1393 1393 1394 1394 error = compat_import_iovec(type, iov32, nr_segs, 1395 1395 ARRAY_SIZE(iovstack), &iov, &iter); 1396 - if (!error) { 1396 + if (error >= 0) { 1397 1397 error = do_vmsplice(f.file, &iter, flags); 1398 1398 kfree(iov); 1399 1399 }
+7
include/linux/socket.h
··· 12 12 13 13 struct pid; 14 14 struct cred; 15 + struct socket; 15 16 16 17 #define __sockaddr_check_size(size) \ 17 18 BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) ··· 375 374 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, 376 375 unsigned int vlen, unsigned int flags, 377 376 bool forbid_cmsg_compat); 377 + extern long __sys_sendmsg_sock(struct socket *sock, 378 + struct user_msghdr __user *msg, 379 + unsigned int flags); 380 + extern long __sys_recvmsg_sock(struct socket *sock, 381 + struct user_msghdr __user *msg, 382 + unsigned int flags); 378 383 379 384 /* helpers which do the actual work for syscalls */ 380 385 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
+2 -2
include/linux/uio.h
··· 267 267 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, 268 268 struct iov_iter *i); 269 269 270 - int import_iovec(int type, const struct iovec __user * uvector, 270 + ssize_t import_iovec(int type, const struct iovec __user * uvector, 271 271 unsigned nr_segs, unsigned fast_segs, 272 272 struct iovec **iov, struct iov_iter *i); 273 273 274 274 #ifdef CONFIG_COMPAT 275 275 struct compat_iovec; 276 - int compat_import_iovec(int type, const struct compat_iovec __user * uvector, 276 + ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector, 277 277 unsigned nr_segs, unsigned fast_segs, 278 278 struct iovec **iov, struct iov_iter *i); 279 279 #endif
+4
include/uapi/linux/io_uring.h
··· 27 27 __u32 fsync_flags; 28 28 __u16 poll_events; 29 29 __u32 sync_range_flags; 30 + __u32 msg_flags; 30 31 }; 31 32 __u64 user_data; /* data to be passed back at completion time */ 32 33 union { ··· 41 40 */ 42 41 #define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ 43 42 #define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ 43 + #define IOSQE_IO_LINK (1U << 2) /* links next sqe */ 44 44 45 45 /* 46 46 * io_uring_setup() flags ··· 59 57 #define IORING_OP_POLL_ADD 6 60 58 #define IORING_OP_POLL_REMOVE 7 61 59 #define IORING_OP_SYNC_FILE_RANGE 8 60 + #define IORING_OP_SENDMSG 9 61 + #define IORING_OP_RECVMSG 10 62 62 63 63 /* 64 64 * sqe->fsync_flags
+8 -7
lib/iov_iter.c
··· 1634 1634 * on-stack array was used or not (and regardless of whether this function 1635 1635 * returns an error or not). 1636 1636 * 1637 - * Return: 0 on success or negative error code on error. 1637 + * Return: Negative error code on error, bytes imported on success 1638 1638 */ 1639 - int import_iovec(int type, const struct iovec __user * uvector, 1639 + ssize_t import_iovec(int type, const struct iovec __user * uvector, 1640 1640 unsigned nr_segs, unsigned fast_segs, 1641 1641 struct iovec **iov, struct iov_iter *i) 1642 1642 { ··· 1652 1652 } 1653 1653 iov_iter_init(i, type, p, nr_segs, n); 1654 1654 *iov = p == *iov ? NULL : p; 1655 - return 0; 1655 + return n; 1656 1656 } 1657 1657 EXPORT_SYMBOL(import_iovec); 1658 1658 1659 1659 #ifdef CONFIG_COMPAT 1660 1660 #include <linux/compat.h> 1661 1661 1662 - int compat_import_iovec(int type, const struct compat_iovec __user * uvector, 1663 - unsigned nr_segs, unsigned fast_segs, 1664 - struct iovec **iov, struct iov_iter *i) 1662 + ssize_t compat_import_iovec(int type, 1663 + const struct compat_iovec __user * uvector, 1664 + unsigned nr_segs, unsigned fast_segs, 1665 + struct iovec **iov, struct iov_iter *i) 1665 1666 { 1666 1667 ssize_t n; 1667 1668 struct iovec *p; ··· 1676 1675 } 1677 1676 iov_iter_init(i, type, p, nr_segs, n); 1678 1677 *iov = p == *iov ? NULL : p; 1679 - return 0; 1678 + return n; 1680 1679 } 1681 1680 #endif 1682 1681
+2 -1
net/compat.c
··· 80 80 81 81 kmsg->msg_iocb = NULL; 82 82 83 - return compat_import_iovec(save_addr ? READ : WRITE, 83 + err = compat_import_iovec(save_addr ? READ : WRITE, 84 84 compat_ptr(msg.msg_iov), msg.msg_iovlen, 85 85 UIO_FASTIOV, iov, &kmsg->msg_iter); 86 + return err < 0 ? err : 0; 86 87 } 87 88 88 89 /* Bleech... */
+17 -1
net/socket.c
··· 2222 2222 2223 2223 kmsg->msg_iocb = NULL; 2224 2224 2225 - return import_iovec(save_addr ? READ : WRITE, 2225 + err = import_iovec(save_addr ? READ : WRITE, 2226 2226 msg.msg_iov, msg.msg_iovlen, 2227 2227 UIO_FASTIOV, iov, &kmsg->msg_iter); 2228 + return err < 0 ? err : 0; 2228 2229 } 2229 2230 2230 2231 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, ··· 2327 2326 /* 2328 2327 * BSD sendmsg interface 2329 2328 */ 2329 + long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg, 2330 + unsigned int flags) 2331 + { 2332 + struct msghdr msg_sys; 2333 + 2334 + return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); 2335 + } 2330 2336 2331 2337 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, 2332 2338 bool forbid_cmsg_compat) ··· 2507 2499 /* 2508 2500 * BSD recvmsg interface 2509 2501 */ 2502 + 2503 + long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg, 2504 + unsigned int flags) 2505 + { 2506 + struct msghdr msg_sys; 2507 + 2508 + return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); 2509 + } 2510 2510 2511 2511 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, 2512 2512 bool forbid_cmsg_compat)