Merge branch 'for-5.19/io_uring-socket' into for-5.19/io_uring-net

+29

fs/internal.h

··· 191 191 struct pipe_inode_info *opipe, 192 192 loff_t *offset, 193 193 size_t len, unsigned int flags); 194 + 195 + /* 196 + * fs/xattr.c: 197 + */ 198 + struct xattr_name { 199 + char name[XATTR_NAME_MAX + 1]; 200 + }; 201 + 202 + struct xattr_ctx { 203 + /* Value of attribute */ 204 + union { 205 + const void __user *cvalue; 206 + void __user *value; 207 + }; 208 + void *kvalue; 209 + size_t size; 210 + /* Attribute name */ 211 + struct xattr_name *kname; 212 + unsigned int flags; 213 + }; 214 + 215 + 216 + ssize_t do_getxattr(struct user_namespace *mnt_userns, 217 + struct dentry *d, 218 + struct xattr_ctx *ctx); 219 + 220 + int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); 221 + int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, 222 + struct xattr_ctx *ctx);

+1

fs/io-wq.h

··· 155 155 struct io_wq_work { 156 156 struct io_wq_work_node list; 157 157 unsigned flags; 158 + int cancel_seq; 158 159 }; 159 160 160 161 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)

+1348 -677

fs/io_uring.c

··· 80 80 #include <linux/io_uring.h> 81 81 #include <linux/audit.h> 82 82 #include <linux/security.h> 83 + #include <linux/xattr.h> 83 84 84 85 #define CREATE_TRACE_POINTS 85 86 #include <trace/events/io_uring.h> ··· 113 112 114 113 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 115 114 REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) 115 + 116 + #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ 117 + IO_REQ_CLEAN_FLAGS) 116 118 117 119 #define IO_TCTX_REFS_CACHE_NR (1U << 10) 118 120 ··· 223 219 struct io_uring_cqe cqe; 224 220 struct list_head list; 225 221 }; 222 + 223 + /* 224 + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 225 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we 226 + * can't safely always dereference the file when the task has exited and ring 227 + * cleanup is done. If a file is tracked and part of SCM, then unix gc on 228 + * process exit may reap it before __io_sqe_files_unregister() is run. 229 + */ 230 + #define FFS_NOWAIT 0x1UL 231 + #define FFS_ISREG 0x2UL 232 + #if defined(CONFIG_64BIT) 233 + #define FFS_SCM 0x4UL 234 + #else 235 + #define IO_URING_SCM_ALL 236 + #define FFS_SCM 0x0UL 237 + #endif 238 + #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) 226 239 227 240 struct io_fixed_file { 228 241 /* file * with additional FFS_* flags */ ··· 374 353 unsigned int drain_active: 1; 375 354 unsigned int drain_disabled: 1; 376 355 unsigned int has_evfd: 1; 356 + unsigned int syscall_iopoll: 1; 377 357 } ____cacheline_aligned_in_smp; 378 358 379 359 /* submission data */ ··· 404 382 */ 405 383 struct io_rsrc_node *rsrc_node; 406 384 int rsrc_cached_refs; 385 + atomic_t cancel_seq; 407 386 struct io_file_table file_table; 408 387 unsigned nr_user_files; 409 388 unsigned nr_user_bufs; ··· 432 409 struct wait_queue_head sqo_sq_wait; 433 410 struct list_head sqd_list; 434 411 435 - unsigned long check_cq_overflow; 412 + unsigned long check_cq; 436 413 437 414 struct { 415 + /* 416 + * We cache a range of free CQEs we can use, once exhausted it 417 + * should go through a slower range setup, see __io_get_cqe() 418 + */ 419 + struct io_uring_cqe *cqe_cached; 420 + struct io_uring_cqe *cqe_sentinel; 421 + 438 422 unsigned cached_cq_tail; 439 423 unsigned cq_entries; 440 424 struct io_ev_fd __rcu *io_ev_fd; ··· 576 546 unsigned long nofile; 577 547 }; 578 548 549 + struct io_socket { 550 + struct file *file; 551 + int domain; 552 + int type; 553 + int protocol; 554 + int flags; 555 + u32 file_slot; 556 + unsigned long nofile; 557 + }; 558 + 579 559 struct io_sync { 580 560 struct file *file; 581 561 loff_t len; ··· 597 557 struct io_cancel { 598 558 struct file *file; 599 559 u64 addr; 560 + u32 flags; 561 + s32 fd; 600 562 }; 601 563 602 564 struct io_timeout { ··· 790 748 struct wait_page_queue wpq; 791 749 }; 792 750 751 + struct io_xattr { 752 + struct file *file; 753 + struct xattr_ctx ctx; 754 + struct filename *filename; 755 + }; 756 + 793 757 enum { 794 758 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 795 759 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, ··· 910 862 IORING_RSRC_BUFFER = 1, 911 863 }; 912 864 865 + struct io_cqe { 866 + __u64 user_data; 867 + __s32 res; 868 + /* fd initially, then cflags for completion */ 869 + union { 870 + __u32 flags; 871 + int fd; 872 + }; 873 + }; 874 + 875 + enum { 876 + IO_CHECK_CQ_OVERFLOW_BIT, 877 + IO_CHECK_CQ_DROPPED_BIT, 878 + }; 879 + 913 880 /* 914 881 * NOTE! Each of the iocb union members has the file pointer 915 882 * as the first entry in their struct definition. So you can ··· 960 897 struct io_symlink symlink; 961 898 struct io_hardlink hardlink; 962 899 struct io_msg msg; 900 + struct io_xattr xattr; 901 + struct io_socket sock; 963 902 }; 964 903 965 904 u8 opcode; ··· 970 905 u16 buf_index; 971 906 unsigned int flags; 972 907 973 - u64 user_data; 974 - u32 result; 975 - /* fd initially, then cflags for completion */ 976 - union { 977 - u32 cflags; 978 - int fd; 979 - }; 908 + struct io_cqe cqe; 980 909 981 910 struct io_ring_ctx *ctx; 982 911 struct task_struct *task; 983 912 984 - struct percpu_ref *fixed_rsrc_refs; 913 + struct io_rsrc_node *rsrc_node; 985 914 /* store used ubuf, so we can prevent reloading */ 986 915 struct io_mapped_ubuf *imu; 987 916 ··· 1013 954 struct list_head list; 1014 955 struct io_kiocb *req; 1015 956 u32 seq; 957 + }; 958 + 959 + struct io_cancel_data { 960 + struct io_ring_ctx *ctx; 961 + union { 962 + u64 data; 963 + struct file *file; 964 + }; 965 + u32 flags; 966 + int seq; 1016 967 }; 1017 968 1018 969 struct io_op_def { ··· 1230 1161 [IORING_OP_MSG_RING] = { 1231 1162 .needs_file = 1, 1232 1163 }, 1164 + [IORING_OP_FSETXATTR] = { 1165 + .needs_file = 1 1166 + }, 1167 + [IORING_OP_SETXATTR] = {}, 1168 + [IORING_OP_FGETXATTR] = { 1169 + .needs_file = 1 1170 + }, 1171 + [IORING_OP_GETXATTR] = {}, 1172 + [IORING_OP_SOCKET] = { 1173 + .audit_skip = 1, 1174 + }, 1233 1175 }; 1234 1176 1235 1177 /* requests with any of those set should undergo io_disarm_next() */ 1236 1178 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1179 + #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 1237 1180 1238 1181 static bool io_disarm_next(struct io_kiocb *req); 1239 1182 static void io_uring_del_tctx_node(unsigned long index); ··· 1254 1173 bool cancel_all); 1255 1174 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1256 1175 1257 - static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); 1258 - 1259 - static void io_put_req(struct io_kiocb *req); 1260 - static void io_put_req_deferred(struct io_kiocb *req); 1176 + static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); 1261 1177 static void io_dismantle_req(struct io_kiocb *req); 1262 1178 static void io_queue_linked_timeout(struct io_kiocb *req); 1263 1179 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, ··· 1266 1188 static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); 1267 1189 static void io_drop_inflight_file(struct io_kiocb *req); 1268 1190 static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); 1269 - static void __io_queue_sqe(struct io_kiocb *req); 1191 + static void io_queue_sqe(struct io_kiocb *req); 1270 1192 static void io_rsrc_put_work(struct work_struct *work); 1271 1193 1272 1194 static void io_req_task_queue(struct io_kiocb *req); ··· 1279 1201 1280 1202 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1281 1203 static void io_eventfd_signal(struct io_ring_ctx *ctx); 1204 + static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); 1282 1205 1283 1206 static struct kmem_cache *req_cachep; 1284 1207 1285 1208 static const struct file_operations io_uring_fops; 1209 + 1210 + const char *io_uring_get_opcode(u8 opcode) 1211 + { 1212 + switch ((enum io_uring_op)opcode) { 1213 + case IORING_OP_NOP: 1214 + return "NOP"; 1215 + case IORING_OP_READV: 1216 + return "READV"; 1217 + case IORING_OP_WRITEV: 1218 + return "WRITEV"; 1219 + case IORING_OP_FSYNC: 1220 + return "FSYNC"; 1221 + case IORING_OP_READ_FIXED: 1222 + return "READ_FIXED"; 1223 + case IORING_OP_WRITE_FIXED: 1224 + return "WRITE_FIXED"; 1225 + case IORING_OP_POLL_ADD: 1226 + return "POLL_ADD"; 1227 + case IORING_OP_POLL_REMOVE: 1228 + return "POLL_REMOVE"; 1229 + case IORING_OP_SYNC_FILE_RANGE: 1230 + return "SYNC_FILE_RANGE"; 1231 + case IORING_OP_SENDMSG: 1232 + return "SENDMSG"; 1233 + case IORING_OP_RECVMSG: 1234 + return "RECVMSG"; 1235 + case IORING_OP_TIMEOUT: 1236 + return "TIMEOUT"; 1237 + case IORING_OP_TIMEOUT_REMOVE: 1238 + return "TIMEOUT_REMOVE"; 1239 + case IORING_OP_ACCEPT: 1240 + return "ACCEPT"; 1241 + case IORING_OP_ASYNC_CANCEL: 1242 + return "ASYNC_CANCEL"; 1243 + case IORING_OP_LINK_TIMEOUT: 1244 + return "LINK_TIMEOUT"; 1245 + case IORING_OP_CONNECT: 1246 + return "CONNECT"; 1247 + case IORING_OP_FALLOCATE: 1248 + return "FALLOCATE"; 1249 + case IORING_OP_OPENAT: 1250 + return "OPENAT"; 1251 + case IORING_OP_CLOSE: 1252 + return "CLOSE"; 1253 + case IORING_OP_FILES_UPDATE: 1254 + return "FILES_UPDATE"; 1255 + case IORING_OP_STATX: 1256 + return "STATX"; 1257 + case IORING_OP_READ: 1258 + return "READ"; 1259 + case IORING_OP_WRITE: 1260 + return "WRITE"; 1261 + case IORING_OP_FADVISE: 1262 + return "FADVISE"; 1263 + case IORING_OP_MADVISE: 1264 + return "MADVISE"; 1265 + case IORING_OP_SEND: 1266 + return "SEND"; 1267 + case IORING_OP_RECV: 1268 + return "RECV"; 1269 + case IORING_OP_OPENAT2: 1270 + return "OPENAT2"; 1271 + case IORING_OP_EPOLL_CTL: 1272 + return "EPOLL_CTL"; 1273 + case IORING_OP_SPLICE: 1274 + return "SPLICE"; 1275 + case IORING_OP_PROVIDE_BUFFERS: 1276 + return "PROVIDE_BUFFERS"; 1277 + case IORING_OP_REMOVE_BUFFERS: 1278 + return "REMOVE_BUFFERS"; 1279 + case IORING_OP_TEE: 1280 + return "TEE"; 1281 + case IORING_OP_SHUTDOWN: 1282 + return "SHUTDOWN"; 1283 + case IORING_OP_RENAMEAT: 1284 + return "RENAMEAT"; 1285 + case IORING_OP_UNLINKAT: 1286 + return "UNLINKAT"; 1287 + case IORING_OP_MKDIRAT: 1288 + return "MKDIRAT"; 1289 + case IORING_OP_SYMLINKAT: 1290 + return "SYMLINKAT"; 1291 + case IORING_OP_LINKAT: 1292 + return "LINKAT"; 1293 + case IORING_OP_MSG_RING: 1294 + return "MSG_RING"; 1295 + case IORING_OP_FSETXATTR: 1296 + return "FSETXATTR"; 1297 + case IORING_OP_SETXATTR: 1298 + return "SETXATTR"; 1299 + case IORING_OP_FGETXATTR: 1300 + return "FGETXATTR"; 1301 + case IORING_OP_GETXATTR: 1302 + return "GETXATTR"; 1303 + case IORING_OP_SOCKET: 1304 + return "SOCKET"; 1305 + case IORING_OP_LAST: 1306 + return "INVALID"; 1307 + } 1308 + return "INVALID"; 1309 + } 1286 1310 1287 1311 struct sock *io_uring_get_socket(struct file *file) 1288 1312 { ··· 1398 1218 return NULL; 1399 1219 } 1400 1220 EXPORT_SYMBOL(io_uring_get_socket); 1221 + 1222 + #if defined(CONFIG_UNIX) 1223 + static inline bool io_file_need_scm(struct file *filp) 1224 + { 1225 + #if defined(IO_URING_SCM_ALL) 1226 + return true; 1227 + #else 1228 + return !!unix_get_socket(filp); 1229 + #endif 1230 + } 1231 + #else 1232 + static inline bool io_file_need_scm(struct file *filp) 1233 + { 1234 + return false; 1235 + } 1236 + #endif 1237 + 1238 + static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) 1239 + { 1240 + lockdep_assert_held(&ctx->uring_lock); 1241 + if (issue_flags & IO_URING_F_UNLOCKED) 1242 + mutex_unlock(&ctx->uring_lock); 1243 + } 1244 + 1245 + static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) 1246 + { 1247 + /* 1248 + * "Normal" inline submissions always hold the uring_lock, since we 1249 + * grab it from the system call. Same is true for the SQPOLL offload. 1250 + * The only exception is when we've detached the request and issue it 1251 + * from an async worker thread, grab the lock for that case. 1252 + */ 1253 + if (issue_flags & IO_URING_F_UNLOCKED) 1254 + mutex_lock(&ctx->uring_lock); 1255 + lockdep_assert_held(&ctx->uring_lock); 1256 + } 1401 1257 1402 1258 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1403 1259 { ··· 1496 1280 1497 1281 #define IO_RSRC_REF_BATCH 100 1498 1282 1283 + static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) 1284 + { 1285 + percpu_ref_put_many(&node->refs, nr); 1286 + } 1287 + 1499 1288 static inline void io_req_put_rsrc_locked(struct io_kiocb *req, 1500 1289 struct io_ring_ctx *ctx) 1501 1290 __must_hold(&ctx->uring_lock) 1502 1291 { 1503 - struct percpu_ref *ref = req->fixed_rsrc_refs; 1292 + struct io_rsrc_node *node = req->rsrc_node; 1504 1293 1505 - if (ref) { 1506 - if (ref == &ctx->rsrc_node->refs) 1294 + if (node) { 1295 + if (node == ctx->rsrc_node) 1507 1296 ctx->rsrc_cached_refs++; 1508 1297 else 1509 - percpu_ref_put(ref); 1298 + io_rsrc_put_node(node, 1); 1510 1299 } 1511 1300 } 1512 1301 1513 - static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) 1302 + static inline void io_req_put_rsrc(struct io_kiocb *req) 1514 1303 { 1515 - if (req->fixed_rsrc_refs) 1516 - percpu_ref_put(req->fixed_rsrc_refs); 1304 + if (req->rsrc_node) 1305 + io_rsrc_put_node(req->rsrc_node, 1); 1517 1306 } 1518 1307 1519 1308 static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) 1520 1309 __must_hold(&ctx->uring_lock) 1521 1310 { 1522 1311 if (ctx->rsrc_cached_refs) { 1523 - percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); 1312 + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); 1524 1313 ctx->rsrc_cached_refs = 0; 1525 1314 } 1526 1315 } ··· 1541 1320 struct io_ring_ctx *ctx, 1542 1321 unsigned int issue_flags) 1543 1322 { 1544 - if (!req->fixed_rsrc_refs) { 1545 - req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1323 + if (!req->rsrc_node) { 1324 + req->rsrc_node = ctx->rsrc_node; 1546 1325 1547 1326 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1548 1327 lockdep_assert_held(&ctx->uring_lock); ··· 1550 1329 if (unlikely(ctx->rsrc_cached_refs < 0)) 1551 1330 io_rsrc_refs_refill(ctx); 1552 1331 } else { 1553 - percpu_ref_get(req->fixed_rsrc_refs); 1332 + percpu_ref_get(&req->rsrc_node->refs); 1554 1333 } 1555 1334 } 1556 1335 } ··· 1637 1416 if (req->flags & REQ_F_PARTIAL_IO) 1638 1417 return; 1639 1418 1640 - if (issue_flags & IO_URING_F_UNLOCKED) 1641 - mutex_lock(&ctx->uring_lock); 1642 - 1643 - lockdep_assert_held(&ctx->uring_lock); 1419 + io_ring_submit_lock(ctx, issue_flags); 1644 1420 1645 1421 buf = req->kbuf; 1646 1422 bl = io_buffer_get_list(ctx, buf->bgid); ··· 1645 1427 req->flags &= ~REQ_F_BUFFER_SELECTED; 1646 1428 req->kbuf = NULL; 1647 1429 1648 - if (issue_flags & IO_URING_F_UNLOCKED) 1649 - mutex_unlock(&ctx->uring_lock); 1430 + io_ring_submit_unlock(ctx, issue_flags); 1650 1431 } 1651 1432 1652 1433 static bool io_match_task(struct io_kiocb *head, struct task_struct *task, ··· 1686 1469 static inline void req_fail_link_node(struct io_kiocb *req, int res) 1687 1470 { 1688 1471 req_set_fail(req); 1689 - req->result = res; 1472 + req->cqe.res = res; 1473 + } 1474 + 1475 + static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) 1476 + { 1477 + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 1690 1478 } 1691 1479 1692 1480 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) ··· 1821 1599 return false; 1822 1600 } 1823 1601 1824 - #define FFS_NOWAIT 0x1UL 1825 - #define FFS_ISREG 0x2UL 1826 - #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) 1827 - 1828 1602 static inline bool io_req_ffs_set(struct io_kiocb *req) 1829 1603 { 1830 1604 return req->flags & REQ_F_FIXED_FILE; ··· 1847 1629 return __io_prep_linked_timeout(req); 1848 1630 } 1849 1631 1632 + static noinline void __io_arm_ltimeout(struct io_kiocb *req) 1633 + { 1634 + io_queue_linked_timeout(__io_prep_linked_timeout(req)); 1635 + } 1636 + 1637 + static inline void io_arm_ltimeout(struct io_kiocb *req) 1638 + { 1639 + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) 1640 + __io_arm_ltimeout(req); 1641 + } 1642 + 1850 1643 static void io_prep_async_work(struct io_kiocb *req) 1851 1644 { 1852 1645 const struct io_op_def *def = &io_op_defs[req->opcode]; ··· 1870 1641 1871 1642 req->work.list.next = NULL; 1872 1643 req->work.flags = 0; 1644 + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); 1873 1645 if (req->flags & REQ_F_FORCE_ASYNC) 1874 1646 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1875 1647 ··· 1902 1672 1903 1673 static inline void io_req_add_compl_list(struct io_kiocb *req) 1904 1674 { 1905 - struct io_ring_ctx *ctx = req->ctx; 1906 - struct io_submit_state *state = &ctx->submit_state; 1675 + struct io_submit_state *state = &req->ctx->submit_state; 1907 1676 1908 1677 if (!(req->flags & REQ_F_CQE_SKIP)) 1909 - ctx->submit_state.flush_cqes = true; 1678 + state->flush_cqes = true; 1910 1679 wq_list_add_tail(&req->comp_list, &state->compl_reqs); 1911 1680 } 1912 1681 1913 - static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) 1682 + static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) 1914 1683 { 1915 - struct io_ring_ctx *ctx = req->ctx; 1916 1684 struct io_kiocb *link = io_prep_linked_timeout(req); 1917 1685 struct io_uring_task *tctx = req->task->io_uring; 1918 1686 ··· 1930 1702 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1931 1703 req->work.flags |= IO_WQ_WORK_CANCEL; 1932 1704 1933 - trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, 1934 - &req->work, io_wq_is_hashed(&req->work)); 1705 + trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, 1706 + req->opcode, req->flags, &req->work, 1707 + io_wq_is_hashed(&req->work)); 1935 1708 io_wq_enqueue(tctx->io_wq, &req->work); 1936 1709 if (link) 1937 1710 io_queue_linked_timeout(link); ··· 1950 1721 atomic_set(&req->ctx->cq_timeouts, 1951 1722 atomic_read(&req->ctx->cq_timeouts) + 1); 1952 1723 list_del_init(&req->timeout.list); 1953 - io_fill_cqe_req(req, status, 0); 1954 - io_put_req_deferred(req); 1724 + io_req_tw_post_queue(req, status, 0); 1955 1725 } 1956 1726 } 1957 1727 ··· 2032 1804 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 2033 1805 } 2034 1806 2035 - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1807 + /* 1808 + * writes to the cq entry need to come after reading head; the 1809 + * control dependency is enough as we're using WRITE_ONCE to 1810 + * fill the cq entry 1811 + */ 1812 + static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) 2036 1813 { 2037 1814 struct io_rings *rings = ctx->rings; 2038 - unsigned tail, mask = ctx->cq_entries - 1; 1815 + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); 1816 + unsigned int free, queued, len; 2039 1817 2040 - /* 2041 - * writes to the cq entry need to come after reading head; the 2042 - * control dependency is enough as we're using WRITE_ONCE to 2043 - * fill the cq entry 2044 - */ 2045 - if (__io_cqring_events(ctx) == ctx->cq_entries) 1818 + /* userspace may cheat modifying the tail, be safe and do min */ 1819 + queued = min(__io_cqring_events(ctx), ctx->cq_entries); 1820 + free = ctx->cq_entries - queued; 1821 + /* we need a contiguous range, limit based on the current array offset */ 1822 + len = min(free, ctx->cq_entries - off); 1823 + if (!len) 2046 1824 return NULL; 2047 1825 2048 - tail = ctx->cached_cq_tail++; 2049 - return &rings->cqes[tail & mask]; 1826 + ctx->cached_cq_tail++; 1827 + ctx->cqe_cached = &rings->cqes[off]; 1828 + ctx->cqe_sentinel = ctx->cqe_cached + len; 1829 + return ctx->cqe_cached++; 1830 + } 1831 + 1832 + static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1833 + { 1834 + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { 1835 + ctx->cached_cq_tail++; 1836 + return ctx->cqe_cached++; 1837 + } 1838 + return __io_get_cqe(ctx); 2050 1839 } 2051 1840 2052 1841 static void io_eventfd_signal(struct io_ring_ctx *ctx) ··· 2160 1915 2161 1916 all_flushed = list_empty(&ctx->cq_overflow_list); 2162 1917 if (all_flushed) { 2163 - clear_bit(0, &ctx->check_cq_overflow); 1918 + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); 2164 1919 WRITE_ONCE(ctx->rings->sq_flags, 2165 1920 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); 2166 1921 } 2167 1922 2168 - if (posted) 2169 - io_commit_cqring(ctx); 1923 + io_commit_cqring(ctx); 2170 1924 spin_unlock(&ctx->completion_lock); 2171 1925 if (posted) 2172 1926 io_cqring_ev_posted(ctx); ··· 2176 1932 { 2177 1933 bool ret = true; 2178 1934 2179 - if (test_bit(0, &ctx->check_cq_overflow)) { 1935 + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { 2180 1936 /* iopoll syncs against uring_lock, not completion_lock */ 2181 1937 if (ctx->flags & IORING_SETUP_IOPOLL) 2182 1938 mutex_lock(&ctx->uring_lock); ··· 2188 1944 return ret; 2189 1945 } 2190 1946 2191 - /* must to be called somewhat shortly after putting a request */ 2192 - static inline void io_put_task(struct task_struct *task, int nr) 1947 + static void __io_put_task(struct task_struct *task, int nr) 2193 1948 { 2194 1949 struct io_uring_task *tctx = task->io_uring; 2195 1950 2196 - if (likely(task == current)) { 2197 - tctx->cached_refs += nr; 2198 - } else { 2199 - percpu_counter_sub(&tctx->inflight, nr); 2200 - if (unlikely(atomic_read(&tctx->in_idle))) 2201 - wake_up(&tctx->wait); 2202 - put_task_struct_many(task, nr); 2203 - } 1951 + percpu_counter_sub(&tctx->inflight, nr); 1952 + if (unlikely(atomic_read(&tctx->in_idle))) 1953 + wake_up(&tctx->wait); 1954 + put_task_struct_many(task, nr); 1955 + } 1956 + 1957 + /* must to be called somewhat shortly after putting a request */ 1958 + static inline void io_put_task(struct task_struct *task, int nr) 1959 + { 1960 + if (likely(task == current)) 1961 + task->io_uring->cached_refs += nr; 1962 + else 1963 + __io_put_task(task, nr); 2204 1964 } 2205 1965 2206 1966 static void io_task_refs_refill(struct io_uring_task *tctx) ··· 2243 1995 struct io_overflow_cqe *ocqe; 2244 1996 2245 1997 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1998 + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); 2246 1999 if (!ocqe) { 2247 2000 /* 2248 2001 * If we're in ring overflow flush mode, or in task cancel mode, ··· 2251 2002 * on the floor. 2252 2003 */ 2253 2004 io_account_cq_overflow(ctx); 2005 + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); 2254 2006 return false; 2255 2007 } 2256 2008 if (list_empty(&ctx->cq_overflow_list)) { 2257 - set_bit(0, &ctx->check_cq_overflow); 2009 + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); 2258 2010 WRITE_ONCE(ctx->rings->sq_flags, 2259 2011 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 2260 2012 ··· 2287 2037 return io_cqring_event_overflow(ctx, user_data, res, cflags); 2288 2038 } 2289 2039 2290 - static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 2040 + static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, 2041 + struct io_kiocb *req) 2291 2042 { 2292 - trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); 2293 - return __io_fill_cqe(req->ctx, req->user_data, res, cflags); 2043 + struct io_uring_cqe *cqe; 2044 + 2045 + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 2046 + req->cqe.res, req->cqe.flags); 2047 + 2048 + /* 2049 + * If we can't get a cq entry, userspace overflowed the 2050 + * submission (by quite a lot). Increment the overflow count in 2051 + * the ring. 2052 + */ 2053 + cqe = io_get_cqe(ctx); 2054 + if (likely(cqe)) { 2055 + memcpy(cqe, &req->cqe, sizeof(*cqe)); 2056 + return true; 2057 + } 2058 + return io_cqring_event_overflow(ctx, req->cqe.user_data, 2059 + req->cqe.res, req->cqe.flags); 2294 2060 } 2295 2061 2296 - static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 2062 + static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 2297 2063 { 2298 - if (!(req->flags & REQ_F_CQE_SKIP)) 2299 - __io_fill_cqe_req(req, res, cflags); 2064 + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags); 2065 + return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); 2300 2066 } 2301 2067 2302 2068 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, ··· 2335 2069 * free_list cache. 2336 2070 */ 2337 2071 if (req_ref_put_and_test(req)) { 2338 - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 2072 + if (req->flags & IO_REQ_LINK_FLAGS) { 2339 2073 if (req->flags & IO_DISARM_MASK) 2340 2074 io_disarm_next(req); 2341 2075 if (req->link) { ··· 2343 2077 req->link = NULL; 2344 2078 } 2345 2079 } 2346 - io_req_put_rsrc(req, ctx); 2080 + io_req_put_rsrc(req); 2347 2081 /* 2348 2082 * Selected buffer deallocation in io_clean_op() assumes that 2349 2083 * we don't hold ->completion_lock. Clean them here to avoid ··· 2372 2106 static inline void io_req_complete_state(struct io_kiocb *req, s32 res, 2373 2107 u32 cflags) 2374 2108 { 2375 - req->result = res; 2376 - req->cflags = cflags; 2109 + req->cqe.res = res; 2110 + req->cqe.flags = cflags; 2377 2111 req->flags |= REQ_F_COMPLETE_INLINE; 2378 2112 } 2379 2113 ··· 2397 2131 io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); 2398 2132 } 2399 2133 2400 - static void io_req_complete_fail_submit(struct io_kiocb *req) 2401 - { 2402 - /* 2403 - * We don't submit, fail them all, for that replace hardlinks with 2404 - * normal links. Extra REQ_F_LINK is tolerated. 2405 - */ 2406 - req->flags &= ~REQ_F_HARDLINK; 2407 - req->flags |= REQ_F_LINK; 2408 - io_req_complete_failed(req, req->result); 2409 - } 2410 - 2411 2134 /* 2412 2135 * Don't initialise the fields below on every allocation, but do that in 2413 2136 * advance and keep them valid across allocations. ··· 2407 2152 req->link = NULL; 2408 2153 req->async_data = NULL; 2409 2154 /* not necessary, but safer to zero */ 2410 - req->result = 0; 2155 + req->cqe.res = 0; 2411 2156 } 2412 2157 2413 2158 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, ··· 2419 2164 spin_unlock(&ctx->completion_lock); 2420 2165 } 2421 2166 2422 - /* Returns true IFF there are requests in the cache */ 2423 - static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 2167 + static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) 2424 2168 { 2425 - struct io_submit_state *state = &ctx->submit_state; 2426 - 2427 - /* 2428 - * If we have more than a batch's worth of requests in our IRQ side 2429 - * locked cache, grab the lock and move them over to our submission 2430 - * side cache. 2431 - */ 2432 - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 2433 - io_flush_cached_locked_reqs(ctx, state); 2434 - return !!state->free_list.next; 2169 + return !ctx->submit_state.free_list.next; 2435 2170 } 2436 2171 2437 2172 /* ··· 2433 2188 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) 2434 2189 __must_hold(&ctx->uring_lock) 2435 2190 { 2436 - struct io_submit_state *state = &ctx->submit_state; 2437 2191 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 2438 2192 void *reqs[IO_REQ_ALLOC_BATCH]; 2439 - struct io_kiocb *req; 2440 2193 int ret, i; 2441 2194 2442 - if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) 2443 - return true; 2195 + /* 2196 + * If we have more than a batch's worth of requests in our IRQ side 2197 + * locked cache, grab the lock and move them over to our submission 2198 + * side cache. 2199 + */ 2200 + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { 2201 + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); 2202 + if (!io_req_cache_empty(ctx)) 2203 + return true; 2204 + } 2444 2205 2445 2206 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); 2446 2207 ··· 2463 2212 2464 2213 percpu_ref_get_many(&ctx->refs, ret); 2465 2214 for (i = 0; i < ret; i++) { 2466 - req = reqs[i]; 2215 + struct io_kiocb *req = reqs[i]; 2467 2216 2468 2217 io_preinit_req(req, ctx); 2469 - wq_stack_add_head(&req->comp_list, &state->free_list); 2218 + io_req_add_to_cache(req, ctx); 2470 2219 } 2471 2220 return true; 2472 2221 } 2473 2222 2474 2223 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) 2475 2224 { 2476 - if (unlikely(!ctx->submit_state.free_list.next)) 2225 + if (unlikely(io_req_cache_empty(ctx))) 2477 2226 return __io_alloc_req_refill(ctx); 2478 2227 return true; 2479 2228 } ··· 2502 2251 io_put_file(req->file); 2503 2252 } 2504 2253 2505 - static __cold void __io_free_req(struct io_kiocb *req) 2254 + static __cold void io_free_req(struct io_kiocb *req) 2506 2255 { 2507 2256 struct io_ring_ctx *ctx = req->ctx; 2508 2257 2509 - io_req_put_rsrc(req, ctx); 2258 + io_req_put_rsrc(req); 2510 2259 io_dismantle_req(req); 2511 2260 io_put_task(req->task, 1); 2512 2261 ··· 2524 2273 nxt->link = NULL; 2525 2274 } 2526 2275 2527 - static bool io_kill_linked_timeout(struct io_kiocb *req) 2276 + static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) 2528 2277 __must_hold(&req->ctx->completion_lock) 2529 2278 __must_hold(&req->ctx->timeout_lock) 2530 2279 { ··· 2537 2286 link->timeout.head = NULL; 2538 2287 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2539 2288 list_del(&link->timeout.list); 2540 - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ 2541 - io_fill_cqe_req(link, -ECANCELED, 0); 2542 - io_put_req_deferred(link); 2543 - return true; 2289 + return link; 2544 2290 } 2545 2291 } 2546 - return false; 2292 + return NULL; 2547 2293 } 2548 2294 2549 2295 static void io_fail_links(struct io_kiocb *req) ··· 2554 2306 long res = -ECANCELED; 2555 2307 2556 2308 if (link->flags & REQ_F_FAIL) 2557 - res = link->result; 2309 + res = link->cqe.res; 2558 2310 2559 2311 nxt = link->link; 2560 2312 link->link = NULL; 2561 2313 2562 - trace_io_uring_fail_link(req->ctx, req, req->user_data, 2314 + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, 2563 2315 req->opcode, link); 2564 2316 2565 - if (!ignore_cqes) { 2317 + if (ignore_cqes) 2318 + link->flags |= REQ_F_CQE_SKIP; 2319 + else 2566 2320 link->flags &= ~REQ_F_CQE_SKIP; 2567 - io_fill_cqe_req(link, res, 0); 2568 - } 2569 - io_put_req_deferred(link); 2321 + __io_req_complete_post(link, res, 0); 2570 2322 link = nxt; 2571 2323 } 2572 2324 } ··· 2574 2326 static bool io_disarm_next(struct io_kiocb *req) 2575 2327 __must_hold(&req->ctx->completion_lock) 2576 2328 { 2329 + struct io_kiocb *link = NULL; 2577 2330 bool posted = false; 2578 2331 2579 2332 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2580 - struct io_kiocb *link = req->link; 2581 - 2333 + link = req->link; 2582 2334 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2583 2335 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2584 2336 io_remove_next_linked(req); 2585 - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ 2586 - io_fill_cqe_req(link, -ECANCELED, 0); 2587 - io_put_req_deferred(link); 2337 + io_req_tw_post_queue(link, -ECANCELED, 0); 2588 2338 posted = true; 2589 2339 } 2590 2340 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2591 2341 struct io_ring_ctx *ctx = req->ctx; 2592 2342 2593 2343 spin_lock_irq(&ctx->timeout_lock); 2594 - posted = io_kill_linked_timeout(req); 2344 + link = io_disarm_linked_timeout(req); 2595 2345 spin_unlock_irq(&ctx->timeout_lock); 2346 + if (link) { 2347 + posted = true; 2348 + io_req_tw_post_queue(link, -ECANCELED, 0); 2349 + } 2596 2350 } 2597 2351 if (unlikely((req->flags & REQ_F_FAIL) && 2598 2352 !(req->flags & REQ_F_HARDLINK))) { ··· 2611 2361 2612 2362 spin_lock(&ctx->completion_lock); 2613 2363 posted = io_disarm_next(req); 2614 - if (posted) 2615 - io_commit_cqring(ctx); 2364 + io_commit_cqring(ctx); 2616 2365 spin_unlock(&ctx->completion_lock); 2617 2366 if (posted) 2618 2367 io_cqring_ev_posted(ctx); ··· 2621 2372 { 2622 2373 struct io_kiocb *nxt; 2623 2374 2624 - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2625 - return NULL; 2626 2375 /* 2627 2376 * If LINK is set, we have dependent requests in this chain. If we 2628 2377 * didn't fail this request, queue the first one up, moving any other ··· 2681 2434 if (likely(*uring_locked)) 2682 2435 req->io_task_work.func(req, uring_locked); 2683 2436 else 2684 - __io_req_complete_post(req, req->result, 2437 + __io_req_complete_post(req, req->cqe.res, 2685 2438 io_put_kbuf_comp(req)); 2686 2439 node = next; 2687 2440 } while (node); ··· 2722 2475 while (1) { 2723 2476 struct io_wq_work_node *node1, *node2; 2724 2477 2725 - if (!tctx->task_list.first && 2726 - !tctx->prior_task_list.first && uring_locked) 2727 - io_submit_flush_completions(ctx); 2728 - 2729 2478 spin_lock_irq(&tctx->task_lock); 2730 2479 node1 = tctx->prior_task_list.first; 2731 2480 node2 = tctx->task_list.first; ··· 2735 2492 2736 2493 if (node1) 2737 2494 handle_prev_tw_list(node1, &ctx, &uring_locked); 2738 - 2739 2495 if (node2) 2740 2496 handle_tw_list(node2, &ctx, &uring_locked); 2741 2497 cond_resched(); 2498 + 2499 + if (data_race(!tctx->task_list.first) && 2500 + data_race(!tctx->prior_task_list.first) && uring_locked) 2501 + io_submit_flush_completions(ctx); 2742 2502 } 2743 2503 2744 2504 ctx_flush_and_put(ctx, &uring_locked); ··· 2805 2559 } 2806 2560 } 2807 2561 2562 + static void io_req_tw_post(struct io_kiocb *req, bool *locked) 2563 + { 2564 + io_req_complete_post(req, req->cqe.res, req->cqe.flags); 2565 + } 2566 + 2567 + static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) 2568 + { 2569 + req->cqe.res = res; 2570 + req->cqe.flags = cflags; 2571 + req->io_task_work.func = io_req_tw_post; 2572 + io_req_task_work_add(req, false); 2573 + } 2574 + 2808 2575 static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2809 2576 { 2810 - struct io_ring_ctx *ctx = req->ctx; 2811 - 2812 2577 /* not needed for normal modes, but SQPOLL depends on it */ 2813 - io_tw_lock(ctx, locked); 2814 - io_req_complete_failed(req, req->result); 2578 + io_tw_lock(req->ctx, locked); 2579 + io_req_complete_failed(req, req->cqe.res); 2815 2580 } 2816 2581 2817 2582 static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2818 2583 { 2819 - struct io_ring_ctx *ctx = req->ctx; 2820 - 2821 - io_tw_lock(ctx, locked); 2584 + io_tw_lock(req->ctx, locked); 2822 2585 /* req->task == current here, checking PF_EXITING is safe */ 2823 2586 if (likely(!(req->task->flags & PF_EXITING))) 2824 - __io_queue_sqe(req); 2587 + io_queue_sqe(req); 2825 2588 else 2826 2589 io_req_complete_failed(req, -EFAULT); 2827 2590 } 2828 2591 2829 2592 static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2830 2593 { 2831 - req->result = ret; 2594 + req->cqe.res = ret; 2832 2595 req->io_task_work.func = io_req_task_cancel; 2833 2596 io_req_task_work_add(req, false); 2834 2597 } ··· 2850 2595 2851 2596 static void io_req_task_queue_reissue(struct io_kiocb *req) 2852 2597 { 2853 - req->io_task_work.func = io_queue_async_work; 2598 + req->io_task_work.func = io_queue_iowq; 2854 2599 io_req_task_work_add(req, false); 2855 2600 } 2856 2601 2857 - static inline void io_queue_next(struct io_kiocb *req) 2602 + static void io_queue_next(struct io_kiocb *req) 2858 2603 { 2859 2604 struct io_kiocb *nxt = io_req_find_next(req); 2860 2605 2861 2606 if (nxt) 2862 2607 io_req_task_queue(nxt); 2863 - } 2864 - 2865 - static void io_free_req(struct io_kiocb *req) 2866 - { 2867 - io_queue_next(req); 2868 - __io_free_req(req); 2869 - } 2870 - 2871 - static void io_free_req_work(struct io_kiocb *req, bool *locked) 2872 - { 2873 - io_free_req(req); 2874 2608 } 2875 2609 2876 2610 static void io_free_batch_list(struct io_ring_ctx *ctx, ··· 2873 2629 struct io_kiocb *req = container_of(node, struct io_kiocb, 2874 2630 comp_list); 2875 2631 2876 - if (unlikely(req->flags & REQ_F_REFCOUNT)) { 2877 - node = req->comp_list.next; 2878 - if (!req_ref_put_and_test(req)) 2879 - continue; 2632 + if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { 2633 + if (req->flags & REQ_F_REFCOUNT) { 2634 + node = req->comp_list.next; 2635 + if (!req_ref_put_and_test(req)) 2636 + continue; 2637 + } 2638 + if ((req->flags & REQ_F_POLLED) && req->apoll) { 2639 + struct async_poll *apoll = req->apoll; 2640 + 2641 + if (apoll->double_poll) 2642 + kfree(apoll->double_poll); 2643 + list_add(&apoll->poll.wait.entry, 2644 + &ctx->apoll_cache); 2645 + req->flags &= ~REQ_F_POLLED; 2646 + } 2647 + if (req->flags & IO_REQ_LINK_FLAGS) 2648 + io_queue_next(req); 2649 + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 2650 + io_clean_op(req); 2880 2651 } 2652 + if (!(req->flags & REQ_F_FIXED_FILE)) 2653 + io_put_file(req->file); 2881 2654 2882 2655 io_req_put_rsrc_locked(req, ctx); 2883 - io_queue_next(req); 2884 - io_dismantle_req(req); 2885 2656 2886 2657 if (req->task != task) { 2887 2658 if (task) ··· 2906 2647 } 2907 2648 task_refs++; 2908 2649 node = req->comp_list.next; 2909 - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 2650 + io_req_add_to_cache(req, ctx); 2910 2651 } while (node); 2911 2652 2912 2653 if (task) ··· 2926 2667 comp_list); 2927 2668 2928 2669 if (!(req->flags & REQ_F_CQE_SKIP)) 2929 - __io_fill_cqe_req(req, req->result, req->cflags); 2930 - if ((req->flags & REQ_F_POLLED) && req->apoll) { 2931 - struct async_poll *apoll = req->apoll; 2932 - 2933 - if (apoll->double_poll) 2934 - kfree(apoll->double_poll); 2935 - list_add(&apoll->poll.wait.entry, 2936 - &ctx->apoll_cache); 2937 - req->flags &= ~REQ_F_POLLED; 2938 - } 2670 + __io_fill_cqe_req_filled(ctx, req); 2939 2671 } 2940 2672 2941 2673 io_commit_cqring(ctx); ··· 2948 2698 struct io_kiocb *nxt = NULL; 2949 2699 2950 2700 if (req_ref_put_and_test(req)) { 2951 - nxt = io_req_find_next(req); 2952 - __io_free_req(req); 2701 + if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) 2702 + nxt = io_req_find_next(req); 2703 + io_free_req(req); 2953 2704 } 2954 2705 return nxt; 2955 2706 } 2956 2707 2957 2708 static inline void io_put_req(struct io_kiocb *req) 2958 2709 { 2959 - if (req_ref_put_and_test(req)) 2960 - io_free_req(req); 2961 - } 2962 - 2963 - static inline void io_put_req_deferred(struct io_kiocb *req) 2964 - { 2965 2710 if (req_ref_put_and_test(req)) { 2966 - req->io_task_work.func = io_free_req_work; 2967 - io_req_task_work_add(req, false); 2711 + io_queue_next(req); 2712 + io_free_req(req); 2968 2713 } 2969 2714 } 2970 2715 ··· 3045 2800 nr_events++; 3046 2801 if (unlikely(req->flags & REQ_F_CQE_SKIP)) 3047 2802 continue; 3048 - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); 2803 + __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); 3049 2804 } 3050 2805 3051 2806 if (unlikely(!nr_events)) ··· 3091 2846 { 3092 2847 unsigned int nr_events = 0; 3093 2848 int ret = 0; 2849 + unsigned long check_cq; 3094 2850 3095 - /* 3096 - * We disallow the app entering submit/complete with polling, but we 3097 - * still need to lock the ring to prevent racing with polled issue 3098 - * that got punted to a workqueue. 3099 - */ 3100 - mutex_lock(&ctx->uring_lock); 3101 2851 /* 3102 2852 * Don't enter poll loop if we already have events pending. 3103 2853 * If we do, we can potentially be spinning for commands that 3104 2854 * already triggered a CQE (eg in error). 3105 2855 */ 3106 - if (test_bit(0, &ctx->check_cq_overflow)) 2856 + check_cq = READ_ONCE(ctx->check_cq); 2857 + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 3107 2858 __io_cqring_overflow_flush(ctx, false); 3108 2859 if (io_cqring_events(ctx)) 3109 - goto out; 2860 + return 0; 2861 + 2862 + /* 2863 + * Similarly do not spin if we have not informed the user of any 2864 + * dropped CQE. 2865 + */ 2866 + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) 2867 + return -EBADR; 2868 + 3110 2869 do { 3111 2870 /* 3112 2871 * If a submit got punted to a workqueue, we can have the ··· 3140 2891 nr_events += ret; 3141 2892 ret = 0; 3142 2893 } while (nr_events < min && !need_resched()); 3143 - out: 3144 - mutex_unlock(&ctx->uring_lock); 2894 + 3145 2895 return ret; 3146 2896 } 3147 2897 ··· 3213 2965 } else { 3214 2966 fsnotify_access(req->file); 3215 2967 } 3216 - if (unlikely(res != req->result)) { 2968 + if (unlikely(res != req->cqe.res)) { 3217 2969 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 3218 2970 io_rw_should_reissue(req)) { 3219 2971 req->flags |= REQ_F_REISSUE; 3220 2972 return true; 3221 2973 } 3222 2974 req_set_fail(req); 3223 - req->result = res; 2975 + req->cqe.res = res; 3224 2976 } 3225 2977 return false; 3226 2978 } 3227 2979 3228 2980 static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) 3229 2981 { 3230 - int res = req->result; 2982 + int res = req->cqe.res; 3231 2983 3232 2984 if (*locked) { 3233 2985 io_req_complete_state(req, res, io_put_kbuf(req, 0)); ··· 3243 2995 { 3244 2996 if (__io_complete_rw_common(req, res)) 3245 2997 return; 3246 - __io_req_complete(req, issue_flags, req->result, 2998 + __io_req_complete(req, issue_flags, req->cqe.res, 3247 2999 io_put_kbuf(req, issue_flags)); 3248 3000 } 3249 3001 ··· 3253 3005 3254 3006 if (__io_complete_rw_common(req, res)) 3255 3007 return; 3256 - req->result = res; 3008 + req->cqe.res = res; 3257 3009 req->io_task_work.func = io_req_task_complete; 3258 3010 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); 3259 3011 } ··· 3264 3016 3265 3017 if (kiocb->ki_flags & IOCB_WRITE) 3266 3018 kiocb_end_write(req); 3267 - if (unlikely(res != req->result)) { 3019 + if (unlikely(res != req->cqe.res)) { 3268 3020 if (res == -EAGAIN && io_rw_should_reissue(req)) { 3269 3021 req->flags |= REQ_F_REISSUE; 3270 3022 return; 3271 3023 } 3272 - req->result = res; 3024 + req->cqe.res = res; 3273 3025 } 3274 3026 3275 3027 /* order with io_iopoll_complete() checking ->iopoll_completed */ ··· 3379 3131 res |= FFS_ISREG; 3380 3132 if (__io_file_supports_nowait(file, mode)) 3381 3133 res |= FFS_NOWAIT; 3134 + if (io_file_need_scm(file)) 3135 + res |= FFS_SCM; 3382 3136 return res; 3383 3137 } 3384 3138 ··· 3560 3310 return __io_import_fixed(req, rw, iter, imu); 3561 3311 } 3562 3312 3563 - static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3564 - { 3565 - if (needs_lock) 3566 - mutex_unlock(&ctx->uring_lock); 3567 - } 3568 - 3569 - static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3570 - { 3571 - /* 3572 - * "Normal" inline submissions always hold the uring_lock, since we 3573 - * grab it from the system call. Same is true for the SQPOLL offload. 3574 - * The only exception is when we've detached the request and issue it 3575 - * from an async worker thread, grab the lock for that case. 3576 - */ 3577 - if (needs_lock) 3578 - mutex_lock(&ctx->uring_lock); 3579 - } 3580 - 3581 3313 static void io_buffer_add_list(struct io_ring_ctx *ctx, 3582 3314 struct io_buffer_list *bl, unsigned int bgid) 3583 3315 { ··· 3575 3343 int bgid, unsigned int issue_flags) 3576 3344 { 3577 3345 struct io_buffer *kbuf = req->kbuf; 3578 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 3579 3346 struct io_ring_ctx *ctx = req->ctx; 3580 3347 struct io_buffer_list *bl; 3581 3348 3582 3349 if (req->flags & REQ_F_BUFFER_SELECTED) 3583 3350 return kbuf; 3584 3351 3585 - io_ring_submit_lock(ctx, needs_lock); 3586 - 3587 - lockdep_assert_held(&ctx->uring_lock); 3352 + io_ring_submit_lock(req->ctx, issue_flags); 3588 3353 3589 3354 bl = io_buffer_get_list(ctx, bgid); 3590 3355 if (bl && !list_empty(&bl->buf_list)) { ··· 3595 3366 kbuf = ERR_PTR(-ENOBUFS); 3596 3367 } 3597 3368 3598 - io_ring_submit_unlock(req->ctx, needs_lock); 3369 + io_ring_submit_unlock(req->ctx, issue_flags); 3599 3370 return kbuf; 3600 3371 } 3601 3372 ··· 4065 3836 kfree(iovec); 4066 3837 return ret; 4067 3838 } 4068 - req->result = iov_iter_count(&s->iter); 3839 + req->cqe.res = iov_iter_count(&s->iter); 4069 3840 4070 3841 if (force_nonblock) { 4071 3842 /* If the file doesn't support async, just async punt */ ··· 4081 3852 4082 3853 ppos = io_kiocb_update_pos(req); 4083 3854 4084 - ret = rw_verify_area(READ, req->file, ppos, req->result); 3855 + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); 4085 3856 if (unlikely(ret)) { 4086 3857 kfree(iovec); 4087 3858 return ret; ··· 4103 3874 ret = 0; 4104 3875 } else if (ret == -EIOCBQUEUED) { 4105 3876 goto out_free; 4106 - } else if (ret == req->result || ret <= 0 || !force_nonblock || 3877 + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || 4107 3878 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 4108 3879 /* read all, failed, already did sync or don't want to retry */ 4109 3880 goto done; ··· 4193 3964 kfree(iovec); 4194 3965 return ret; 4195 3966 } 4196 - req->result = iov_iter_count(&s->iter); 3967 + req->cqe.res = iov_iter_count(&s->iter); 4197 3968 4198 3969 if (force_nonblock) { 4199 3970 /* If the file doesn't support async, just async punt */ ··· 4213 3984 4214 3985 ppos = io_kiocb_update_pos(req); 4215 3986 4216 - ret = rw_verify_area(WRITE, req->file, ppos, req->result); 3987 + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); 4217 3988 if (unlikely(ret)) 4218 3989 goto out_free; 4219 3990 ··· 4319 4090 if (ret < 0) 4320 4091 req_set_fail(req); 4321 4092 io_req_complete(req, ret); 4093 + return 0; 4094 + } 4095 + 4096 + static inline void __io_xattr_finish(struct io_kiocb *req) 4097 + { 4098 + struct io_xattr *ix = &req->xattr; 4099 + 4100 + if (ix->filename) 4101 + putname(ix->filename); 4102 + 4103 + kfree(ix->ctx.kname); 4104 + kvfree(ix->ctx.kvalue); 4105 + } 4106 + 4107 + static void io_xattr_finish(struct io_kiocb *req, int ret) 4108 + { 4109 + req->flags &= ~REQ_F_NEED_CLEANUP; 4110 + 4111 + __io_xattr_finish(req); 4112 + if (ret < 0) 4113 + req_set_fail(req); 4114 + 4115 + io_req_complete(req, ret); 4116 + } 4117 + 4118 + static int __io_getxattr_prep(struct io_kiocb *req, 4119 + const struct io_uring_sqe *sqe) 4120 + { 4121 + struct io_xattr *ix = &req->xattr; 4122 + const char __user *name; 4123 + int ret; 4124 + 4125 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4126 + return -EINVAL; 4127 + if (unlikely(sqe->ioprio)) 4128 + return -EINVAL; 4129 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4130 + return -EBADF; 4131 + 4132 + ix->filename = NULL; 4133 + ix->ctx.kvalue = NULL; 4134 + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4135 + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4136 + ix->ctx.size = READ_ONCE(sqe->len); 4137 + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); 4138 + 4139 + if (ix->ctx.flags) 4140 + return -EINVAL; 4141 + 4142 + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); 4143 + if (!ix->ctx.kname) 4144 + return -ENOMEM; 4145 + 4146 + ret = strncpy_from_user(ix->ctx.kname->name, name, 4147 + sizeof(ix->ctx.kname->name)); 4148 + if (!ret || ret == sizeof(ix->ctx.kname->name)) 4149 + ret = -ERANGE; 4150 + if (ret < 0) { 4151 + kfree(ix->ctx.kname); 4152 + return ret; 4153 + } 4154 + 4155 + req->flags |= REQ_F_NEED_CLEANUP; 4156 + return 0; 4157 + } 4158 + 4159 + static int io_fgetxattr_prep(struct io_kiocb *req, 4160 + const struct io_uring_sqe *sqe) 4161 + { 4162 + return __io_getxattr_prep(req, sqe); 4163 + } 4164 + 4165 + static int io_getxattr_prep(struct io_kiocb *req, 4166 + const struct io_uring_sqe *sqe) 4167 + { 4168 + struct io_xattr *ix = &req->xattr; 4169 + const char __user *path; 4170 + int ret; 4171 + 4172 + ret = __io_getxattr_prep(req, sqe); 4173 + if (ret) 4174 + return ret; 4175 + 4176 + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); 4177 + 4178 + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); 4179 + if (IS_ERR(ix->filename)) { 4180 + ret = PTR_ERR(ix->filename); 4181 + ix->filename = NULL; 4182 + } 4183 + 4184 + return ret; 4185 + } 4186 + 4187 + static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) 4188 + { 4189 + struct io_xattr *ix = &req->xattr; 4190 + int ret; 4191 + 4192 + if (issue_flags & IO_URING_F_NONBLOCK) 4193 + return -EAGAIN; 4194 + 4195 + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), 4196 + req->file->f_path.dentry, 4197 + &ix->ctx); 4198 + 4199 + io_xattr_finish(req, ret); 4200 + return 0; 4201 + } 4202 + 4203 + static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) 4204 + { 4205 + struct io_xattr *ix = &req->xattr; 4206 + unsigned int lookup_flags = LOOKUP_FOLLOW; 4207 + struct path path; 4208 + int ret; 4209 + 4210 + if (issue_flags & IO_URING_F_NONBLOCK) 4211 + return -EAGAIN; 4212 + 4213 + retry: 4214 + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); 4215 + if (!ret) { 4216 + ret = do_getxattr(mnt_user_ns(path.mnt), 4217 + path.dentry, 4218 + &ix->ctx); 4219 + 4220 + path_put(&path); 4221 + if (retry_estale(ret, lookup_flags)) { 4222 + lookup_flags |= LOOKUP_REVAL; 4223 + goto retry; 4224 + } 4225 + } 4226 + 4227 + io_xattr_finish(req, ret); 4228 + return 0; 4229 + } 4230 + 4231 + static int __io_setxattr_prep(struct io_kiocb *req, 4232 + const struct io_uring_sqe *sqe) 4233 + { 4234 + struct io_xattr *ix = &req->xattr; 4235 + const char __user *name; 4236 + int ret; 4237 + 4238 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4239 + return -EINVAL; 4240 + if (unlikely(sqe->ioprio)) 4241 + return -EINVAL; 4242 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4243 + return -EBADF; 4244 + 4245 + ix->filename = NULL; 4246 + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4247 + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4248 + ix->ctx.kvalue = NULL; 4249 + ix->ctx.size = READ_ONCE(sqe->len); 4250 + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); 4251 + 4252 + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); 4253 + if (!ix->ctx.kname) 4254 + return -ENOMEM; 4255 + 4256 + ret = setxattr_copy(name, &ix->ctx); 4257 + if (ret) { 4258 + kfree(ix->ctx.kname); 4259 + return ret; 4260 + } 4261 + 4262 + req->flags |= REQ_F_NEED_CLEANUP; 4263 + return 0; 4264 + } 4265 + 4266 + static int io_setxattr_prep(struct io_kiocb *req, 4267 + const struct io_uring_sqe *sqe) 4268 + { 4269 + struct io_xattr *ix = &req->xattr; 4270 + const char __user *path; 4271 + int ret; 4272 + 4273 + ret = __io_setxattr_prep(req, sqe); 4274 + if (ret) 4275 + return ret; 4276 + 4277 + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); 4278 + 4279 + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); 4280 + if (IS_ERR(ix->filename)) { 4281 + ret = PTR_ERR(ix->filename); 4282 + ix->filename = NULL; 4283 + } 4284 + 4285 + return ret; 4286 + } 4287 + 4288 + static int io_fsetxattr_prep(struct io_kiocb *req, 4289 + const struct io_uring_sqe *sqe) 4290 + { 4291 + return __io_setxattr_prep(req, sqe); 4292 + } 4293 + 4294 + static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, 4295 + struct path *path) 4296 + { 4297 + struct io_xattr *ix = &req->xattr; 4298 + int ret; 4299 + 4300 + ret = mnt_want_write(path->mnt); 4301 + if (!ret) { 4302 + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); 4303 + mnt_drop_write(path->mnt); 4304 + } 4305 + 4306 + return ret; 4307 + } 4308 + 4309 + static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) 4310 + { 4311 + int ret; 4312 + 4313 + if (issue_flags & IO_URING_F_NONBLOCK) 4314 + return -EAGAIN; 4315 + 4316 + ret = __io_setxattr(req, issue_flags, &req->file->f_path); 4317 + io_xattr_finish(req, ret); 4318 + 4319 + return 0; 4320 + } 4321 + 4322 + static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) 4323 + { 4324 + struct io_xattr *ix = &req->xattr; 4325 + unsigned int lookup_flags = LOOKUP_FOLLOW; 4326 + struct path path; 4327 + int ret; 4328 + 4329 + if (issue_flags & IO_URING_F_NONBLOCK) 4330 + return -EAGAIN; 4331 + 4332 + retry: 4333 + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); 4334 + if (!ret) { 4335 + ret = __io_setxattr(req, issue_flags, &path); 4336 + path_put(&path); 4337 + if (retry_estale(ret, lookup_flags)) { 4338 + lookup_flags |= LOOKUP_REVAL; 4339 + goto retry; 4340 + } 4341 + } 4342 + 4343 + io_xattr_finish(req, ret); 4322 4344 return 0; 4323 4345 } 4324 4346 ··· 5216 4736 struct io_ring_ctx *ctx = req->ctx; 5217 4737 struct io_buffer_list *bl; 5218 4738 int ret = 0; 5219 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 5220 4739 5221 - io_ring_submit_lock(ctx, needs_lock); 5222 - 5223 - lockdep_assert_held(&ctx->uring_lock); 4740 + io_ring_submit_lock(ctx, issue_flags); 5224 4741 5225 4742 ret = -ENOENT; 5226 4743 bl = io_buffer_get_list(ctx, p->bgid); ··· 5228 4751 5229 4752 /* complete before unlock, IOPOLL may need the lock */ 5230 4753 __io_req_complete(req, issue_flags, ret, 0); 5231 - io_ring_submit_unlock(ctx, needs_lock); 4754 + io_ring_submit_unlock(ctx, issue_flags); 5232 4755 return 0; 5233 4756 } 5234 4757 ··· 5342 4865 struct io_ring_ctx *ctx = req->ctx; 5343 4866 struct io_buffer_list *bl; 5344 4867 int ret = 0; 5345 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 5346 4868 5347 - io_ring_submit_lock(ctx, needs_lock); 5348 - 5349 - lockdep_assert_held(&ctx->uring_lock); 4869 + io_ring_submit_lock(ctx, issue_flags); 5350 4870 5351 4871 bl = io_buffer_get_list(ctx, p->bgid); 5352 4872 if (unlikely(!bl)) { ··· 5361 4887 req_set_fail(req); 5362 4888 /* complete before unlock, IOPOLL may need the lock */ 5363 4889 __io_req_complete(req, issue_flags, ret, 0); 5364 - io_ring_submit_unlock(ctx, needs_lock); 4890 + io_ring_submit_unlock(ctx, issue_flags); 5365 4891 return 0; 5366 4892 } 5367 4893 ··· 5635 5161 } 5636 5162 5637 5163 #if defined(CONFIG_NET) 5164 + static bool io_net_retry(struct socket *sock, int flags) 5165 + { 5166 + if (!(flags & MSG_WAITALL)) 5167 + return false; 5168 + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 5169 + } 5170 + 5638 5171 static int io_setup_async_msg(struct io_kiocb *req, 5639 5172 struct io_async_msghdr *kmsg) 5640 5173 { ··· 5702 5221 if (req->ctx->compat) 5703 5222 sr->msg_flags |= MSG_CMSG_COMPAT; 5704 5223 #endif 5224 + sr->done_io = 0; 5705 5225 return 0; 5706 5226 } 5707 5227 5708 5228 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 5709 5229 { 5710 5230 struct io_async_msghdr iomsg, *kmsg; 5231 + struct io_sr_msg *sr = &req->sr_msg; 5711 5232 struct socket *sock; 5712 5233 unsigned flags; 5713 5234 int min_ret = 0; ··· 5741 5258 return io_setup_async_msg(req, kmsg); 5742 5259 if (ret == -ERESTARTSYS) 5743 5260 ret = -EINTR; 5261 + if (ret > 0 && io_net_retry(sock, flags)) { 5262 + sr->done_io += ret; 5263 + req->flags |= REQ_F_PARTIAL_IO; 5264 + return io_setup_async_msg(req, kmsg); 5265 + } 5744 5266 req_set_fail(req); 5745 5267 } 5746 5268 /* fast path, check for non-NULL to avoid function call */ 5747 5269 if (kmsg->free_iov) 5748 5270 kfree(kmsg->free_iov); 5749 5271 req->flags &= ~REQ_F_NEED_CLEANUP; 5272 + if (ret >= 0) 5273 + ret += sr->done_io; 5274 + else if (sr->done_io) 5275 + ret = sr->done_io; 5750 5276 __io_req_complete(req, issue_flags, ret, 0); 5751 5277 return 0; 5752 5278 } ··· 5796 5304 return -EAGAIN; 5797 5305 if (ret == -ERESTARTSYS) 5798 5306 ret = -EINTR; 5307 + if (ret > 0 && io_net_retry(sock, flags)) { 5308 + sr->len -= ret; 5309 + sr->buf += ret; 5310 + sr->done_io += ret; 5311 + req->flags |= REQ_F_PARTIAL_IO; 5312 + return -EAGAIN; 5313 + } 5799 5314 req_set_fail(req); 5800 5315 } 5316 + if (ret >= 0) 5317 + ret += sr->done_io; 5318 + else if (sr->done_io) 5319 + ret = sr->done_io; 5801 5320 __io_req_complete(req, issue_flags, ret, 0); 5802 5321 return 0; 5803 5322 } ··· 5940 5437 #endif 5941 5438 sr->done_io = 0; 5942 5439 return 0; 5943 - } 5944 - 5945 - static bool io_net_retry(struct socket *sock, int flags) 5946 - { 5947 - if (!(flags & MSG_WAITALL)) 5948 - return false; 5949 - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 5950 5440 } 5951 5441 5952 5442 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ··· 6137 5641 return 0; 6138 5642 } 6139 5643 5644 + static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5645 + { 5646 + struct io_socket *sock = &req->sock; 5647 + 5648 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5649 + return -EINVAL; 5650 + if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index) 5651 + return -EINVAL; 5652 + 5653 + sock->domain = READ_ONCE(sqe->fd); 5654 + sock->type = READ_ONCE(sqe->off); 5655 + sock->protocol = READ_ONCE(sqe->len); 5656 + sock->file_slot = READ_ONCE(sqe->file_index); 5657 + sock->nofile = rlimit(RLIMIT_NOFILE); 5658 + 5659 + sock->flags = sock->type & ~SOCK_TYPE_MASK; 5660 + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) 5661 + return -EINVAL; 5662 + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 5663 + return -EINVAL; 5664 + return 0; 5665 + } 5666 + 5667 + static int io_socket(struct io_kiocb *req, unsigned int issue_flags) 5668 + { 5669 + struct io_socket *sock = &req->sock; 5670 + bool fixed = !!sock->file_slot; 5671 + struct file *file; 5672 + int ret, fd; 5673 + 5674 + if (!fixed) { 5675 + fd = __get_unused_fd_flags(sock->flags, sock->nofile); 5676 + if (unlikely(fd < 0)) 5677 + return fd; 5678 + } 5679 + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); 5680 + if (IS_ERR(file)) { 5681 + if (!fixed) 5682 + put_unused_fd(fd); 5683 + ret = PTR_ERR(file); 5684 + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 5685 + return -EAGAIN; 5686 + if (ret == -ERESTARTSYS) 5687 + ret = -EINTR; 5688 + req_set_fail(req); 5689 + } else if (!fixed) { 5690 + fd_install(fd, file); 5691 + ret = fd; 5692 + } else { 5693 + ret = io_install_fixed_file(req, file, issue_flags, 5694 + sock->file_slot - 1); 5695 + } 5696 + __io_req_complete(req, issue_flags, ret, 0); 5697 + return 0; 5698 + } 5699 + 6140 5700 static int io_connect_prep_async(struct io_kiocb *req) 6141 5701 { 6142 5702 struct io_async_connect *io = req->async_data; ··· 6281 5729 IO_NETOP_PREP_ASYNC(recvmsg); 6282 5730 IO_NETOP_PREP_ASYNC(connect); 6283 5731 IO_NETOP_PREP(accept); 5732 + IO_NETOP_PREP(socket); 6284 5733 IO_NETOP_FN(send); 6285 5734 IO_NETOP_FN(recv); 6286 5735 #endif /* CONFIG_NET */ ··· 6332 5779 struct io_ring_ctx *ctx = req->ctx; 6333 5780 struct hlist_head *list; 6334 5781 6335 - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5782 + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; 6336 5783 hlist_add_head(&req->hash_node, list); 6337 5784 } 6338 5785 ··· 6397 5844 * 6398 5845 * Returns a negative error on failure. >0 when no action require, which is 6399 5846 * either spurious wakeup or multishot CQE is served. 0 when it's done with 6400 - * the request, then the mask is stored in req->result. 5847 + * the request, then the mask is stored in req->cqe.res. 6401 5848 */ 6402 5849 static int io_poll_check_events(struct io_kiocb *req, bool locked) 6403 5850 { ··· 6406 5853 6407 5854 /* req->task == current here, checking PF_EXITING is safe */ 6408 5855 if (unlikely(req->task->flags & PF_EXITING)) 6409 - io_poll_mark_cancelled(req); 5856 + return -ECANCELED; 6410 5857 6411 5858 do { 6412 5859 v = atomic_read(&req->poll_refs); ··· 6417 5864 if (v & IO_POLL_CANCEL_FLAG) 6418 5865 return -ECANCELED; 6419 5866 6420 - if (!req->result) { 5867 + if (!req->cqe.res) { 6421 5868 struct poll_table_struct pt = { ._key = req->apoll_events }; 6422 5869 unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; 6423 5870 6424 5871 if (unlikely(!io_assign_file(req, flags))) 6425 5872 return -EBADF; 6426 - req->result = vfs_poll(req->file, &pt) & req->apoll_events; 5873 + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; 6427 5874 } 6428 5875 6429 5876 /* multishot, just fill an CQE and proceed */ 6430 - if (req->result && !(req->apoll_events & EPOLLONESHOT)) { 6431 - __poll_t mask = mangle_poll(req->result & req->apoll_events); 5877 + if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) { 5878 + __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); 6432 5879 bool filled; 6433 5880 6434 5881 spin_lock(&ctx->completion_lock); 6435 - filled = io_fill_cqe_aux(ctx, req->user_data, mask, 5882 + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask, 6436 5883 IORING_CQE_F_MORE); 6437 5884 io_commit_cqring(ctx); 6438 5885 spin_unlock(&ctx->completion_lock); 6439 5886 if (unlikely(!filled)) 6440 5887 return -ECANCELED; 6441 5888 io_cqring_ev_posted(ctx); 6442 - } else if (req->result) { 5889 + } else if (req->cqe.res) { 6443 5890 return 0; 6444 5891 } 6445 5892 ··· 6462 5909 return; 6463 5910 6464 5911 if (!ret) { 6465 - req->result = mangle_poll(req->result & req->poll.events); 5912 + req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); 6466 5913 } else { 6467 - req->result = ret; 5914 + req->cqe.res = ret; 6468 5915 req_set_fail(req); 6469 5916 } 6470 5917 6471 5918 io_poll_remove_entries(req); 6472 5919 spin_lock(&ctx->completion_lock); 6473 5920 hash_del(&req->hash_node); 6474 - __io_req_complete_post(req, req->result, 0); 5921 + __io_req_complete_post(req, req->cqe.res, 0); 6475 5922 io_commit_cqring(ctx); 6476 5923 spin_unlock(&ctx->completion_lock); 6477 5924 io_cqring_ev_posted(ctx); ··· 6499 5946 6500 5947 static void __io_poll_execute(struct io_kiocb *req, int mask, int events) 6501 5948 { 6502 - req->result = mask; 5949 + req->cqe.res = mask; 6503 5950 /* 6504 5951 * This is useful for poll that is armed on behalf of another 6505 5952 * request, and where the wakeup path could be on a different ··· 6512 5959 else 6513 5960 req->io_task_work.func = io_apoll_task_func; 6514 5961 6515 - trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); 5962 + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); 6516 5963 io_req_task_work_add(req, false); 6517 5964 } 6518 5965 ··· 6651 6098 int v; 6652 6099 6653 6100 INIT_HLIST_NODE(&req->hash_node); 6101 + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); 6654 6102 io_init_poll_iocb(poll, mask, io_poll_wake); 6655 6103 poll->file = req->file; 6656 6104 ··· 6727 6173 6728 6174 if (!def->pollin && !def->pollout) 6729 6175 return IO_APOLL_ABORTED; 6730 - if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) 6176 + if (!file_can_poll(req->file)) 6177 + return IO_APOLL_ABORTED; 6178 + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) 6731 6179 return IO_APOLL_ABORTED; 6732 6180 6733 6181 if (def->pollin) { ··· 6744 6188 } 6745 6189 if (def->poll_exclusive) 6746 6190 mask |= EPOLLEXCLUSIVE; 6747 - if (!(issue_flags & IO_URING_F_UNLOCKED) && 6748 - !list_empty(&ctx->apoll_cache)) { 6191 + if (req->flags & REQ_F_POLLED) { 6192 + apoll = req->apoll; 6193 + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && 6194 + !list_empty(&ctx->apoll_cache)) { 6749 6195 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, 6750 6196 poll.wait.entry); 6751 6197 list_del_init(&apoll->poll.wait.entry); ··· 6767 6209 if (ret || ipt.error) 6768 6210 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; 6769 6211 6770 - trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, 6212 + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, 6771 6213 mask, apoll->poll.events); 6772 6214 return IO_APOLL_OK; 6773 6215 } ··· 6800 6242 return found; 6801 6243 } 6802 6244 6803 - static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 6804 - bool poll_only) 6245 + static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, 6246 + struct io_cancel_data *cd) 6805 6247 __must_hold(&ctx->completion_lock) 6806 6248 { 6807 6249 struct hlist_head *list; 6808 6250 struct io_kiocb *req; 6809 6251 6810 - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 6252 + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; 6811 6253 hlist_for_each_entry(req, list, hash_node) { 6812 - if (sqe_addr != req->user_data) 6254 + if (cd->data != req->cqe.user_data) 6813 6255 continue; 6814 6256 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 6815 6257 continue; 6258 + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { 6259 + if (cd->seq == req->work.cancel_seq) 6260 + continue; 6261 + req->work.cancel_seq = cd->seq; 6262 + } 6816 6263 return req; 6264 + } 6265 + return NULL; 6266 + } 6267 + 6268 + static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, 6269 + struct io_cancel_data *cd) 6270 + __must_hold(&ctx->completion_lock) 6271 + { 6272 + struct io_kiocb *req; 6273 + int i; 6274 + 6275 + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 6276 + struct hlist_head *list; 6277 + 6278 + list = &ctx->cancel_hash[i]; 6279 + hlist_for_each_entry(req, list, hash_node) { 6280 + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 6281 + req->file != cd->file) 6282 + continue; 6283 + if (cd->seq == req->work.cancel_seq) 6284 + continue; 6285 + req->work.cancel_seq = cd->seq; 6286 + return req; 6287 + } 6817 6288 } 6818 6289 return NULL; 6819 6290 } ··· 6857 6270 return true; 6858 6271 } 6859 6272 6860 - static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 6861 - bool poll_only) 6273 + static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) 6862 6274 __must_hold(&ctx->completion_lock) 6863 6275 { 6864 - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); 6276 + struct io_kiocb *req; 6865 6277 6278 + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) 6279 + req = io_poll_file_find(ctx, cd); 6280 + else 6281 + req = io_poll_find(ctx, false, cd); 6866 6282 if (!req) 6867 6283 return -ENOENT; 6868 6284 io_poll_cancel_req(req); ··· 6956 6366 6957 6367 static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 6958 6368 { 6369 + struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; 6959 6370 struct io_ring_ctx *ctx = req->ctx; 6960 6371 struct io_kiocb *preq; 6961 6372 int ret2, ret = 0; 6962 6373 bool locked; 6963 6374 6964 6375 spin_lock(&ctx->completion_lock); 6965 - preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 6376 + preq = io_poll_find(ctx, true, &cd); 6966 6377 if (!preq || !io_poll_disarm(preq)) { 6967 6378 spin_unlock(&ctx->completion_lock); 6968 6379 ret = preq ? -EALREADY : -ENOENT; ··· 6979 6388 preq->poll.events |= IO_POLL_UNMASK; 6980 6389 } 6981 6390 if (req->poll_update.update_user_data) 6982 - preq->user_data = req->poll_update.new_user_data; 6391 + preq->cqe.user_data = req->poll_update.new_user_data; 6983 6392 6984 6393 ret2 = io_poll_add(preq, issue_flags); 6985 6394 /* successfully updated, don't complete poll request */ ··· 6988 6397 } 6989 6398 6990 6399 req_set_fail(preq); 6991 - preq->result = -ECANCELED; 6400 + preq->cqe.res = -ECANCELED; 6992 6401 locked = !(issue_flags & IO_URING_F_UNLOCKED); 6993 6402 io_req_task_complete(preq, &locked); 6994 6403 out: ··· 7016 6425 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) 7017 6426 req_set_fail(req); 7018 6427 7019 - req->result = -ETIME; 6428 + req->cqe.res = -ETIME; 7020 6429 req->io_task_work.func = io_req_task_complete; 7021 6430 io_req_task_work_add(req, false); 7022 6431 return HRTIMER_NORESTART; 7023 6432 } 7024 6433 7025 6434 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 7026 - __u64 user_data) 6435 + struct io_cancel_data *cd) 7027 6436 __must_hold(&ctx->timeout_lock) 7028 6437 { 7029 6438 struct io_timeout_data *io; ··· 7031 6440 bool found = false; 7032 6441 7033 6442 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 7034 - found = user_data == req->user_data; 7035 - if (found) 7036 - break; 6443 + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 6444 + cd->data != req->cqe.user_data) 6445 + continue; 6446 + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 6447 + if (cd->seq == req->work.cancel_seq) 6448 + continue; 6449 + req->work.cancel_seq = cd->seq; 6450 + } 6451 + found = true; 6452 + break; 7037 6453 } 7038 6454 if (!found) 7039 6455 return ERR_PTR(-ENOENT); ··· 7052 6454 return req; 7053 6455 } 7054 6456 7055 - static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 6457 + static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) 7056 6458 __must_hold(&ctx->completion_lock) 7057 - __must_hold(&ctx->timeout_lock) 7058 6459 { 7059 - struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6460 + struct io_kiocb *req; 6461 + 6462 + spin_lock_irq(&ctx->timeout_lock); 6463 + req = io_timeout_extract(ctx, cd); 6464 + spin_unlock_irq(&ctx->timeout_lock); 7060 6465 7061 6466 if (IS_ERR(req)) 7062 6467 return PTR_ERR(req); ··· 7092 6491 bool found = false; 7093 6492 7094 6493 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 7095 - found = user_data == req->user_data; 6494 + found = user_data == req->cqe.user_data; 7096 6495 if (found) 7097 6496 break; 7098 6497 } ··· 7112 6511 struct timespec64 *ts, enum hrtimer_mode mode) 7113 6512 __must_hold(&ctx->timeout_lock) 7114 6513 { 7115 - struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6514 + struct io_cancel_data cd = { .data = user_data, }; 6515 + struct io_kiocb *req = io_timeout_extract(ctx, &cd); 7116 6516 struct io_timeout_data *data; 7117 6517 7118 6518 if (IS_ERR(req)) ··· 7178 6576 int ret; 7179 6577 7180 6578 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6579 + struct io_cancel_data cd = { .data = tr->addr, }; 6580 + 7181 6581 spin_lock(&ctx->completion_lock); 7182 - spin_lock_irq(&ctx->timeout_lock); 7183 - ret = io_timeout_cancel(ctx, tr->addr); 7184 - spin_unlock_irq(&ctx->timeout_lock); 6582 + ret = io_timeout_cancel(ctx, &cd); 7185 6583 spin_unlock(&ctx->completion_lock); 7186 6584 } else { 7187 6585 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); ··· 7309 6707 return 0; 7310 6708 } 7311 6709 7312 - struct io_cancel_data { 7313 - struct io_ring_ctx *ctx; 7314 - u64 user_data; 7315 - }; 7316 - 7317 6710 static bool io_cancel_cb(struct io_wq_work *work, void *data) 7318 6711 { 7319 6712 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 7320 6713 struct io_cancel_data *cd = data; 7321 6714 7322 - return req->ctx == cd->ctx && req->user_data == cd->user_data; 6715 + if (req->ctx != cd->ctx) 6716 + return false; 6717 + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { 6718 + ; 6719 + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { 6720 + if (req->file != cd->file) 6721 + return false; 6722 + } else { 6723 + if (req->cqe.user_data != cd->data) 6724 + return false; 6725 + } 6726 + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 6727 + if (cd->seq == req->work.cancel_seq) 6728 + return false; 6729 + req->work.cancel_seq = cd->seq; 6730 + } 6731 + return true; 7323 6732 } 7324 6733 7325 - static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 7326 - struct io_ring_ctx *ctx) 6734 + static int io_async_cancel_one(struct io_uring_task *tctx, 6735 + struct io_cancel_data *cd) 7327 6736 { 7328 - struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 7329 6737 enum io_wq_cancel cancel_ret; 7330 6738 int ret = 0; 6739 + bool all; 7331 6740 7332 6741 if (!tctx || !tctx->io_wq) 7333 6742 return -ENOENT; 7334 6743 7335 - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6744 + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); 6745 + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); 7336 6746 switch (cancel_ret) { 7337 6747 case IO_WQ_CANCEL_OK: 7338 6748 ret = 0; ··· 7360 6746 return ret; 7361 6747 } 7362 6748 7363 - static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6749 + static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) 7364 6750 { 7365 6751 struct io_ring_ctx *ctx = req->ctx; 7366 6752 int ret; 7367 6753 7368 6754 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 7369 6755 7370 - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6756 + ret = io_async_cancel_one(req->task->io_uring, cd); 7371 6757 /* 7372 6758 * Fall-through even for -EALREADY, as we may have poll armed 7373 6759 * that need unarming. ··· 7376 6762 return 0; 7377 6763 7378 6764 spin_lock(&ctx->completion_lock); 7379 - ret = io_poll_cancel(ctx, sqe_addr, false); 6765 + ret = io_poll_cancel(ctx, cd); 7380 6766 if (ret != -ENOENT) 7381 6767 goto out; 7382 - 7383 - spin_lock_irq(&ctx->timeout_lock); 7384 - ret = io_timeout_cancel(ctx, sqe_addr); 7385 - spin_unlock_irq(&ctx->timeout_lock); 6768 + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) 6769 + ret = io_timeout_cancel(ctx, cd); 7386 6770 out: 7387 6771 spin_unlock(&ctx->completion_lock); 7388 6772 return ret; 7389 6773 } 6774 + 6775 + #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ 6776 + IORING_ASYNC_CANCEL_ANY) 7390 6777 7391 6778 static int io_async_cancel_prep(struct io_kiocb *req, 7392 6779 const struct io_uring_sqe *sqe) 7393 6780 { 7394 6781 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 7395 6782 return -EINVAL; 7396 - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6783 + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) 7397 6784 return -EINVAL; 7398 - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 7399 - sqe->splice_fd_in) 6785 + if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) 7400 6786 return -EINVAL; 7401 6787 7402 6788 req->cancel.addr = READ_ONCE(sqe->addr); 6789 + req->cancel.flags = READ_ONCE(sqe->cancel_flags); 6790 + if (req->cancel.flags & ~CANCEL_FLAGS) 6791 + return -EINVAL; 6792 + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { 6793 + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) 6794 + return -EINVAL; 6795 + req->cancel.fd = READ_ONCE(sqe->fd); 6796 + } 6797 + 7403 6798 return 0; 7404 6799 } 7405 6800 7406 - static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6801 + static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, 6802 + unsigned int issue_flags) 7407 6803 { 7408 - struct io_ring_ctx *ctx = req->ctx; 7409 - u64 sqe_addr = req->cancel.addr; 7410 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 6804 + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); 6805 + struct io_ring_ctx *ctx = cd->ctx; 7411 6806 struct io_tctx_node *node; 7412 - int ret; 6807 + int ret, nr = 0; 7413 6808 7414 - ret = io_try_cancel_userdata(req, sqe_addr); 7415 - if (ret != -ENOENT) 7416 - goto done; 6809 + do { 6810 + ret = io_try_cancel(req, cd); 6811 + if (ret == -ENOENT) 6812 + break; 6813 + if (!all) 6814 + return ret; 6815 + nr++; 6816 + } while (1); 7417 6817 7418 6818 /* slow path, try all io-wq's */ 7419 - io_ring_submit_lock(ctx, needs_lock); 6819 + io_ring_submit_lock(ctx, issue_flags); 7420 6820 ret = -ENOENT; 7421 6821 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 7422 6822 struct io_uring_task *tctx = node->task->io_uring; 7423 6823 7424 - ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 7425 - if (ret != -ENOENT) 7426 - break; 6824 + ret = io_async_cancel_one(tctx, cd); 6825 + if (ret != -ENOENT) { 6826 + if (!all) 6827 + break; 6828 + nr++; 6829 + } 7427 6830 } 7428 - io_ring_submit_unlock(ctx, needs_lock); 6831 + io_ring_submit_unlock(ctx, issue_flags); 6832 + return all ? nr : ret; 6833 + } 6834 + 6835 + static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6836 + { 6837 + struct io_cancel_data cd = { 6838 + .ctx = req->ctx, 6839 + .data = req->cancel.addr, 6840 + .flags = req->cancel.flags, 6841 + .seq = atomic_inc_return(&req->ctx->cancel_seq), 6842 + }; 6843 + int ret; 6844 + 6845 + if (cd.flags & IORING_ASYNC_CANCEL_FD) { 6846 + if (req->flags & REQ_F_FIXED_FILE) 6847 + req->file = io_file_get_fixed(req, req->cancel.fd, 6848 + issue_flags); 6849 + else 6850 + req->file = io_file_get_normal(req, req->cancel.fd); 6851 + if (!req->file) { 6852 + ret = -EBADF; 6853 + goto done; 6854 + } 6855 + cd.file = req->file; 6856 + } 6857 + 6858 + ret = __io_async_cancel(&cd, req, issue_flags); 7429 6859 done: 7430 6860 if (ret < 0) 7431 6861 req_set_fail(req); ··· 7496 6838 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 7497 6839 { 7498 6840 struct io_ring_ctx *ctx = req->ctx; 7499 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 7500 6841 struct io_uring_rsrc_update2 up; 7501 6842 int ret; 7502 6843 ··· 7506 6849 up.resv = 0; 7507 6850 up.resv2 = 0; 7508 6851 7509 - io_ring_submit_lock(ctx, needs_lock); 6852 + io_ring_submit_lock(ctx, issue_flags); 7510 6853 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 7511 6854 &up, req->rsrc_update.nr_args); 7512 - io_ring_submit_unlock(ctx, needs_lock); 6855 + io_ring_submit_unlock(ctx, issue_flags); 7513 6856 7514 6857 if (ret < 0) 7515 6858 req_set_fail(req); ··· 7595 6938 return io_linkat_prep(req, sqe); 7596 6939 case IORING_OP_MSG_RING: 7597 6940 return io_msg_ring_prep(req, sqe); 6941 + case IORING_OP_FSETXATTR: 6942 + return io_fsetxattr_prep(req, sqe); 6943 + case IORING_OP_SETXATTR: 6944 + return io_setxattr_prep(req, sqe); 6945 + case IORING_OP_FGETXATTR: 6946 + return io_fgetxattr_prep(req, sqe); 6947 + case IORING_OP_GETXATTR: 6948 + return io_getxattr_prep(req, sqe); 6949 + case IORING_OP_SOCKET: 6950 + return io_socket_prep(req, sqe); 7598 6951 } 7599 6952 7600 6953 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 7641 6974 static u32 io_get_sequence(struct io_kiocb *req) 7642 6975 { 7643 6976 u32 seq = req->ctx->cached_sq_head; 6977 + struct io_kiocb *cur; 7644 6978 7645 6979 /* need original cached_sq_head, but it was increased for each req */ 7646 - io_for_each_link(req, req) 6980 + io_for_each_link(cur, req) 7647 6981 seq--; 7648 6982 return seq; 7649 6983 } ··· 7687 7019 goto queue; 7688 7020 } 7689 7021 7690 - trace_io_uring_defer(ctx, req, req->user_data, req->opcode); 7022 + trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); 7691 7023 de->req = req; 7692 7024 de->seq = seq; 7693 7025 list_add_tail(&de->list, &ctx->defer_list); ··· 7749 7081 if (req->statx.filename) 7750 7082 putname(req->statx.filename); 7751 7083 break; 7084 + case IORING_OP_SETXATTR: 7085 + case IORING_OP_FSETXATTR: 7086 + case IORING_OP_GETXATTR: 7087 + case IORING_OP_FGETXATTR: 7088 + __io_xattr_finish(req); 7089 + break; 7752 7090 } 7753 7091 } 7754 7092 if ((req->flags & REQ_F_POLLED) && req->apoll) { ··· 7777 7103 return true; 7778 7104 7779 7105 if (req->flags & REQ_F_FIXED_FILE) 7780 - req->file = io_file_get_fixed(req, req->fd, issue_flags); 7106 + req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); 7781 7107 else 7782 - req->file = io_file_get_normal(req, req->fd); 7783 - if (req->file) 7784 - return true; 7108 + req->file = io_file_get_normal(req, req->cqe.fd); 7785 7109 7786 - req_set_fail(req); 7787 - req->result = -EBADF; 7788 - return false; 7110 + return !!req->file; 7789 7111 } 7790 7112 7791 7113 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ··· 7911 7241 case IORING_OP_MSG_RING: 7912 7242 ret = io_msg_ring(req, issue_flags); 7913 7243 break; 7244 + case IORING_OP_FSETXATTR: 7245 + ret = io_fsetxattr(req, issue_flags); 7246 + break; 7247 + case IORING_OP_SETXATTR: 7248 + ret = io_setxattr(req, issue_flags); 7249 + break; 7250 + case IORING_OP_FGETXATTR: 7251 + ret = io_fgetxattr(req, issue_flags); 7252 + break; 7253 + case IORING_OP_GETXATTR: 7254 + ret = io_getxattr(req, issue_flags); 7255 + break; 7256 + case IORING_OP_SOCKET: 7257 + ret = io_socket(req, issue_flags); 7258 + break; 7914 7259 default: 7915 7260 ret = -EINVAL; 7916 7261 break; ··· 7959 7274 const struct io_op_def *def = &io_op_defs[req->opcode]; 7960 7275 unsigned int issue_flags = IO_URING_F_UNLOCKED; 7961 7276 bool needs_poll = false; 7962 - struct io_kiocb *timeout; 7963 7277 int ret = 0, err = -ECANCELED; 7964 7278 7965 7279 /* one will be dropped by ->io_free_work() after returning to io-wq */ ··· 7967 7283 else 7968 7284 req_ref_get(req); 7969 7285 7970 - timeout = io_prep_linked_timeout(req); 7971 - if (timeout) 7972 - io_queue_linked_timeout(timeout); 7973 - 7286 + io_arm_ltimeout(req); 7974 7287 7975 7288 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 7976 7289 if (work->flags & IO_WQ_WORK_CANCEL) { ··· 8045 7364 struct file *file = NULL; 8046 7365 unsigned long file_ptr; 8047 7366 8048 - if (issue_flags & IO_URING_F_UNLOCKED) 8049 - mutex_lock(&ctx->uring_lock); 7367 + io_ring_submit_lock(ctx, issue_flags); 8050 7368 8051 7369 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 8052 7370 goto out; ··· 8057 7377 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); 8058 7378 io_req_set_rsrc_node(req, ctx, 0); 8059 7379 out: 8060 - if (issue_flags & IO_URING_F_UNLOCKED) 8061 - mutex_unlock(&ctx->uring_lock); 7380 + io_ring_submit_unlock(ctx, issue_flags); 8062 7381 return file; 8063 7382 } 8064 7383 ··· 8078 7399 { 8079 7400 struct file *file = fget(fd); 8080 7401 8081 - trace_io_uring_file_get(req->ctx, req, req->user_data, fd); 7402 + trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); 8082 7403 8083 7404 /* we don't allow fixed io_uring files */ 8084 7405 if (file && file->f_op == &io_uring_fops) ··· 8092 7413 int ret = -ENOENT; 8093 7414 8094 7415 if (prev) { 8095 - if (!(req->task->flags & PF_EXITING)) 8096 - ret = io_try_cancel_userdata(req, prev->user_data); 7416 + if (!(req->task->flags & PF_EXITING)) { 7417 + struct io_cancel_data cd = { 7418 + .ctx = req->ctx, 7419 + .data = prev->cqe.user_data, 7420 + }; 7421 + 7422 + ret = io_try_cancel(req, &cd); 7423 + } 8097 7424 io_req_complete_post(req, ret ?: -ETIME, 0); 8098 7425 io_put_req(prev); 8099 7426 } else { ··· 8159 7474 io_put_req(req); 8160 7475 } 8161 7476 8162 - static void io_queue_sqe_arm_apoll(struct io_kiocb *req) 7477 + static void io_queue_async(struct io_kiocb *req, int ret) 8163 7478 __must_hold(&req->ctx->uring_lock) 8164 7479 { 8165 - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); 7480 + struct io_kiocb *linked_timeout; 7481 + 7482 + if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { 7483 + io_req_complete_failed(req, ret); 7484 + return; 7485 + } 7486 + 7487 + linked_timeout = io_prep_linked_timeout(req); 8166 7488 8167 7489 switch (io_arm_poll_handler(req, 0)) { 8168 7490 case IO_APOLL_READY: ··· 8180 7488 * Queued up for async execution, worker will release 8181 7489 * submit reference when the iocb is actually submitted. 8182 7490 */ 8183 - io_queue_async_work(req, NULL); 7491 + io_queue_iowq(req, NULL); 8184 7492 break; 8185 7493 case IO_APOLL_OK: 8186 7494 break; ··· 8190 7498 io_queue_linked_timeout(linked_timeout); 8191 7499 } 8192 7500 8193 - static inline void __io_queue_sqe(struct io_kiocb *req) 7501 + static inline void io_queue_sqe(struct io_kiocb *req) 8194 7502 __must_hold(&req->ctx->uring_lock) 8195 7503 { 8196 - struct io_kiocb *linked_timeout; 8197 7504 int ret; 8198 7505 8199 7506 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); ··· 8205 7514 * We async punt it if the file wasn't marked NOWAIT, or if the file 8206 7515 * doesn't support non-blocking read/write attempts 8207 7516 */ 8208 - if (likely(!ret)) { 8209 - linked_timeout = io_prep_linked_timeout(req); 8210 - if (linked_timeout) 8211 - io_queue_linked_timeout(linked_timeout); 8212 - } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 8213 - io_queue_sqe_arm_apoll(req); 8214 - } else { 8215 - io_req_complete_failed(req, ret); 8216 - } 7517 + if (likely(!ret)) 7518 + io_arm_ltimeout(req); 7519 + else 7520 + io_queue_async(req, ret); 8217 7521 } 8218 7522 8219 7523 static void io_queue_sqe_fallback(struct io_kiocb *req) 8220 7524 __must_hold(&req->ctx->uring_lock) 8221 7525 { 8222 - if (req->flags & REQ_F_FAIL) { 8223 - io_req_complete_fail_submit(req); 7526 + if (unlikely(req->flags & REQ_F_FAIL)) { 7527 + /* 7528 + * We don't submit, fail them all, for that replace hardlinks 7529 + * with normal links. Extra REQ_F_LINK is tolerated. 7530 + */ 7531 + req->flags &= ~REQ_F_HARDLINK; 7532 + req->flags |= REQ_F_LINK; 7533 + io_req_complete_failed(req, req->cqe.res); 8224 7534 } else if (unlikely(req->ctx->drain_active)) { 8225 7535 io_drain_req(req); 8226 7536 } else { ··· 8230 7538 if (unlikely(ret)) 8231 7539 io_req_complete_failed(req, ret); 8232 7540 else 8233 - io_queue_async_work(req, NULL); 7541 + io_queue_iowq(req, NULL); 8234 7542 } 8235 - } 8236 - 8237 - static inline void io_queue_sqe(struct io_kiocb *req) 8238 - __must_hold(&req->ctx->uring_lock) 8239 - { 8240 - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) 8241 - __io_queue_sqe(req); 8242 - else 8243 - io_queue_sqe_fallback(req); 8244 7543 } 8245 7544 8246 7545 /* ··· 8288 7605 req->opcode = opcode = READ_ONCE(sqe->opcode); 8289 7606 /* same numerical values with corresponding REQ_F_*, safe to copy */ 8290 7607 req->flags = sqe_flags = READ_ONCE(sqe->flags); 8291 - req->user_data = READ_ONCE(sqe->user_data); 7608 + req->cqe.user_data = READ_ONCE(sqe->user_data); 8292 7609 req->file = NULL; 8293 - req->fixed_rsrc_refs = NULL; 7610 + req->rsrc_node = NULL; 8294 7611 req->task = current; 8295 7612 8296 7613 if (unlikely(opcode >= IORING_OP_LAST)) { ··· 8329 7646 if (io_op_defs[opcode].needs_file) { 8330 7647 struct io_submit_state *state = &ctx->submit_state; 8331 7648 8332 - req->fd = READ_ONCE(sqe->fd); 7649 + req->cqe.fd = READ_ONCE(sqe->fd); 8333 7650 8334 7651 /* 8335 7652 * Plug now if we have more than 2 IO left after this, and the ··· 8361 7678 return io_req_prep(req, sqe); 8362 7679 } 8363 7680 8364 - static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7681 + static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, 7682 + struct io_kiocb *req, int ret) 7683 + { 7684 + struct io_ring_ctx *ctx = req->ctx; 7685 + struct io_submit_link *link = &ctx->submit_state.link; 7686 + struct io_kiocb *head = link->head; 7687 + 7688 + trace_io_uring_req_failed(sqe, ctx, req, ret); 7689 + 7690 + /* 7691 + * Avoid breaking links in the middle as it renders links with SQPOLL 7692 + * unusable. Instead of failing eagerly, continue assembling the link if 7693 + * applicable and mark the head with REQ_F_FAIL. The link flushing code 7694 + * should find the flag and handle the rest. 7695 + */ 7696 + req_fail_link_node(req, ret); 7697 + if (head && !(head->flags & REQ_F_FAIL)) 7698 + req_fail_link_node(head, -ECANCELED); 7699 + 7700 + if (!(req->flags & IO_REQ_LINK_FLAGS)) { 7701 + if (head) { 7702 + link->last->link = req; 7703 + link->head = NULL; 7704 + req = head; 7705 + } 7706 + io_queue_sqe_fallback(req); 7707 + return ret; 7708 + } 7709 + 7710 + if (head) 7711 + link->last->link = req; 7712 + else 7713 + link->head = req; 7714 + link->last = req; 7715 + return 0; 7716 + } 7717 + 7718 + static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 8365 7719 const struct io_uring_sqe *sqe) 8366 7720 __must_hold(&ctx->uring_lock) 8367 7721 { ··· 8406 7686 int ret; 8407 7687 8408 7688 ret = io_init_req(ctx, req, sqe); 8409 - if (unlikely(ret)) { 8410 - trace_io_uring_req_failed(sqe, ctx, req, ret); 8411 - 8412 - /* fail even hard links since we don't submit */ 8413 - if (link->head) { 8414 - /* 8415 - * we can judge a link req is failed or cancelled by if 8416 - * REQ_F_FAIL is set, but the head is an exception since 8417 - * it may be set REQ_F_FAIL because of other req's failure 8418 - * so let's leverage req->result to distinguish if a head 8419 - * is set REQ_F_FAIL because of its failure or other req's 8420 - * failure so that we can set the correct ret code for it. 8421 - * init result here to avoid affecting the normal path. 8422 - */ 8423 - if (!(link->head->flags & REQ_F_FAIL)) 8424 - req_fail_link_node(link->head, -ECANCELED); 8425 - } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 8426 - /* 8427 - * the current req is a normal req, we should return 8428 - * error and thus break the submittion loop. 8429 - */ 8430 - io_req_complete_failed(req, ret); 8431 - return ret; 8432 - } 8433 - req_fail_link_node(req, ret); 8434 - } 7689 + if (unlikely(ret)) 7690 + return io_submit_fail_init(sqe, req, ret); 8435 7691 8436 7692 /* don't need @sqe from now on */ 8437 - trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, 7693 + trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, 8438 7694 req->flags, true, 8439 7695 ctx->flags & IORING_SETUP_SQPOLL); 8440 7696 ··· 8421 7725 * submitted sync once the chain is complete. If none of those 8422 7726 * conditions are true (normal request), then just queue it. 8423 7727 */ 8424 - if (link->head) { 8425 - struct io_kiocb *head = link->head; 7728 + if (unlikely(link->head)) { 7729 + ret = io_req_prep_async(req); 7730 + if (unlikely(ret)) 7731 + return io_submit_fail_init(sqe, req, ret); 8426 7732 8427 - if (!(req->flags & REQ_F_FAIL)) { 8428 - ret = io_req_prep_async(req); 8429 - if (unlikely(ret)) { 8430 - req_fail_link_node(req, ret); 8431 - if (!(head->flags & REQ_F_FAIL)) 8432 - req_fail_link_node(head, -ECANCELED); 8433 - } 8434 - } 8435 - trace_io_uring_link(ctx, req, head); 7733 + trace_io_uring_link(ctx, req, link->head); 8436 7734 link->last->link = req; 8437 7735 link->last = req; 8438 7736 8439 - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 7737 + if (req->flags & IO_REQ_LINK_FLAGS) 8440 7738 return 0; 8441 - /* last request of a link, enqueue the link */ 7739 + /* last request of the link, flush it */ 7740 + req = link->head; 8442 7741 link->head = NULL; 8443 - req = head; 8444 - } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 8445 - link->head = req; 8446 - link->last = req; 7742 + if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) 7743 + goto fallback; 7744 + 7745 + } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | 7746 + REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { 7747 + if (req->flags & IO_REQ_LINK_FLAGS) { 7748 + link->head = req; 7749 + link->last = req; 7750 + } else { 7751 + fallback: 7752 + io_queue_sqe_fallback(req); 7753 + } 8447 7754 return 0; 8448 7755 } 8449 7756 ··· 8461 7762 { 8462 7763 struct io_submit_state *state = &ctx->submit_state; 8463 7764 8464 - if (state->link.head) 8465 - io_queue_sqe(state->link.head); 7765 + if (unlikely(state->link.head)) 7766 + io_queue_sqe_fallback(state->link.head); 8466 7767 /* flush only after queuing links as they can generate completions */ 8467 7768 io_submit_flush_completions(ctx); 8468 7769 if (state->plug_started) ··· 8530 7831 __must_hold(&ctx->uring_lock) 8531 7832 { 8532 7833 unsigned int entries = io_sqring_entries(ctx); 8533 - int submitted = 0; 7834 + unsigned int left; 7835 + int ret; 8534 7836 8535 7837 if (unlikely(!entries)) 8536 7838 return 0; 8537 7839 /* make sure SQ entry isn't read before tail */ 8538 - nr = min3(nr, ctx->sq_entries, entries); 8539 - io_get_task_refs(nr); 7840 + ret = left = min3(nr, ctx->sq_entries, entries); 7841 + io_get_task_refs(left); 7842 + io_submit_state_start(&ctx->submit_state, left); 8540 7843 8541 - io_submit_state_start(&ctx->submit_state, nr); 8542 7844 do { 8543 7845 const struct io_uring_sqe *sqe; 8544 7846 struct io_kiocb *req; 8545 7847 8546 - if (unlikely(!io_alloc_req_refill(ctx))) { 8547 - if (!submitted) 8548 - submitted = -EAGAIN; 7848 + if (unlikely(!io_alloc_req_refill(ctx))) 8549 7849 break; 8550 - } 8551 7850 req = io_alloc_req(ctx); 8552 7851 sqe = io_get_sqe(ctx); 8553 7852 if (unlikely(!sqe)) { 8554 - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 7853 + io_req_add_to_cache(req, ctx); 8555 7854 break; 8556 7855 } 8557 - /* will complete beyond this point, count as submitted */ 8558 - submitted++; 8559 - if (io_submit_sqe(ctx, req, sqe)) { 8560 - /* 8561 - * Continue submitting even for sqe failure if the 8562 - * ring was setup with IORING_SETUP_SUBMIT_ALL 8563 - */ 8564 - if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) 8565 - break; 7856 + 7857 + /* 7858 + * Continue submitting even for sqe failure if the 7859 + * ring was setup with IORING_SETUP_SUBMIT_ALL 7860 + */ 7861 + if (unlikely(io_submit_sqe(ctx, req, sqe)) && 7862 + !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { 7863 + left--; 7864 + break; 8566 7865 } 8567 - } while (submitted < nr); 7866 + } while (--left); 8568 7867 8569 - if (unlikely(submitted != nr)) { 8570 - int ref_used = (submitted == -EAGAIN) ? 0 : submitted; 8571 - int unused = nr - ref_used; 8572 - 8573 - current->io_uring->cached_refs += unused; 7868 + if (unlikely(left)) { 7869 + ret -= left; 7870 + /* try again if it submitted nothing and can't allocate a req */ 7871 + if (!ret && io_req_cache_empty(ctx)) 7872 + ret = -EAGAIN; 7873 + current->io_uring->cached_refs += left; 8574 7874 } 8575 7875 8576 7876 io_submit_state_end(ctx); 8577 7877 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 8578 7878 io_commit_sqring(ctx); 8579 - 8580 - return submitted; 7879 + return ret; 8581 7880 } 8582 7881 8583 7882 static inline bool io_sqd_events_pending(struct io_sq_data *sqd) ··· 8791 8094 * Cannot safely flush overflowed CQEs from here, ensure we wake up 8792 8095 * the task, and the next invocation will do it. 8793 8096 */ 8794 - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 8097 + if (io_should_wake(iowq) || 8098 + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) 8795 8099 return autoremove_wake_function(curr, mode, wake_flags, key); 8796 8100 return -1; 8797 8101 } ··· 8814 8116 ktime_t timeout) 8815 8117 { 8816 8118 int ret; 8119 + unsigned long check_cq; 8817 8120 8818 8121 /* make sure we run task_work before checking for signals */ 8819 8122 ret = io_run_task_work_sig(); 8820 8123 if (ret || io_should_wake(iowq)) 8821 8124 return ret; 8125 + check_cq = READ_ONCE(ctx->check_cq); 8822 8126 /* let the caller flush overflows, retry */ 8823 - if (test_bit(0, &ctx->check_cq_overflow)) 8127 + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 8824 8128 return 1; 8825 - 8129 + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) 8130 + return -EBADR; 8826 8131 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) 8827 8132 return -ETIME; 8828 8133 return 1; ··· 8890 8189 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 8891 8190 TASK_INTERRUPTIBLE); 8892 8191 ret = io_cqring_wait_schedule(ctx, &iowq, timeout); 8893 - finish_wait(&ctx->cq_wait, &iowq.wq); 8894 8192 cond_resched(); 8895 8193 } while (ret > 0); 8896 8194 8195 + finish_wait(&ctx->cq_wait, &iowq.wq); 8897 8196 restore_saved_sigmask_unless(ret == -EINTR); 8898 8197 8899 8198 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; ··· 9142 8441 9143 8442 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 9144 8443 { 8444 + int i; 8445 + 8446 + #if !defined(IO_URING_SCM_ALL) 8447 + for (i = 0; i < ctx->nr_user_files; i++) { 8448 + struct file *file = io_file_from_index(ctx, i); 8449 + 8450 + if (!file) 8451 + continue; 8452 + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) 8453 + continue; 8454 + fput(file); 8455 + } 8456 + #endif 8457 + 9145 8458 #if defined(CONFIG_UNIX) 9146 8459 if (ctx->ring_sock) { 9147 8460 struct sock *sock = ctx->ring_sock->sk; ··· 9163 8448 9164 8449 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 9165 8450 kfree_skb(skb); 9166 - } 9167 - #else 9168 - int i; 9169 - 9170 - for (i = 0; i < ctx->nr_user_files; i++) { 9171 - struct file *file; 9172 - 9173 - file = io_file_from_index(ctx, i); 9174 - if (file) 9175 - fput(file); 9176 8451 } 9177 8452 #endif 9178 8453 io_free_file_tables(&ctx->file_table); ··· 9308 8603 return sqd; 9309 8604 } 9310 8605 9311 - #if defined(CONFIG_UNIX) 9312 8606 /* 9313 8607 * Ensure the UNIX gc is aware of our file set, so we are certain that 9314 8608 * the io_uring can be safely unregistered on process exit, even if we have 9315 - * loops in the file referencing. 8609 + * loops in the file referencing. We account only files that can hold other 8610 + * files because otherwise they can't form a loop and so are not interesting 8611 + * for GC. 9316 8612 */ 9317 - static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 8613 + static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) 9318 8614 { 8615 + #if defined(CONFIG_UNIX) 9319 8616 struct sock *sk = ctx->ring_sock->sk; 8617 + struct sk_buff_head *head = &sk->sk_receive_queue; 9320 8618 struct scm_fp_list *fpl; 9321 8619 struct sk_buff *skb; 9322 - int i, nr_files; 9323 8620 9324 - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 9325 - if (!fpl) 9326 - return -ENOMEM; 9327 - 9328 - skb = alloc_skb(0, GFP_KERNEL); 9329 - if (!skb) { 9330 - kfree(fpl); 9331 - return -ENOMEM; 9332 - } 9333 - 9334 - skb->sk = sk; 9335 - 9336 - nr_files = 0; 9337 - fpl->user = get_uid(current_user()); 9338 - for (i = 0; i < nr; i++) { 9339 - struct file *file = io_file_from_index(ctx, i + offset); 9340 - 9341 - if (!file) 9342 - continue; 9343 - fpl->fp[nr_files] = get_file(file); 9344 - unix_inflight(fpl->user, fpl->fp[nr_files]); 9345 - nr_files++; 9346 - } 9347 - 9348 - if (nr_files) { 9349 - fpl->max = SCM_MAX_FD; 9350 - fpl->count = nr_files; 9351 - UNIXCB(skb).fp = fpl; 9352 - skb->destructor = unix_destruct_scm; 9353 - refcount_add(skb->truesize, &sk->sk_wmem_alloc); 9354 - skb_queue_head(&sk->sk_receive_queue, skb); 9355 - 9356 - for (i = 0; i < nr; i++) { 9357 - struct file *file = io_file_from_index(ctx, i + offset); 9358 - 9359 - if (file) 9360 - fput(file); 9361 - } 9362 - } else { 9363 - kfree_skb(skb); 9364 - free_uid(fpl->user); 9365 - kfree(fpl); 9366 - } 9367 - 9368 - return 0; 9369 - } 9370 - 9371 - /* 9372 - * If UNIX sockets are enabled, fd passing can cause a reference cycle which 9373 - * causes regular reference counting to break down. We rely on the UNIX 9374 - * garbage collection to take care of this problem for us. 9375 - */ 9376 - static int io_sqe_files_scm(struct io_ring_ctx *ctx) 9377 - { 9378 - unsigned left, total; 9379 - int ret = 0; 9380 - 9381 - total = 0; 9382 - left = ctx->nr_user_files; 9383 - while (left) { 9384 - unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 9385 - 9386 - ret = __io_sqe_files_scm(ctx, this_files, total); 9387 - if (ret) 9388 - break; 9389 - left -= this_files; 9390 - total += this_files; 9391 - } 9392 - 9393 - if (!ret) 8621 + if (likely(!io_file_need_scm(file))) 9394 8622 return 0; 9395 8623 9396 - while (total < ctx->nr_user_files) { 9397 - struct file *file = io_file_from_index(ctx, total); 8624 + /* 8625 + * See if we can merge this file into an existing skb SCM_RIGHTS 8626 + * file set. If there's no room, fall back to allocating a new skb 8627 + * and filling it in. 8628 + */ 8629 + spin_lock_irq(&head->lock); 8630 + skb = skb_peek(head); 8631 + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) 8632 + __skb_unlink(skb, head); 8633 + else 8634 + skb = NULL; 8635 + spin_unlock_irq(&head->lock); 9398 8636 9399 - if (file) 9400 - fput(file); 9401 - total++; 8637 + if (!skb) { 8638 + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8639 + if (!fpl) 8640 + return -ENOMEM; 8641 + 8642 + skb = alloc_skb(0, GFP_KERNEL); 8643 + if (!skb) { 8644 + kfree(fpl); 8645 + return -ENOMEM; 8646 + } 8647 + 8648 + fpl->user = get_uid(current_user()); 8649 + fpl->max = SCM_MAX_FD; 8650 + fpl->count = 0; 8651 + 8652 + UNIXCB(skb).fp = fpl; 8653 + skb->sk = sk; 8654 + skb->destructor = unix_destruct_scm; 8655 + refcount_add(skb->truesize, &sk->sk_wmem_alloc); 9402 8656 } 9403 8657 9404 - return ret; 9405 - } 9406 - #else 9407 - static int io_sqe_files_scm(struct io_ring_ctx *ctx) 9408 - { 8658 + fpl = UNIXCB(skb).fp; 8659 + fpl->fp[fpl->count++] = get_file(file); 8660 + unix_inflight(fpl->user, file); 8661 + skb_queue_head(head, skb); 8662 + fput(file); 8663 + #endif 9409 8664 return 0; 9410 8665 } 9411 - #endif 9412 8666 9413 8667 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 9414 8668 { ··· 9377 8713 struct sk_buff_head list, *head = &sock->sk_receive_queue; 9378 8714 struct sk_buff *skb; 9379 8715 int i; 8716 + 8717 + if (!io_file_need_scm(file)) { 8718 + fput(file); 8719 + return; 8720 + } 9380 8721 9381 8722 __skb_queue_head_init(&list); 9382 8723 ··· 9447 8778 list_del(&prsrc->list); 9448 8779 9449 8780 if (prsrc->tag) { 9450 - bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8781 + if (ctx->flags & IORING_SETUP_IOPOLL) 8782 + mutex_lock(&ctx->uring_lock); 9451 8783 9452 - io_ring_submit_lock(ctx, lock_ring); 9453 8784 spin_lock(&ctx->completion_lock); 9454 8785 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); 9455 8786 io_commit_cqring(ctx); 9456 8787 spin_unlock(&ctx->completion_lock); 9457 8788 io_cqring_ev_posted(ctx); 9458 - io_ring_submit_unlock(ctx, lock_ring); 8789 + 8790 + if (ctx->flags & IORING_SETUP_IOPOLL) 8791 + mutex_unlock(&ctx->uring_lock); 9459 8792 } 9460 8793 9461 8794 rsrc_data->do_put(ctx, prsrc); ··· 9511 8840 if (ret) 9512 8841 return ret; 9513 8842 9514 - ret = -ENOMEM; 9515 - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 9516 - goto out_free; 8843 + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { 8844 + io_rsrc_data_free(ctx->file_data); 8845 + ctx->file_data = NULL; 8846 + return -ENOMEM; 8847 + } 9517 8848 9518 8849 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8850 + struct io_fixed_file *file_slot; 8851 + 9519 8852 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 9520 8853 ret = -EFAULT; 9521 - goto out_fput; 8854 + goto fail; 9522 8855 } 9523 8856 /* allow sparse sets */ 9524 8857 if (fd == -1) { 9525 8858 ret = -EINVAL; 9526 8859 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 9527 - goto out_fput; 8860 + goto fail; 9528 8861 continue; 9529 8862 } 9530 8863 9531 8864 file = fget(fd); 9532 8865 ret = -EBADF; 9533 8866 if (unlikely(!file)) 9534 - goto out_fput; 8867 + goto fail; 9535 8868 9536 8869 /* 9537 8870 * Don't allow io_uring instances to be registered. If UNIX ··· 9546 8871 */ 9547 8872 if (file->f_op == &io_uring_fops) { 9548 8873 fput(file); 9549 - goto out_fput; 8874 + goto fail; 9550 8875 } 9551 - io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 9552 - } 9553 - 9554 - ret = io_sqe_files_scm(ctx); 9555 - if (ret) { 9556 - __io_sqe_files_unregister(ctx); 9557 - return ret; 8876 + ret = io_scm_file_account(ctx, file); 8877 + if (ret) { 8878 + fput(file); 8879 + goto fail; 8880 + } 8881 + file_slot = io_fixed_file_slot(&ctx->file_table, i); 8882 + io_fixed_file_set(file_slot, file); 9558 8883 } 9559 8884 9560 8885 io_rsrc_node_switch(ctx, NULL); 9561 - return ret; 9562 - out_fput: 9563 - for (i = 0; i < ctx->nr_user_files; i++) { 9564 - file = io_file_from_index(ctx, i); 9565 - if (file) 9566 - fput(file); 9567 - } 9568 - io_free_file_tables(&ctx->file_table); 9569 - ctx->nr_user_files = 0; 9570 - out_free: 9571 - io_rsrc_data_free(ctx->file_data); 9572 - ctx->file_data = NULL; 9573 - return ret; 9574 - } 9575 - 9576 - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 9577 - int index) 9578 - { 9579 - #if defined(CONFIG_UNIX) 9580 - struct sock *sock = ctx->ring_sock->sk; 9581 - struct sk_buff_head *head = &sock->sk_receive_queue; 9582 - struct sk_buff *skb; 9583 - 9584 - /* 9585 - * See if we can merge this file into an existing skb SCM_RIGHTS 9586 - * file set. If there's no room, fall back to allocating a new skb 9587 - * and filling it in. 9588 - */ 9589 - spin_lock_irq(&head->lock); 9590 - skb = skb_peek(head); 9591 - if (skb) { 9592 - struct scm_fp_list *fpl = UNIXCB(skb).fp; 9593 - 9594 - if (fpl->count < SCM_MAX_FD) { 9595 - __skb_unlink(skb, head); 9596 - spin_unlock_irq(&head->lock); 9597 - fpl->fp[fpl->count] = get_file(file); 9598 - unix_inflight(fpl->user, fpl->fp[fpl->count]); 9599 - fpl->count++; 9600 - spin_lock_irq(&head->lock); 9601 - __skb_queue_head(head, skb); 9602 - } else { 9603 - skb = NULL; 9604 - } 9605 - } 9606 - spin_unlock_irq(&head->lock); 9607 - 9608 - if (skb) { 9609 - fput(file); 9610 - return 0; 9611 - } 9612 - 9613 - return __io_sqe_files_scm(ctx, 1, index); 9614 - #else 9615 8886 return 0; 9616 - #endif 8887 + fail: 8888 + __io_sqe_files_unregister(ctx); 8889 + return ret; 9617 8890 } 9618 8891 9619 8892 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, ··· 9585 8962 unsigned int issue_flags, u32 slot_index) 9586 8963 { 9587 8964 struct io_ring_ctx *ctx = req->ctx; 9588 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 9589 8965 bool needs_switch = false; 9590 8966 struct io_fixed_file *file_slot; 9591 8967 int ret = -EBADF; 9592 8968 9593 - io_ring_submit_lock(ctx, needs_lock); 8969 + io_ring_submit_lock(ctx, issue_flags); 9594 8970 if (file->f_op == &io_uring_fops) 9595 8971 goto err; 9596 8972 ret = -ENXIO; ··· 9618 8996 needs_switch = true; 9619 8997 } 9620 8998 9621 - *io_get_tag_slot(ctx->file_data, slot_index) = 0; 9622 - io_fixed_file_set(file_slot, file); 9623 - ret = io_sqe_file_register(ctx, file, slot_index); 9624 - if (ret) { 9625 - file_slot->file_ptr = 0; 9626 - goto err; 8999 + ret = io_scm_file_account(ctx, file); 9000 + if (!ret) { 9001 + *io_get_tag_slot(ctx->file_data, slot_index) = 0; 9002 + io_fixed_file_set(file_slot, file); 9627 9003 } 9628 - 9629 - ret = 0; 9630 9004 err: 9631 9005 if (needs_switch) 9632 9006 io_rsrc_node_switch(ctx, ctx->file_data); 9633 - io_ring_submit_unlock(ctx, needs_lock); 9007 + io_ring_submit_unlock(ctx, issue_flags); 9634 9008 if (ret) 9635 9009 fput(file); 9636 9010 return ret; ··· 9636 9018 { 9637 9019 unsigned int offset = req->close.file_slot - 1; 9638 9020 struct io_ring_ctx *ctx = req->ctx; 9639 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 9640 9021 struct io_fixed_file *file_slot; 9641 9022 struct file *file; 9642 9023 int ret; 9643 9024 9644 - io_ring_submit_lock(ctx, needs_lock); 9025 + io_ring_submit_lock(ctx, issue_flags); 9645 9026 ret = -ENXIO; 9646 9027 if (unlikely(!ctx->file_data)) 9647 9028 goto out; ··· 9666 9049 io_rsrc_node_switch(ctx, ctx->file_data); 9667 9050 ret = 0; 9668 9051 out: 9669 - io_ring_submit_unlock(ctx, needs_lock); 9052 + io_ring_submit_unlock(ctx, issue_flags); 9670 9053 return ret; 9671 9054 } 9672 9055 ··· 9733 9116 err = -EBADF; 9734 9117 break; 9735 9118 } 9736 - *io_get_tag_slot(data, i) = tag; 9737 - io_fixed_file_set(file_slot, file); 9738 - err = io_sqe_file_register(ctx, file, i); 9119 + err = io_scm_file_account(ctx, file); 9739 9120 if (err) { 9740 - file_slot->file_ptr = 0; 9741 9121 fput(file); 9742 9122 break; 9743 9123 } 9124 + *io_get_tag_slot(data, i) = tag; 9125 + io_fixed_file_set(file_slot, file); 9744 9126 } 9745 9127 } 9746 9128 ··· 10498 9882 mutex_lock(&ctx->uring_lock); 10499 9883 io_flush_cached_locked_reqs(ctx, state); 10500 9884 10501 - while (state->free_list.next) { 9885 + while (!io_req_cache_empty(ctx)) { 10502 9886 struct io_wq_work_node *node; 10503 9887 struct io_kiocb *req; 10504 9888 ··· 10618 10002 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 10619 10003 * pushs them to do the flush. 10620 10004 */ 10621 - if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 10005 + if (io_cqring_events(ctx) || 10006 + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 10622 10007 mask |= EPOLLIN | EPOLLRDNORM; 10623 10008 10624 10009 return mask; ··· 10751 10134 } 10752 10135 } 10753 10136 spin_unlock_irq(&ctx->timeout_lock); 10754 - if (canceled != 0) 10755 - io_commit_cqring(ctx); 10137 + io_commit_cqring(ctx); 10756 10138 spin_unlock(&ctx->completion_lock); 10757 10139 if (canceled != 0) 10758 10140 io_cqring_ev_posted(ctx); ··· 10771 10155 io_unregister_personality(ctx, index); 10772 10156 mutex_unlock(&ctx->uring_lock); 10773 10157 10774 - io_kill_timeouts(ctx, NULL, true); 10775 - io_poll_remove_all(ctx, NULL, true); 10776 - 10777 - /* if we failed setting up the ctx, we might not have any rings */ 10778 - io_iopoll_try_reap_events(ctx); 10158 + /* failed during ring init, it couldn't have issued any requests */ 10159 + if (ctx->rings) { 10160 + io_kill_timeouts(ctx, NULL, true); 10161 + io_poll_remove_all(ctx, NULL, true); 10162 + /* if we failed setting up the ctx, we might not have any rings */ 10163 + io_iopoll_try_reap_events(ctx); 10164 + } 10779 10165 10780 10166 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 10781 10167 /* ··· 10868 10250 { 10869 10251 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 10870 10252 struct io_uring_task *tctx = task ? task->io_uring : NULL; 10253 + 10254 + /* failed during ring init, it couldn't have issued any requests */ 10255 + if (!ctx->rings) 10256 + return; 10871 10257 10872 10258 while (1) { 10873 10259 enum io_wq_cancel cret; ··· 11318 10696 return 0; 11319 10697 } 11320 10698 10699 + static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) 10700 + { 10701 + if (flags & IORING_ENTER_EXT_ARG) { 10702 + struct io_uring_getevents_arg arg; 10703 + 10704 + if (argsz != sizeof(arg)) 10705 + return -EINVAL; 10706 + if (copy_from_user(&arg, argp, sizeof(arg))) 10707 + return -EFAULT; 10708 + } 10709 + return 0; 10710 + } 10711 + 11321 10712 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 11322 10713 struct __kernel_timespec __user **ts, 11323 10714 const sigset_t __user **sig) ··· 11368 10733 size_t, argsz) 11369 10734 { 11370 10735 struct io_ring_ctx *ctx; 11371 - int submitted = 0; 11372 10736 struct fd f; 11373 10737 long ret; 11374 10738 ··· 11430 10796 if (ret) 11431 10797 goto out; 11432 10798 } 11433 - submitted = to_submit; 10799 + ret = to_submit; 11434 10800 } else if (to_submit) { 11435 10801 ret = io_uring_add_tctx_node(ctx); 11436 10802 if (unlikely(ret)) 11437 10803 goto out; 11438 - mutex_lock(&ctx->uring_lock); 11439 - submitted = io_submit_sqes(ctx, to_submit); 11440 - mutex_unlock(&ctx->uring_lock); 11441 10804 11442 - if (submitted != to_submit) 10805 + mutex_lock(&ctx->uring_lock); 10806 + ret = io_submit_sqes(ctx, to_submit); 10807 + if (ret != to_submit) { 10808 + mutex_unlock(&ctx->uring_lock); 11443 10809 goto out; 10810 + } 10811 + if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) 10812 + goto iopoll_locked; 10813 + mutex_unlock(&ctx->uring_lock); 11444 10814 } 11445 10815 if (flags & IORING_ENTER_GETEVENTS) { 11446 - const sigset_t __user *sig; 11447 - struct __kernel_timespec __user *ts; 11448 - 11449 - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 11450 - if (unlikely(ret)) 11451 - goto out; 11452 - 11453 - min_complete = min(min_complete, ctx->cq_entries); 11454 - 11455 - /* 11456 - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 11457 - * space applications don't need to do io completion events 11458 - * polling again, they can rely on io_sq_thread to do polling 11459 - * work, which can reduce cpu usage and uring_lock contention. 11460 - */ 11461 - if (ctx->flags & IORING_SETUP_IOPOLL && 11462 - !(ctx->flags & IORING_SETUP_SQPOLL)) { 11463 - ret = io_iopoll_check(ctx, min_complete); 10816 + int ret2; 10817 + if (ctx->syscall_iopoll) { 10818 + /* 10819 + * We disallow the app entering submit/complete with 10820 + * polling, but we still need to lock the ring to 10821 + * prevent racing with polled issue that got punted to 10822 + * a workqueue. 10823 + */ 10824 + mutex_lock(&ctx->uring_lock); 10825 + iopoll_locked: 10826 + ret2 = io_validate_ext_arg(flags, argp, argsz); 10827 + if (likely(!ret2)) { 10828 + min_complete = min(min_complete, 10829 + ctx->cq_entries); 10830 + ret2 = io_iopoll_check(ctx, min_complete); 10831 + } 10832 + mutex_unlock(&ctx->uring_lock); 11464 10833 } else { 11465 - ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 10834 + const sigset_t __user *sig; 10835 + struct __kernel_timespec __user *ts; 10836 + 10837 + ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 10838 + if (likely(!ret2)) { 10839 + min_complete = min(min_complete, 10840 + ctx->cq_entries); 10841 + ret2 = io_cqring_wait(ctx, min_complete, sig, 10842 + argsz, ts); 10843 + } 10844 + } 10845 + 10846 + if (!ret) { 10847 + ret = ret2; 10848 + 10849 + /* 10850 + * EBADR indicates that one or more CQE were dropped. 10851 + * Once the user has been informed we can clear the bit 10852 + * as they are obviously ok with those drops. 10853 + */ 10854 + if (unlikely(ret2 == -EBADR)) 10855 + clear_bit(IO_CHECK_CQ_DROPPED_BIT, 10856 + &ctx->check_cq); 11466 10857 } 11467 10858 } 11468 10859 ··· 11496 10837 out_fput: 11497 10838 if (!(flags & IORING_ENTER_REGISTERED_RING)) 11498 10839 fdput(f); 11499 - return submitted ? submitted : ret; 10840 + return ret; 11500 10841 } 11501 10842 11502 10843 #ifdef CONFIG_PROC_FS ··· 11813 11154 ctx = io_ring_ctx_alloc(p); 11814 11155 if (!ctx) 11815 11156 return -ENOMEM; 11157 + 11158 + /* 11159 + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 11160 + * space applications don't need to do io completion events 11161 + * polling again, they can rely on io_sq_thread to do polling 11162 + * work, which can reduce cpu usage and uring_lock contention. 11163 + */ 11164 + if (ctx->flags & IORING_SETUP_IOPOLL && 11165 + !(ctx->flags & IORING_SETUP_SQPOLL)) 11166 + ctx->syscall_iopoll = 1; 11167 + 11816 11168 ctx->compat = in_compat_syscall(); 11817 11169 if (!capable(CAP_IPC_LOCK)) 11818 11170 ctx->user = get_uid(current_user()); ··· 12487 11817 BUILD_BUG_SQE_ELEM(42, __u16, personality); 12488 11818 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); 12489 11819 BUILD_BUG_SQE_ELEM(44, __u32, file_index); 11820 + BUILD_BUG_SQE_ELEM(48, __u64, addr3); 12490 11821 12491 11822 BUILD_BUG_ON(sizeof(struct io_uring_files_update) != 12492 11823 sizeof(struct io_uring_rsrc_update));

+97 -46

fs/xattr.c

··· 25 25 26 26 #include <linux/uaccess.h> 27 27 28 + #include "internal.h" 29 + 28 30 static const char * 29 31 strcmp_prefix(const char *a, const char *a_prefix) 30 32 { ··· 541 539 /* 542 540 * Extended attribute SET operations 543 541 */ 544 - static long 545 - setxattr(struct user_namespace *mnt_userns, struct dentry *d, 546 - const char __user *name, const void __user *value, size_t size, 547 - int flags) 542 + 543 + int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) 548 544 { 549 545 int error; 550 - void *kvalue = NULL; 551 - char kname[XATTR_NAME_MAX + 1]; 552 546 553 - if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) 547 + if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) 554 548 return -EINVAL; 555 549 556 - error = strncpy_from_user(kname, name, sizeof(kname)); 557 - if (error == 0 || error == sizeof(kname)) 558 - error = -ERANGE; 550 + error = strncpy_from_user(ctx->kname->name, name, 551 + sizeof(ctx->kname->name)); 552 + if (error == 0 || error == sizeof(ctx->kname->name)) 553 + return -ERANGE; 559 554 if (error < 0) 560 555 return error; 561 556 562 - if (size) { 563 - if (size > XATTR_SIZE_MAX) 557 + error = 0; 558 + if (ctx->size) { 559 + if (ctx->size > XATTR_SIZE_MAX) 564 560 return -E2BIG; 565 - kvalue = kvmalloc(size, GFP_KERNEL); 566 - if (!kvalue) 567 - return -ENOMEM; 568 - if (copy_from_user(kvalue, value, size)) { 569 - error = -EFAULT; 570 - goto out; 561 + 562 + ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); 563 + if (IS_ERR(ctx->kvalue)) { 564 + error = PTR_ERR(ctx->kvalue); 565 + ctx->kvalue = NULL; 571 566 } 572 - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || 573 - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) 574 - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), 575 - kvalue, size); 576 567 } 577 568 578 - error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); 579 - out: 580 - kvfree(kvalue); 569 + return error; 570 + } 581 571 572 + static void setxattr_convert(struct user_namespace *mnt_userns, 573 + struct dentry *d, struct xattr_ctx *ctx) 574 + { 575 + if (ctx->size && 576 + ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || 577 + (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) 578 + posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), 579 + ctx->kvalue, ctx->size); 580 + } 581 + 582 + int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, 583 + struct xattr_ctx *ctx) 584 + { 585 + setxattr_convert(mnt_userns, dentry, ctx); 586 + return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, 587 + ctx->kvalue, ctx->size, ctx->flags); 588 + } 589 + 590 + static long 591 + setxattr(struct user_namespace *mnt_userns, struct dentry *d, 592 + const char __user *name, const void __user *value, size_t size, 593 + int flags) 594 + { 595 + struct xattr_name kname; 596 + struct xattr_ctx ctx = { 597 + .cvalue = value, 598 + .kvalue = NULL, 599 + .size = size, 600 + .kname = &kname, 601 + .flags = flags, 602 + }; 603 + int error; 604 + 605 + error = setxattr_copy(name, &ctx); 606 + if (error) 607 + return error; 608 + 609 + error = do_setxattr(mnt_userns, d, &ctx); 610 + 611 + kvfree(ctx.kvalue); 582 612 return error; 583 613 } 584 614 ··· 676 642 /* 677 643 * Extended attribute GET operations 678 644 */ 679 - static ssize_t 680 - getxattr(struct user_namespace *mnt_userns, struct dentry *d, 681 - const char __user *name, void __user *value, size_t size) 645 + ssize_t 646 + do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, 647 + struct xattr_ctx *ctx) 682 648 { 683 649 ssize_t error; 684 - void *kvalue = NULL; 685 - char kname[XATTR_NAME_MAX + 1]; 650 + char *kname = ctx->kname->name; 686 651 687 - error = strncpy_from_user(kname, name, sizeof(kname)); 688 - if (error == 0 || error == sizeof(kname)) 689 - error = -ERANGE; 690 - if (error < 0) 691 - return error; 692 - 693 - if (size) { 694 - if (size > XATTR_SIZE_MAX) 695 - size = XATTR_SIZE_MAX; 696 - kvalue = kvzalloc(size, GFP_KERNEL); 697 - if (!kvalue) 652 + if (ctx->size) { 653 + if (ctx->size > XATTR_SIZE_MAX) 654 + ctx->size = XATTR_SIZE_MAX; 655 + ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); 656 + if (!ctx->kvalue) 698 657 return -ENOMEM; 699 658 } 700 659 701 - error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); 660 + error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); 702 661 if (error > 0) { 703 662 if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || 704 663 (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) 705 664 posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), 706 - kvalue, error); 707 - if (size && copy_to_user(value, kvalue, error)) 665 + ctx->kvalue, error); 666 + if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) 708 667 error = -EFAULT; 709 - } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { 668 + } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { 710 669 /* The file system tried to returned a value bigger 711 670 than XATTR_SIZE_MAX bytes. Not possible. */ 712 671 error = -E2BIG; 713 672 } 714 673 715 - kvfree(kvalue); 674 + return error; 675 + } 716 676 677 + static ssize_t 678 + getxattr(struct user_namespace *mnt_userns, struct dentry *d, 679 + const char __user *name, void __user *value, size_t size) 680 + { 681 + ssize_t error; 682 + struct xattr_name kname; 683 + struct xattr_ctx ctx = { 684 + .value = value, 685 + .kvalue = NULL, 686 + .size = size, 687 + .kname = &kname, 688 + .flags = 0, 689 + }; 690 + 691 + error = strncpy_from_user(kname.name, name, sizeof(kname.name)); 692 + if (error == 0 || error == sizeof(kname.name)) 693 + error = -ERANGE; 694 + if (error < 0) 695 + return error; 696 + 697 + error = do_getxattr(mnt_userns, d, &ctx); 698 + 699 + kvfree(ctx.kvalue); 717 700 return error; 718 701 } 719 702

+5

include/linux/io_uring.h

··· 10 10 void __io_uring_cancel(bool cancel_all); 11 11 void __io_uring_free(struct task_struct *tsk); 12 12 void io_uring_unreg_ringfd(void); 13 + const char *io_uring_get_opcode(u8 opcode); 13 14 14 15 static inline void io_uring_files_cancel(void) 15 16 { ··· 42 41 } 43 42 static inline void io_uring_free(struct task_struct *tsk) 44 43 { 44 + } 45 + static inline const char *io_uring_get_opcode(u8 opcode) 46 + { 47 + return ""; 45 48 } 46 49 #endif 47 50

+1

include/linux/socket.h

··· 434 434 extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, 435 435 int __user *upeer_addrlen, int flags); 436 436 extern int __sys_socket(int family, int type, int protocol); 437 + extern struct file *__sys_socket_file(int family, int type, int protocol); 437 438 extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); 438 439 extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, 439 440 int addrlen, int file_flags);

+66 -19

include/trace/events/io_uring.h

··· 7 7 8 8 #include <linux/tracepoint.h> 9 9 #include <uapi/linux/io_uring.h> 10 + #include <linux/io_uring.h> 10 11 11 12 struct io_wq_work; 12 13 ··· 170 169 __entry->rw = rw; 171 170 ), 172 171 173 - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, flags 0x%x, %s queue, work %p", 174 - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, 172 + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", 173 + __entry->ctx, __entry->req, __entry->user_data, 174 + io_uring_get_opcode(__entry->opcode), 175 175 __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) 176 176 ); 177 177 ··· 207 205 __entry->opcode = opcode; 208 206 ), 209 207 210 - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d", 211 - __entry->ctx, __entry->req, __entry->data, __entry->opcode) 208 + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", 209 + __entry->ctx, __entry->req, __entry->data, 210 + io_uring_get_opcode(__entry->opcode)) 212 211 ); 213 212 214 213 /** ··· 308 305 __entry->link = link; 309 306 ), 310 307 311 - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %d, link %p", 312 - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, 313 - __entry->link) 308 + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", 309 + __entry->ctx, __entry->req, __entry->user_data, 310 + io_uring_get_opcode(__entry->opcode), __entry->link) 314 311 ); 315 312 316 313 /** ··· 392 389 __entry->sq_thread = sq_thread; 393 390 ), 394 391 395 - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, flags 0x%x, " 392 + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " 396 393 "non block %d, sq_thread %d", __entry->ctx, __entry->req, 397 - __entry->user_data, __entry->opcode, 394 + __entry->user_data, io_uring_get_opcode(__entry->opcode), 398 395 __entry->flags, __entry->force_nonblock, __entry->sq_thread) 399 396 ); 400 397 ··· 436 433 __entry->events = events; 437 434 ), 438 435 439 - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask 0x%x, events 0x%x", 440 - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, 436 + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", 437 + __entry->ctx, __entry->req, __entry->user_data, 438 + io_uring_get_opcode(__entry->opcode), 441 439 __entry->mask, __entry->events) 442 440 ); 443 441 ··· 474 470 __entry->mask = mask; 475 471 ), 476 472 477 - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %d, mask %x", 478 - __entry->ctx, __entry->req, __entry->user_data, __entry->opcode, 473 + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", 474 + __entry->ctx, __entry->req, __entry->user_data, 475 + io_uring_get_opcode(__entry->opcode), 479 476 __entry->mask) 480 477 ); 481 478 ··· 511 506 __field( u16, personality ) 512 507 __field( u32, file_index ) 513 508 __field( u64, pad1 ) 514 - __field( u64, pad2 ) 509 + __field( u64, addr3 ) 515 510 __field( int, error ) 516 511 ), 517 512 ··· 530 525 __entry->personality = sqe->personality; 531 526 __entry->file_index = sqe->file_index; 532 527 __entry->pad1 = sqe->__pad2[0]; 533 - __entry->pad2 = sqe->__pad2[1]; 528 + __entry->addr3 = sqe->addr3; 534 529 __entry->error = error; 535 530 ), 536 531 537 532 TP_printk("ring %p, req %p, user_data 0x%llx, " 538 - "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " 533 + "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " 539 534 "len=%u, rw_flags=0x%x, buf_index=%d, " 540 - "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", 535 + "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " 536 + "error=%d", 541 537 __entry->ctx, __entry->req, __entry->user_data, 542 - __entry->opcode, __entry->flags, __entry->ioprio, 538 + io_uring_get_opcode(__entry->opcode), 539 + __entry->flags, __entry->ioprio, 543 540 (unsigned long long)__entry->off, 544 541 (unsigned long long) __entry->addr, __entry->len, 545 542 __entry->op_flags, 546 543 __entry->buf_index, __entry->personality, __entry->file_index, 547 544 (unsigned long long) __entry->pad1, 548 - (unsigned long long) __entry->pad2, __entry->error) 545 + (unsigned long long) __entry->addr3, __entry->error) 546 + ); 547 + 548 + 549 + /* 550 + * io_uring_cqe_overflow - a CQE overflowed 551 + * 552 + * @ctx: pointer to a ring context structure 553 + * @user_data: user data associated with the request 554 + * @res: CQE result 555 + * @cflags: CQE flags 556 + * @ocqe: pointer to the overflow cqe (if available) 557 + * 558 + */ 559 + TRACE_EVENT(io_uring_cqe_overflow, 560 + 561 + TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, 562 + void *ocqe), 563 + 564 + TP_ARGS(ctx, user_data, res, cflags, ocqe), 565 + 566 + TP_STRUCT__entry ( 567 + __field( void *, ctx ) 568 + __field( unsigned long long, user_data ) 569 + __field( s32, res ) 570 + __field( u32, cflags ) 571 + __field( void *, ocqe ) 572 + ), 573 + 574 + TP_fast_assign( 575 + __entry->ctx = ctx; 576 + __entry->user_data = user_data; 577 + __entry->res = res; 578 + __entry->cflags = cflags; 579 + __entry->ocqe = ocqe; 580 + ), 581 + 582 + TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " 583 + "overflow_cqe %p", 584 + __entry->ctx, __entry->user_data, __entry->res, 585 + __entry->cflags, __entry->ocqe) 549 586 ); 550 587 551 588 #endif /* _TRACE_IO_URING_H */

+21 -2

include/uapi/linux/io_uring.h

··· 45 45 __u32 rename_flags; 46 46 __u32 unlink_flags; 47 47 __u32 hardlink_flags; 48 + __u32 xattr_flags; 48 49 }; 49 50 __u64 user_data; /* data to be passed back at completion time */ 50 51 /* pack this to avoid bogus arm OABI complaints */ ··· 61 60 __s32 splice_fd_in; 62 61 __u32 file_index; 63 62 }; 64 - __u64 __pad2[2]; 63 + __u64 addr3; 64 + __u64 __pad2[1]; 65 65 }; 66 66 67 67 enum { ··· 105 103 #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ 106 104 #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ 107 105 108 - enum { 106 + enum io_uring_op { 109 107 IORING_OP_NOP, 110 108 IORING_OP_READV, 111 109 IORING_OP_WRITEV, ··· 147 145 IORING_OP_SYMLINKAT, 148 146 IORING_OP_LINKAT, 149 147 IORING_OP_MSG_RING, 148 + IORING_OP_FSETXATTR, 149 + IORING_OP_SETXATTR, 150 + IORING_OP_FGETXATTR, 151 + IORING_OP_GETXATTR, 152 + IORING_OP_SOCKET, 150 153 151 154 /* this goes last, obviously */ 152 155 IORING_OP_LAST, ··· 193 186 #define IORING_POLL_ADD_MULTI (1U << 0) 194 187 #define IORING_POLL_UPDATE_EVENTS (1U << 1) 195 188 #define IORING_POLL_UPDATE_USER_DATA (1U << 2) 189 + 190 + /* 191 + * ASYNC_CANCEL flags. 192 + * 193 + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key 194 + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the 195 + * request 'user_data' 196 + * IORING_ASYNC_CANCEL_ANY Match any request 197 + */ 198 + #define IORING_ASYNC_CANCEL_ALL (1U << 0) 199 + #define IORING_ASYNC_CANCEL_FD (1U << 1) 200 + #define IORING_ASYNC_CANCEL_ANY (1U << 2) 196 201 197 202 /* 198 203 * IO completion data structure (Completion Queue Entry)

+43 -11

net/socket.c

··· 504 504 struct socket *sock_from_file(struct file *file) 505 505 { 506 506 if (file->f_op == &socket_file_ops) 507 - return file->private_data; /* set in sock_map_fd */ 507 + return file->private_data; /* set in sock_alloc_file */ 508 508 509 509 return NULL; 510 510 } ··· 1538 1538 } 1539 1539 EXPORT_SYMBOL(sock_create_kern); 1540 1540 1541 - int __sys_socket(int family, int type, int protocol) 1541 + static struct socket *__sys_socket_create(int family, int type, int protocol) 1542 1542 { 1543 - int retval; 1544 1543 struct socket *sock; 1545 - int flags; 1544 + int retval; 1546 1545 1547 1546 /* Check the SOCK_* constants for consistency. */ 1548 1547 BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); ··· 1549 1550 BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); 1550 1551 BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); 1551 1552 1552 - flags = type & ~SOCK_TYPE_MASK; 1553 - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1554 - return -EINVAL; 1553 + if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1554 + return ERR_PTR(-EINVAL); 1555 1555 type &= SOCK_TYPE_MASK; 1556 - 1557 - if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) 1558 - flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1559 1556 1560 1557 retval = sock_create(family, type, protocol, &sock); 1561 1558 if (retval < 0) 1562 - return retval; 1559 + return ERR_PTR(retval); 1560 + 1561 + return sock; 1562 + } 1563 + 1564 + struct file *__sys_socket_file(int family, int type, int protocol) 1565 + { 1566 + struct socket *sock; 1567 + struct file *file; 1568 + int flags; 1569 + 1570 + sock = __sys_socket_create(family, type, protocol); 1571 + if (IS_ERR(sock)) 1572 + return ERR_CAST(sock); 1573 + 1574 + flags = type & ~SOCK_TYPE_MASK; 1575 + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) 1576 + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1577 + 1578 + file = sock_alloc_file(sock, flags, NULL); 1579 + if (IS_ERR(file)) 1580 + sock_release(sock); 1581 + 1582 + return file; 1583 + } 1584 + 1585 + int __sys_socket(int family, int type, int protocol) 1586 + { 1587 + struct socket *sock; 1588 + int flags; 1589 + 1590 + sock = __sys_socket_create(family, type, protocol); 1591 + if (IS_ERR(sock)) 1592 + return PTR_ERR(sock); 1593 + 1594 + flags = type & ~SOCK_TYPE_MASK; 1595 + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) 1596 + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1563 1597 1564 1598 return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); 1565 1599 }