Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-5.19/io_uring' into for-5.19/io_uring-passthrough

* for-5.19/io_uring: (85 commits)
io_uring: don't clear req->kbuf when buffer selection is done
io_uring: eliminate the need to track provided buffer ID separately
io_uring: move provided buffer state closer to submit state
io_uring: move provided and fixed buffers into the same io_kiocb area
io_uring: abstract out provided buffer list selection
io_uring: never call io_buffer_select() for a buffer re-select
io_uring: get rid of hashed provided buffer groups
io_uring: always use req->buf_index for the provided buffer group
io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set
io_uring: kill io_rw_buffer_select() wrapper
io_uring: make io_buffer_select() return the user address directly
io_uring: kill io_recv_buffer_select() wrapper
io_uring: use 'sr' vs 'req->sr_msg' consistently
io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg
io_uring: check IOPOLL/ioprio support upfront
io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread()
io_uring: add IORING_SETUP_TASKRUN_FLAG
io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used
io_uring: set task_work notify method at init time
io-wq: use __set_notify_signal() to wake workers
...

+1263 -956
+2 -2
fs/io-wq.c
··· 871 871 872 872 static bool io_wq_worker_wake(struct io_worker *worker, void *data) 873 873 { 874 - set_notify_signal(worker->task); 874 + __set_notify_signal(worker->task); 875 875 wake_up_process(worker->task); 876 876 return false; 877 877 } ··· 991 991 { 992 992 if (work && match->fn(work, match->data)) { 993 993 work->flags |= IO_WQ_WORK_CANCEL; 994 - set_notify_signal(worker->task); 994 + __set_notify_signal(worker->task); 995 995 return true; 996 996 } 997 997
+1
fs/io-wq.h
··· 155 155 struct io_wq_work { 156 156 struct io_wq_work_node list; 157 157 unsigned flags; 158 + int cancel_seq; 158 159 }; 159 160 160 161 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
+1151 -945
fs/io_uring.c
··· 113 113 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 114 114 REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) 115 115 116 + #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ 117 + IO_REQ_CLEAN_FLAGS) 118 + 116 119 #define IO_TCTX_REFS_CACHE_NR (1U << 10) 117 120 118 121 struct io_uring { ··· 169 166 * The application needs a full memory barrier before checking 170 167 * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 171 168 */ 172 - u32 sq_flags; 169 + atomic_t sq_flags; 173 170 /* 174 171 * Runtime CQ flags 175 172 * ··· 223 220 struct list_head list; 224 221 }; 225 222 223 + /* 224 + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 225 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we 226 + * can't safely always dereference the file when the task has exited and ring 227 + * cleanup is done. If a file is tracked and part of SCM, then unix gc on 228 + * process exit may reap it before __io_sqe_files_unregister() is run. 
229 + */ 230 + #define FFS_NOWAIT 0x1UL 231 + #define FFS_ISREG 0x2UL 232 + #if defined(CONFIG_64BIT) 233 + #define FFS_SCM 0x4UL 234 + #else 235 + #define IO_URING_SCM_ALL 236 + #define FFS_SCM 0x0UL 237 + #endif 238 + #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) 239 + 226 240 struct io_fixed_file { 227 241 /* file * with additional FFS_* flags */ 228 242 unsigned long file_ptr; ··· 282 262 }; 283 263 284 264 struct io_buffer_list { 285 - struct list_head list; 286 265 struct list_head buf_list; 287 266 __u16 bgid; 288 267 }; ··· 356 337 struct rcu_head rcu; 357 338 }; 358 339 359 - #define IO_BUFFERS_HASH_BITS 5 340 + #define BGID_ARRAY 64 360 341 361 342 struct io_ring_ctx { 362 343 /* const or read-mostly hot data */ ··· 365 346 366 347 struct io_rings *rings; 367 348 unsigned int flags; 349 + enum task_work_notify_mode notify_method; 368 350 unsigned int compat: 1; 369 351 unsigned int drain_next: 1; 370 352 unsigned int restricted: 1; ··· 373 353 unsigned int drain_active: 1; 374 354 unsigned int drain_disabled: 1; 375 355 unsigned int has_evfd: 1; 356 + unsigned int syscall_iopoll: 1; 376 357 } ____cacheline_aligned_in_smp; 377 358 378 359 /* submission data */ ··· 403 382 */ 404 383 struct io_rsrc_node *rsrc_node; 405 384 int rsrc_cached_refs; 385 + atomic_t cancel_seq; 406 386 struct io_file_table file_table; 407 387 unsigned nr_user_files; 408 388 unsigned nr_user_bufs; 409 389 struct io_mapped_ubuf **user_bufs; 410 390 411 391 struct io_submit_state submit_state; 392 + 393 + struct io_buffer_list *io_bl; 394 + struct xarray io_bl_xa; 395 + struct list_head io_buffers_cache; 396 + 412 397 struct list_head timeout_list; 413 398 struct list_head ltimeout_list; 414 399 struct list_head cq_overflow_list; 415 - struct list_head *io_buffers; 416 - struct list_head io_buffers_cache; 417 400 struct list_head apoll_cache; 418 401 struct xarray personalities; 419 402 u32 pers_next; ··· 434 409 struct wait_queue_head sqo_sq_wait; 435 410 struct list_head 
sqd_list; 436 411 437 - unsigned long check_cq_overflow; 412 + unsigned long check_cq; 438 413 439 414 struct { 415 + /* 416 + * We cache a range of free CQEs we can use, once exhausted it 417 + * should go through a slower range setup, see __io_get_cqe() 418 + */ 419 + struct io_uring_cqe *cqe_cached; 420 + struct io_uring_cqe *cqe_sentinel; 421 + 440 422 unsigned cached_cq_tail; 441 423 unsigned cq_entries; 442 424 struct io_ev_fd __rcu *io_ev_fd; ··· 589 557 struct io_cancel { 590 558 struct file *file; 591 559 u64 addr; 560 + u32 flags; 561 + s32 fd; 592 562 }; 593 563 594 564 struct io_timeout { ··· 636 602 void __user *buf; 637 603 }; 638 604 int msg_flags; 639 - int bgid; 640 605 size_t len; 641 606 size_t done_io; 607 + unsigned int flags; 642 608 }; 643 609 644 610 struct io_open { ··· 896 862 IORING_RSRC_BUFFER = 1, 897 863 }; 898 864 865 + struct io_cqe { 866 + __u64 user_data; 867 + __s32 res; 868 + /* fd initially, then cflags for completion */ 869 + union { 870 + __u32 flags; 871 + int fd; 872 + }; 873 + }; 874 + 875 + enum { 876 + IO_CHECK_CQ_OVERFLOW_BIT, 877 + IO_CHECK_CQ_DROPPED_BIT, 878 + }; 879 + 899 880 /* 900 881 * NOTE! Each of the iocb union members has the file pointer 901 882 * as the first entry in their struct definition. So you can ··· 951 902 u8 opcode; 952 903 /* polled IO has completed */ 953 904 u8 iopoll_completed; 905 + /* 906 + * Can be either a fixed buffer index, or used with provided buffers. 907 + * For the latter, before issue it points to the buffer group ID, 908 + * and after selection it points to the buffer ID itself. 
909 + */ 954 910 u16 buf_index; 955 911 unsigned int flags; 956 912 957 - u64 user_data; 958 - u32 result; 959 - /* fd initially, then cflags for completion */ 960 - union { 961 - u32 cflags; 962 - int fd; 963 - }; 913 + struct io_cqe cqe; 964 914 965 915 struct io_ring_ctx *ctx; 966 916 struct task_struct *task; 967 917 968 - struct percpu_ref *fixed_rsrc_refs; 969 - /* store used ubuf, so we can prevent reloading */ 970 - struct io_mapped_ubuf *imu; 918 + struct io_rsrc_node *rsrc_node; 919 + 920 + union { 921 + /* store used ubuf, so we can prevent reloading */ 922 + struct io_mapped_ubuf *imu; 923 + 924 + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 925 + struct io_buffer *kbuf; 926 + }; 971 927 972 928 union { 973 929 /* used by request caches, completion batching and iopoll */ ··· 989 935 struct async_poll *apoll; 990 936 /* opcode allocated if it needs to store data for async defer */ 991 937 void *async_data; 992 - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 993 - struct io_buffer *kbuf; 994 938 /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ 995 939 struct io_kiocb *link; 996 940 /* custom credentials, valid IFF REQ_F_CREDS is set */ ··· 1006 954 struct list_head list; 1007 955 struct io_kiocb *req; 1008 956 u32 seq; 957 + }; 958 + 959 + struct io_cancel_data { 960 + struct io_ring_ctx *ctx; 961 + union { 962 + u64 data; 963 + struct file *file; 964 + }; 965 + u32 flags; 966 + int seq; 1009 967 }; 1010 968 1011 969 struct io_op_def { ··· 1039 977 unsigned not_supported : 1; 1040 978 /* skip auditing */ 1041 979 unsigned audit_skip : 1; 980 + /* supports ioprio */ 981 + unsigned ioprio : 1; 982 + /* supports iopoll */ 983 + unsigned iopoll : 1; 1042 984 /* size of async data needed, if any */ 1043 985 unsigned short async_size; 1044 986 }; 1045 987 1046 988 static const struct io_op_def io_op_defs[] = { 1047 - [IORING_OP_NOP] = {}, 989 + [IORING_OP_NOP] = { 990 + .audit_skip = 1, 991 + .iopoll = 1, 
992 + }, 1048 993 [IORING_OP_READV] = { 1049 994 .needs_file = 1, 1050 995 .unbound_nonreg_file = 1, ··· 1060 991 .needs_async_setup = 1, 1061 992 .plug = 1, 1062 993 .audit_skip = 1, 994 + .ioprio = 1, 995 + .iopoll = 1, 1063 996 .async_size = sizeof(struct io_async_rw), 1064 997 }, 1065 998 [IORING_OP_WRITEV] = { ··· 1072 1001 .needs_async_setup = 1, 1073 1002 .plug = 1, 1074 1003 .audit_skip = 1, 1004 + .ioprio = 1, 1005 + .iopoll = 1, 1075 1006 .async_size = sizeof(struct io_async_rw), 1076 1007 }, 1077 1008 [IORING_OP_FSYNC] = { ··· 1086 1013 .pollin = 1, 1087 1014 .plug = 1, 1088 1015 .audit_skip = 1, 1016 + .ioprio = 1, 1017 + .iopoll = 1, 1089 1018 .async_size = sizeof(struct io_async_rw), 1090 1019 }, 1091 1020 [IORING_OP_WRITE_FIXED] = { ··· 1097 1022 .pollout = 1, 1098 1023 .plug = 1, 1099 1024 .audit_skip = 1, 1025 + .ioprio = 1, 1026 + .iopoll = 1, 1100 1027 .async_size = sizeof(struct io_async_rw), 1101 1028 }, 1102 1029 [IORING_OP_POLL_ADD] = { ··· 1163 1086 [IORING_OP_CLOSE] = {}, 1164 1087 [IORING_OP_FILES_UPDATE] = { 1165 1088 .audit_skip = 1, 1089 + .iopoll = 1, 1166 1090 }, 1167 1091 [IORING_OP_STATX] = { 1168 1092 .audit_skip = 1, ··· 1175 1097 .buffer_select = 1, 1176 1098 .plug = 1, 1177 1099 .audit_skip = 1, 1100 + .ioprio = 1, 1101 + .iopoll = 1, 1178 1102 .async_size = sizeof(struct io_async_rw), 1179 1103 }, 1180 1104 [IORING_OP_WRITE] = { ··· 1186 1106 .pollout = 1, 1187 1107 .plug = 1, 1188 1108 .audit_skip = 1, 1109 + .ioprio = 1, 1110 + .iopoll = 1, 1189 1111 .async_size = sizeof(struct io_async_rw), 1190 1112 }, 1191 1113 [IORING_OP_FADVISE] = { ··· 1222 1140 }, 1223 1141 [IORING_OP_PROVIDE_BUFFERS] = { 1224 1142 .audit_skip = 1, 1143 + .iopoll = 1, 1225 1144 }, 1226 1145 [IORING_OP_REMOVE_BUFFERS] = { 1227 1146 .audit_skip = 1, 1147 + .iopoll = 1, 1228 1148 }, 1229 1149 [IORING_OP_TEE] = { 1230 1150 .needs_file = 1, ··· 1244 1160 [IORING_OP_LINKAT] = {}, 1245 1161 [IORING_OP_MSG_RING] = { 1246 1162 .needs_file = 1, 1163 + .iopoll = 
1, 1247 1164 }, 1248 1165 }; 1249 1166 1250 1167 /* requests with any of those set should undergo io_disarm_next() */ 1251 1168 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1169 + #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 1252 1170 1253 1171 static bool io_disarm_next(struct io_kiocb *req); 1254 1172 static void io_uring_del_tctx_node(unsigned long index); ··· 1259 1173 bool cancel_all); 1260 1174 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1261 1175 1262 - static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); 1263 - 1264 - static void io_put_req(struct io_kiocb *req); 1265 - static void io_put_req_deferred(struct io_kiocb *req); 1176 + static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); 1266 1177 static void io_dismantle_req(struct io_kiocb *req); 1267 1178 static void io_queue_linked_timeout(struct io_kiocb *req); 1268 1179 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, ··· 1271 1188 static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); 1272 1189 static void io_drop_inflight_file(struct io_kiocb *req); 1273 1190 static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); 1274 - static void __io_queue_sqe(struct io_kiocb *req); 1191 + static void io_queue_sqe(struct io_kiocb *req); 1275 1192 static void io_rsrc_put_work(struct work_struct *work); 1276 1193 1277 1194 static void io_req_task_queue(struct io_kiocb *req); ··· 1284 1201 1285 1202 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1286 1203 static void io_eventfd_signal(struct io_ring_ctx *ctx); 1204 + static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); 1287 1205 1288 1206 static struct kmem_cache *req_cachep; 1289 1207 ··· 1302 1218 return NULL; 1303 1219 } 1304 1220 EXPORT_SYMBOL(io_uring_get_socket); 1221 + 1222 + #if defined(CONFIG_UNIX) 1223 + static inline bool 
io_file_need_scm(struct file *filp) 1224 + { 1225 + #if defined(IO_URING_SCM_ALL) 1226 + return true; 1227 + #else 1228 + return !!unix_get_socket(filp); 1229 + #endif 1230 + } 1231 + #else 1232 + static inline bool io_file_need_scm(struct file *filp) 1233 + { 1234 + return false; 1235 + } 1236 + #endif 1237 + 1238 + static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) 1239 + { 1240 + lockdep_assert_held(&ctx->uring_lock); 1241 + if (issue_flags & IO_URING_F_UNLOCKED) 1242 + mutex_unlock(&ctx->uring_lock); 1243 + } 1244 + 1245 + static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) 1246 + { 1247 + /* 1248 + * "Normal" inline submissions always hold the uring_lock, since we 1249 + * grab it from the system call. Same is true for the SQPOLL offload. 1250 + * The only exception is when we've detached the request and issue it 1251 + * from an async worker thread, grab the lock for that case. 1252 + */ 1253 + if (issue_flags & IO_URING_F_UNLOCKED) 1254 + mutex_lock(&ctx->uring_lock); 1255 + lockdep_assert_held(&ctx->uring_lock); 1256 + } 1305 1257 1306 1258 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1307 1259 { ··· 1400 1280 1401 1281 #define IO_RSRC_REF_BATCH 100 1402 1282 1283 + static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) 1284 + { 1285 + percpu_ref_put_many(&node->refs, nr); 1286 + } 1287 + 1403 1288 static inline void io_req_put_rsrc_locked(struct io_kiocb *req, 1404 1289 struct io_ring_ctx *ctx) 1405 1290 __must_hold(&ctx->uring_lock) 1406 1291 { 1407 - struct percpu_ref *ref = req->fixed_rsrc_refs; 1292 + struct io_rsrc_node *node = req->rsrc_node; 1408 1293 1409 - if (ref) { 1410 - if (ref == &ctx->rsrc_node->refs) 1294 + if (node) { 1295 + if (node == ctx->rsrc_node) 1411 1296 ctx->rsrc_cached_refs++; 1412 1297 else 1413 - percpu_ref_put(ref); 1298 + io_rsrc_put_node(node, 1); 1414 1299 } 1415 1300 } 1416 1301 1417 - static inline void io_req_put_rsrc(struct 
io_kiocb *req, struct io_ring_ctx *ctx) 1302 + static inline void io_req_put_rsrc(struct io_kiocb *req) 1418 1303 { 1419 - if (req->fixed_rsrc_refs) 1420 - percpu_ref_put(req->fixed_rsrc_refs); 1304 + if (req->rsrc_node) 1305 + io_rsrc_put_node(req->rsrc_node, 1); 1421 1306 } 1422 1307 1423 1308 static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) 1424 1309 __must_hold(&ctx->uring_lock) 1425 1310 { 1426 1311 if (ctx->rsrc_cached_refs) { 1427 - percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); 1312 + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); 1428 1313 ctx->rsrc_cached_refs = 0; 1429 1314 } 1430 1315 } ··· 1445 1320 struct io_ring_ctx *ctx, 1446 1321 unsigned int issue_flags) 1447 1322 { 1448 - if (!req->fixed_rsrc_refs) { 1449 - req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1323 + if (!req->rsrc_node) { 1324 + req->rsrc_node = ctx->rsrc_node; 1450 1325 1451 1326 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1452 1327 lockdep_assert_held(&ctx->uring_lock); ··· 1454 1329 if (unlikely(ctx->rsrc_cached_refs < 0)) 1455 1330 io_rsrc_refs_refill(ctx); 1456 1331 } else { 1457 - percpu_ref_get(req->fixed_rsrc_refs); 1332 + percpu_ref_get(&req->rsrc_node->refs); 1458 1333 } 1459 1334 } 1460 1335 } 1461 1336 1462 1337 static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) 1463 1338 { 1464 - struct io_buffer *kbuf = req->kbuf; 1465 - unsigned int cflags; 1466 - 1467 - cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); 1468 1339 req->flags &= ~REQ_F_BUFFER_SELECTED; 1469 - list_add(&kbuf->list, list); 1470 - req->kbuf = NULL; 1471 - return cflags; 1340 + list_add(&req->kbuf->list, list); 1341 + 1342 + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 1472 1343 } 1473 1344 1474 1345 static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) ··· 1514 1393 static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, 1515 1394 unsigned int bgid) 1516 1395 { 
1517 - struct list_head *hash_list; 1518 - struct io_buffer_list *bl; 1396 + if (ctx->io_bl && bgid < BGID_ARRAY) 1397 + return &ctx->io_bl[bgid]; 1519 1398 1520 - hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; 1521 - list_for_each_entry(bl, hash_list, list) 1522 - if (bl->bgid == bgid || bgid == -1U) 1523 - return bl; 1524 - 1525 - return NULL; 1399 + return xa_load(&ctx->io_bl_xa, bgid); 1526 1400 } 1527 1401 1528 1402 static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) ··· 1532 1416 if (req->flags & REQ_F_PARTIAL_IO) 1533 1417 return; 1534 1418 1535 - if (issue_flags & IO_URING_F_UNLOCKED) 1536 - mutex_lock(&ctx->uring_lock); 1537 - 1538 - lockdep_assert_held(&ctx->uring_lock); 1419 + io_ring_submit_lock(ctx, issue_flags); 1539 1420 1540 1421 buf = req->kbuf; 1541 1422 bl = io_buffer_get_list(ctx, buf->bgid); 1542 1423 list_add(&buf->list, &bl->buf_list); 1543 1424 req->flags &= ~REQ_F_BUFFER_SELECTED; 1544 - req->kbuf = NULL; 1425 + req->buf_index = buf->bgid; 1545 1426 1546 - if (issue_flags & IO_URING_F_UNLOCKED) 1547 - mutex_unlock(&ctx->uring_lock); 1427 + io_ring_submit_unlock(ctx, issue_flags); 1548 1428 } 1549 1429 1550 1430 static bool io_match_task(struct io_kiocb *head, struct task_struct *task, ··· 1581 1469 static inline void req_fail_link_node(struct io_kiocb *req, int res) 1582 1470 { 1583 1471 req_set_fail(req); 1584 - req->result = res; 1472 + req->cqe.res = res; 1473 + } 1474 + 1475 + static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) 1476 + { 1477 + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 1585 1478 } 1586 1479 1587 1480 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) ··· 1623 1506 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1624 1507 { 1625 1508 struct io_ring_ctx *ctx; 1626 - int i, hash_bits; 1509 + int hash_bits; 1627 1510 1628 1511 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1629 1512 if 
(!ctx) 1630 1513 return NULL; 1514 + 1515 + xa_init(&ctx->io_bl_xa); 1631 1516 1632 1517 /* 1633 1518 * Use 5 bits less than the max cq entries, that should give us around ··· 1651 1532 goto err; 1652 1533 /* set invalid range, so io_import_fixed() fails meeting it */ 1653 1534 ctx->dummy_ubuf->ubuf = -1UL; 1654 - 1655 - ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, 1656 - sizeof(struct list_head), GFP_KERNEL); 1657 - if (!ctx->io_buffers) 1658 - goto err; 1659 - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) 1660 - INIT_LIST_HEAD(&ctx->io_buffers[i]); 1661 1535 1662 1536 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1663 1537 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) ··· 1687 1575 err: 1688 1576 kfree(ctx->dummy_ubuf); 1689 1577 kfree(ctx->cancel_hash); 1690 - kfree(ctx->io_buffers); 1578 + kfree(ctx->io_bl); 1579 + xa_destroy(&ctx->io_bl_xa); 1691 1580 kfree(ctx); 1692 1581 return NULL; 1693 1582 } ··· 1711 1598 1712 1599 return false; 1713 1600 } 1714 - 1715 - #define FFS_NOWAIT 0x1UL 1716 - #define FFS_ISREG 0x2UL 1717 - #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) 1718 1601 1719 1602 static inline bool io_req_ffs_set(struct io_kiocb *req) 1720 1603 { ··· 1738 1629 return __io_prep_linked_timeout(req); 1739 1630 } 1740 1631 1632 + static noinline void __io_arm_ltimeout(struct io_kiocb *req) 1633 + { 1634 + io_queue_linked_timeout(__io_prep_linked_timeout(req)); 1635 + } 1636 + 1637 + static inline void io_arm_ltimeout(struct io_kiocb *req) 1638 + { 1639 + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) 1640 + __io_arm_ltimeout(req); 1641 + } 1642 + 1741 1643 static void io_prep_async_work(struct io_kiocb *req) 1742 1644 { 1743 1645 const struct io_op_def *def = &io_op_defs[req->opcode]; ··· 1761 1641 1762 1642 req->work.list.next = NULL; 1763 1643 req->work.flags = 0; 1644 + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); 1764 1645 if (req->flags & REQ_F_FORCE_ASYNC) 1765 1646 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1766 1647 ··· 1793 1672 
1794 1673 static inline void io_req_add_compl_list(struct io_kiocb *req) 1795 1674 { 1796 - struct io_ring_ctx *ctx = req->ctx; 1797 - struct io_submit_state *state = &ctx->submit_state; 1675 + struct io_submit_state *state = &req->ctx->submit_state; 1798 1676 1799 1677 if (!(req->flags & REQ_F_CQE_SKIP)) 1800 - ctx->submit_state.flush_cqes = true; 1678 + state->flush_cqes = true; 1801 1679 wq_list_add_tail(&req->comp_list, &state->compl_reqs); 1802 1680 } 1803 1681 1804 - static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) 1682 + static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) 1805 1683 { 1806 - struct io_ring_ctx *ctx = req->ctx; 1807 1684 struct io_kiocb *link = io_prep_linked_timeout(req); 1808 1685 struct io_uring_task *tctx = req->task->io_uring; 1809 1686 ··· 1821 1702 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1822 1703 req->work.flags |= IO_WQ_WORK_CANCEL; 1823 1704 1824 - trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, 1825 - &req->work, io_wq_is_hashed(&req->work)); 1705 + trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, 1706 + req->opcode, req->flags, &req->work, 1707 + io_wq_is_hashed(&req->work)); 1826 1708 io_wq_enqueue(tctx->io_wq, &req->work); 1827 1709 if (link) 1828 1710 io_queue_linked_timeout(link); ··· 1841 1721 atomic_set(&req->ctx->cq_timeouts, 1842 1722 atomic_read(&req->ctx->cq_timeouts) + 1); 1843 1723 list_del_init(&req->timeout.list); 1844 - io_fill_cqe_req(req, status, 0); 1845 - io_put_req_deferred(req); 1724 + io_req_tw_post_queue(req, status, 0); 1846 1725 } 1847 1726 } 1848 1727 ··· 1923 1804 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 1924 1805 } 1925 1806 1926 - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1807 + /* 1808 + * writes to the cq entry need to come after reading head; the 1809 + * control dependency is enough as we're using WRITE_ONCE to 1810 + * fill the cq entry 1811 + */ 
1812 + static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) 1927 1813 { 1928 1814 struct io_rings *rings = ctx->rings; 1929 - unsigned tail, mask = ctx->cq_entries - 1; 1815 + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); 1816 + unsigned int free, queued, len; 1930 1817 1931 - /* 1932 - * writes to the cq entry need to come after reading head; the 1933 - * control dependency is enough as we're using WRITE_ONCE to 1934 - * fill the cq entry 1935 - */ 1936 - if (__io_cqring_events(ctx) == ctx->cq_entries) 1818 + /* userspace may cheat modifying the tail, be safe and do min */ 1819 + queued = min(__io_cqring_events(ctx), ctx->cq_entries); 1820 + free = ctx->cq_entries - queued; 1821 + /* we need a contiguous range, limit based on the current array offset */ 1822 + len = min(free, ctx->cq_entries - off); 1823 + if (!len) 1937 1824 return NULL; 1938 1825 1939 - tail = ctx->cached_cq_tail++; 1940 - return &rings->cqes[tail & mask]; 1826 + ctx->cached_cq_tail++; 1827 + ctx->cqe_cached = &rings->cqes[off]; 1828 + ctx->cqe_sentinel = ctx->cqe_cached + len; 1829 + return ctx->cqe_cached++; 1830 + } 1831 + 1832 + static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1833 + { 1834 + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { 1835 + ctx->cached_cq_tail++; 1836 + return ctx->cqe_cached++; 1837 + } 1838 + return __io_get_cqe(ctx); 1941 1839 } 1942 1840 1943 1841 static void io_eventfd_signal(struct io_ring_ctx *ctx) ··· 2051 1915 2052 1916 all_flushed = list_empty(&ctx->cq_overflow_list); 2053 1917 if (all_flushed) { 2054 - clear_bit(0, &ctx->check_cq_overflow); 2055 - WRITE_ONCE(ctx->rings->sq_flags, 2056 - ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); 1918 + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); 1919 + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); 2057 1920 } 2058 1921 2059 - if (posted) 2060 - io_commit_cqring(ctx); 1922 + io_commit_cqring(ctx); 2061 1923 
spin_unlock(&ctx->completion_lock); 2062 1924 if (posted) 2063 1925 io_cqring_ev_posted(ctx); ··· 2066 1932 { 2067 1933 bool ret = true; 2068 1934 2069 - if (test_bit(0, &ctx->check_cq_overflow)) { 1935 + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { 2070 1936 /* iopoll syncs against uring_lock, not completion_lock */ 2071 1937 if (ctx->flags & IORING_SETUP_IOPOLL) 2072 1938 mutex_lock(&ctx->uring_lock); ··· 2078 1944 return ret; 2079 1945 } 2080 1946 2081 - /* must to be called somewhat shortly after putting a request */ 2082 - static inline void io_put_task(struct task_struct *task, int nr) 1947 + static void __io_put_task(struct task_struct *task, int nr) 2083 1948 { 2084 1949 struct io_uring_task *tctx = task->io_uring; 2085 1950 2086 - if (likely(task == current)) { 2087 - tctx->cached_refs += nr; 2088 - } else { 2089 - percpu_counter_sub(&tctx->inflight, nr); 2090 - if (unlikely(atomic_read(&tctx->in_idle))) 2091 - wake_up(&tctx->wait); 2092 - put_task_struct_many(task, nr); 2093 - } 1951 + percpu_counter_sub(&tctx->inflight, nr); 1952 + if (unlikely(atomic_read(&tctx->in_idle))) 1953 + wake_up(&tctx->wait); 1954 + put_task_struct_many(task, nr); 1955 + } 1956 + 1957 + /* must to be called somewhat shortly after putting a request */ 1958 + static inline void io_put_task(struct task_struct *task, int nr) 1959 + { 1960 + if (likely(task == current)) 1961 + task->io_uring->cached_refs += nr; 1962 + else 1963 + __io_put_task(task, nr); 2094 1964 } 2095 1965 2096 1966 static void io_task_refs_refill(struct io_uring_task *tctx) ··· 2133 1995 struct io_overflow_cqe *ocqe; 2134 1996 2135 1997 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1998 + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); 2136 1999 if (!ocqe) { 2137 2000 /* 2138 2001 * If we're in ring overflow flush mode, or in task cancel mode, ··· 2141 2002 * on the floor. 
2142 2003 */ 2143 2004 io_account_cq_overflow(ctx); 2005 + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); 2144 2006 return false; 2145 2007 } 2146 2008 if (list_empty(&ctx->cq_overflow_list)) { 2147 - set_bit(0, &ctx->check_cq_overflow); 2148 - WRITE_ONCE(ctx->rings->sq_flags, 2149 - ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 2009 + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); 2010 + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); 2150 2011 2151 2012 } 2152 2013 ocqe->cqe.user_data = user_data; ··· 2176 2037 return io_cqring_event_overflow(ctx, user_data, res, cflags); 2177 2038 } 2178 2039 2179 - static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 2040 + static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, 2041 + struct io_kiocb *req) 2180 2042 { 2181 - trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); 2182 - return __io_fill_cqe(req->ctx, req->user_data, res, cflags); 2043 + struct io_uring_cqe *cqe; 2044 + 2045 + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 2046 + req->cqe.res, req->cqe.flags); 2047 + 2048 + /* 2049 + * If we can't get a cq entry, userspace overflowed the 2050 + * submission (by quite a lot). Increment the overflow count in 2051 + * the ring. 
2052 + */ 2053 + cqe = io_get_cqe(ctx); 2054 + if (likely(cqe)) { 2055 + memcpy(cqe, &req->cqe, sizeof(*cqe)); 2056 + return true; 2057 + } 2058 + return io_cqring_event_overflow(ctx, req->cqe.user_data, 2059 + req->cqe.res, req->cqe.flags); 2183 2060 } 2184 2061 2185 - static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 2062 + static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 2186 2063 { 2187 - if (!(req->flags & REQ_F_CQE_SKIP)) 2188 - __io_fill_cqe_req(req, res, cflags); 2064 + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags); 2065 + return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); 2189 2066 } 2190 2067 2191 2068 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, ··· 2224 2069 * free_list cache. 2225 2070 */ 2226 2071 if (req_ref_put_and_test(req)) { 2227 - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 2072 + if (req->flags & IO_REQ_LINK_FLAGS) { 2228 2073 if (req->flags & IO_DISARM_MASK) 2229 2074 io_disarm_next(req); 2230 2075 if (req->link) { ··· 2232 2077 req->link = NULL; 2233 2078 } 2234 2079 } 2235 - io_req_put_rsrc(req, ctx); 2080 + io_req_put_rsrc(req); 2236 2081 /* 2237 2082 * Selected buffer deallocation in io_clean_op() assumes that 2238 2083 * we don't hold ->completion_lock. Clean them here to avoid ··· 2261 2106 static inline void io_req_complete_state(struct io_kiocb *req, s32 res, 2262 2107 u32 cflags) 2263 2108 { 2264 - req->result = res; 2265 - req->cflags = cflags; 2109 + req->cqe.res = res; 2110 + req->cqe.flags = cflags; 2266 2111 req->flags |= REQ_F_COMPLETE_INLINE; 2267 2112 } 2268 2113 ··· 2286 2131 io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); 2287 2132 } 2288 2133 2289 - static void io_req_complete_fail_submit(struct io_kiocb *req) 2290 - { 2291 - /* 2292 - * We don't submit, fail them all, for that replace hardlinks with 2293 - * normal links. Extra REQ_F_LINK is tolerated. 
2294 - */ 2295 - req->flags &= ~REQ_F_HARDLINK; 2296 - req->flags |= REQ_F_LINK; 2297 - io_req_complete_failed(req, req->result); 2298 - } 2299 - 2300 2134 /* 2301 2135 * Don't initialise the fields below on every allocation, but do that in 2302 2136 * advance and keep them valid across allocations. ··· 2296 2152 req->link = NULL; 2297 2153 req->async_data = NULL; 2298 2154 /* not necessary, but safer to zero */ 2299 - req->result = 0; 2155 + req->cqe.res = 0; 2300 2156 } 2301 2157 2302 2158 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, ··· 2308 2164 spin_unlock(&ctx->completion_lock); 2309 2165 } 2310 2166 2311 - /* Returns true IFF there are requests in the cache */ 2312 - static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 2167 + static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) 2313 2168 { 2314 - struct io_submit_state *state = &ctx->submit_state; 2315 - 2316 - /* 2317 - * If we have more than a batch's worth of requests in our IRQ side 2318 - * locked cache, grab the lock and move them over to our submission 2319 - * side cache. 2320 - */ 2321 - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 2322 - io_flush_cached_locked_reqs(ctx, state); 2323 - return !!state->free_list.next; 2169 + return !ctx->submit_state.free_list.next; 2324 2170 } 2325 2171 2326 2172 /* ··· 2322 2188 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) 2323 2189 __must_hold(&ctx->uring_lock) 2324 2190 { 2325 - struct io_submit_state *state = &ctx->submit_state; 2326 2191 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 2327 2192 void *reqs[IO_REQ_ALLOC_BATCH]; 2328 - struct io_kiocb *req; 2329 2193 int ret, i; 2330 2194 2331 - if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) 2332 - return true; 2195 + /* 2196 + * If we have more than a batch's worth of requests in our IRQ side 2197 + * locked cache, grab the lock and move them over to our submission 2198 + * side cache. 
2199 + */ 2200 + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { 2201 + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); 2202 + if (!io_req_cache_empty(ctx)) 2203 + return true; 2204 + } 2333 2205 2334 2206 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); 2335 2207 ··· 2352 2212 2353 2213 percpu_ref_get_many(&ctx->refs, ret); 2354 2214 for (i = 0; i < ret; i++) { 2355 - req = reqs[i]; 2215 + struct io_kiocb *req = reqs[i]; 2356 2216 2357 2217 io_preinit_req(req, ctx); 2358 - wq_stack_add_head(&req->comp_list, &state->free_list); 2218 + io_req_add_to_cache(req, ctx); 2359 2219 } 2360 2220 return true; 2361 2221 } 2362 2222 2363 2223 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) 2364 2224 { 2365 - if (unlikely(!ctx->submit_state.free_list.next)) 2225 + if (unlikely(io_req_cache_empty(ctx))) 2366 2226 return __io_alloc_req_refill(ctx); 2367 2227 return true; 2368 2228 } ··· 2391 2251 io_put_file(req->file); 2392 2252 } 2393 2253 2394 - static __cold void __io_free_req(struct io_kiocb *req) 2254 + static __cold void io_free_req(struct io_kiocb *req) 2395 2255 { 2396 2256 struct io_ring_ctx *ctx = req->ctx; 2397 2257 2398 - io_req_put_rsrc(req, ctx); 2258 + io_req_put_rsrc(req); 2399 2259 io_dismantle_req(req); 2400 2260 io_put_task(req->task, 1); 2401 2261 ··· 2413 2273 nxt->link = NULL; 2414 2274 } 2415 2275 2416 - static bool io_kill_linked_timeout(struct io_kiocb *req) 2276 + static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) 2417 2277 __must_hold(&req->ctx->completion_lock) 2418 2278 __must_hold(&req->ctx->timeout_lock) 2419 2279 { ··· 2426 2286 link->timeout.head = NULL; 2427 2287 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2428 2288 list_del(&link->timeout.list); 2429 - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ 2430 - io_fill_cqe_req(link, -ECANCELED, 0); 2431 - io_put_req_deferred(link); 2432 - return true; 2289 + return link; 2433 2290 } 2434 2291 } 2435 - return false; 2292 + return 
NULL; 2436 2293 } 2437 2294 2438 2295 static void io_fail_links(struct io_kiocb *req) ··· 2443 2306 long res = -ECANCELED; 2444 2307 2445 2308 if (link->flags & REQ_F_FAIL) 2446 - res = link->result; 2309 + res = link->cqe.res; 2447 2310 2448 2311 nxt = link->link; 2449 2312 link->link = NULL; 2450 2313 2451 - trace_io_uring_fail_link(req->ctx, req, req->user_data, 2314 + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, 2452 2315 req->opcode, link); 2453 2316 2454 - if (!ignore_cqes) { 2317 + if (ignore_cqes) 2318 + link->flags |= REQ_F_CQE_SKIP; 2319 + else 2455 2320 link->flags &= ~REQ_F_CQE_SKIP; 2456 - io_fill_cqe_req(link, res, 0); 2457 - } 2458 - io_put_req_deferred(link); 2321 + __io_req_complete_post(link, res, 0); 2459 2322 link = nxt; 2460 2323 } 2461 2324 } ··· 2463 2326 static bool io_disarm_next(struct io_kiocb *req) 2464 2327 __must_hold(&req->ctx->completion_lock) 2465 2328 { 2329 + struct io_kiocb *link = NULL; 2466 2330 bool posted = false; 2467 2331 2468 2332 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2469 - struct io_kiocb *link = req->link; 2470 - 2333 + link = req->link; 2471 2334 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2472 2335 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2473 2336 io_remove_next_linked(req); 2474 - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ 2475 - io_fill_cqe_req(link, -ECANCELED, 0); 2476 - io_put_req_deferred(link); 2337 + io_req_tw_post_queue(link, -ECANCELED, 0); 2477 2338 posted = true; 2478 2339 } 2479 2340 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2480 2341 struct io_ring_ctx *ctx = req->ctx; 2481 2342 2482 2343 spin_lock_irq(&ctx->timeout_lock); 2483 - posted = io_kill_linked_timeout(req); 2344 + link = io_disarm_linked_timeout(req); 2484 2345 spin_unlock_irq(&ctx->timeout_lock); 2346 + if (link) { 2347 + posted = true; 2348 + io_req_tw_post_queue(link, -ECANCELED, 0); 2349 + } 2485 2350 } 2486 2351 if (unlikely((req->flags & REQ_F_FAIL) && 2487 2352 !(req->flags & REQ_F_HARDLINK))) { ··· 2500 2361 
2501 2362 spin_lock(&ctx->completion_lock); 2502 2363 posted = io_disarm_next(req); 2503 - if (posted) 2504 - io_commit_cqring(ctx); 2364 + io_commit_cqring(ctx); 2505 2365 spin_unlock(&ctx->completion_lock); 2506 2366 if (posted) 2507 2367 io_cqring_ev_posted(ctx); ··· 2510 2372 { 2511 2373 struct io_kiocb *nxt; 2512 2374 2513 - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2514 - return NULL; 2515 2375 /* 2516 2376 * If LINK is set, we have dependent requests in this chain. If we 2517 2377 * didn't fail this request, queue the first one up, moving any other ··· 2527 2391 { 2528 2392 if (!ctx) 2529 2393 return; 2394 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 2395 + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 2530 2396 if (*locked) { 2531 2397 io_submit_flush_completions(ctx); 2532 2398 mutex_unlock(&ctx->uring_lock); ··· 2572 2434 if (likely(*uring_locked)) 2573 2435 req->io_task_work.func(req, uring_locked); 2574 2436 else 2575 - __io_req_complete_post(req, req->result, 2437 + __io_req_complete_post(req, req->cqe.res, 2576 2438 io_put_kbuf_comp(req)); 2577 2439 node = next; 2578 2440 } while (node); ··· 2613 2475 while (1) { 2614 2476 struct io_wq_work_node *node1, *node2; 2615 2477 2616 - if (!tctx->task_list.first && 2617 - !tctx->prior_task_list.first && uring_locked) 2618 - io_submit_flush_completions(ctx); 2619 - 2620 2478 spin_lock_irq(&tctx->task_lock); 2621 2479 node1 = tctx->prior_task_list.first; 2622 2480 node2 = tctx->task_list.first; ··· 2626 2492 2627 2493 if (node1) 2628 2494 handle_prev_tw_list(node1, &ctx, &uring_locked); 2629 - 2630 2495 if (node2) 2631 2496 handle_tw_list(node2, &ctx, &uring_locked); 2632 2497 cond_resched(); 2498 + 2499 + if (data_race(!tctx->task_list.first) && 2500 + data_race(!tctx->prior_task_list.first) && uring_locked) 2501 + io_submit_flush_completions(ctx); 2633 2502 } 2634 2503 2635 2504 ctx_flush_and_put(ctx, &uring_locked); ··· 2645 2508 static void io_req_task_work_add(struct io_kiocb 
*req, bool priority) 2646 2509 { 2647 2510 struct task_struct *tsk = req->task; 2511 + struct io_ring_ctx *ctx = req->ctx; 2648 2512 struct io_uring_task *tctx = tsk->io_uring; 2649 - enum task_work_notify_mode notify; 2650 2513 struct io_wq_work_node *node; 2651 2514 unsigned long flags; 2652 2515 bool running; ··· 2669 2532 if (running) 2670 2533 return; 2671 2534 2672 - /* 2673 - * SQPOLL kernel thread doesn't need notification, just a wakeup. For 2674 - * all other cases, use TWA_SIGNAL unconditionally to ensure we're 2675 - * processing task_work. There's no reliable way to tell if TWA_RESUME 2676 - * will do the job. 2677 - */ 2678 - notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 2679 - if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { 2680 - if (notify == TWA_NONE) 2681 - wake_up_process(tsk); 2535 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 2536 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 2537 + 2538 + if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method))) 2682 2539 return; 2683 - } 2684 2540 2685 2541 spin_lock_irqsave(&tctx->task_lock, flags); 2686 2542 tctx->task_running = false; ··· 2689 2559 } 2690 2560 } 2691 2561 2562 + static void io_req_tw_post(struct io_kiocb *req, bool *locked) 2563 + { 2564 + io_req_complete_post(req, req->cqe.res, req->cqe.flags); 2565 + } 2566 + 2567 + static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) 2568 + { 2569 + req->cqe.res = res; 2570 + req->cqe.flags = cflags; 2571 + req->io_task_work.func = io_req_tw_post; 2572 + io_req_task_work_add(req, false); 2573 + } 2574 + 2692 2575 static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2693 2576 { 2694 - struct io_ring_ctx *ctx = req->ctx; 2695 - 2696 2577 /* not needed for normal modes, but SQPOLL depends on it */ 2697 - io_tw_lock(ctx, locked); 2698 - io_req_complete_failed(req, req->result); 2578 + io_tw_lock(req->ctx, locked); 2579 + io_req_complete_failed(req, 
req->cqe.res); 2699 2580 } 2700 2581 2701 2582 static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2702 2583 { 2703 - struct io_ring_ctx *ctx = req->ctx; 2704 - 2705 - io_tw_lock(ctx, locked); 2584 + io_tw_lock(req->ctx, locked); 2706 2585 /* req->task == current here, checking PF_EXITING is safe */ 2707 2586 if (likely(!(req->task->flags & PF_EXITING))) 2708 - __io_queue_sqe(req); 2587 + io_queue_sqe(req); 2709 2588 else 2710 2589 io_req_complete_failed(req, -EFAULT); 2711 2590 } 2712 2591 2713 2592 static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2714 2593 { 2715 - req->result = ret; 2594 + req->cqe.res = ret; 2716 2595 req->io_task_work.func = io_req_task_cancel; 2717 2596 io_req_task_work_add(req, false); 2718 2597 } ··· 2734 2595 2735 2596 static void io_req_task_queue_reissue(struct io_kiocb *req) 2736 2597 { 2737 - req->io_task_work.func = io_queue_async_work; 2598 + req->io_task_work.func = io_queue_iowq; 2738 2599 io_req_task_work_add(req, false); 2739 2600 } 2740 2601 2741 - static inline void io_queue_next(struct io_kiocb *req) 2602 + static void io_queue_next(struct io_kiocb *req) 2742 2603 { 2743 2604 struct io_kiocb *nxt = io_req_find_next(req); 2744 2605 2745 2606 if (nxt) 2746 2607 io_req_task_queue(nxt); 2747 - } 2748 - 2749 - static void io_free_req(struct io_kiocb *req) 2750 - { 2751 - io_queue_next(req); 2752 - __io_free_req(req); 2753 - } 2754 - 2755 - static void io_free_req_work(struct io_kiocb *req, bool *locked) 2756 - { 2757 - io_free_req(req); 2758 2608 } 2759 2609 2760 2610 static void io_free_batch_list(struct io_ring_ctx *ctx, ··· 2757 2629 struct io_kiocb *req = container_of(node, struct io_kiocb, 2758 2630 comp_list); 2759 2631 2760 - if (unlikely(req->flags & REQ_F_REFCOUNT)) { 2761 - node = req->comp_list.next; 2762 - if (!req_ref_put_and_test(req)) 2763 - continue; 2632 + if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { 2633 + if (req->flags & REQ_F_REFCOUNT) { 2634 + node = 
req->comp_list.next; 2635 + if (!req_ref_put_and_test(req)) 2636 + continue; 2637 + } 2638 + if ((req->flags & REQ_F_POLLED) && req->apoll) { 2639 + struct async_poll *apoll = req->apoll; 2640 + 2641 + if (apoll->double_poll) 2642 + kfree(apoll->double_poll); 2643 + list_add(&apoll->poll.wait.entry, 2644 + &ctx->apoll_cache); 2645 + req->flags &= ~REQ_F_POLLED; 2646 + } 2647 + if (req->flags & IO_REQ_LINK_FLAGS) 2648 + io_queue_next(req); 2649 + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 2650 + io_clean_op(req); 2764 2651 } 2652 + if (!(req->flags & REQ_F_FIXED_FILE)) 2653 + io_put_file(req->file); 2765 2654 2766 2655 io_req_put_rsrc_locked(req, ctx); 2767 - io_queue_next(req); 2768 - io_dismantle_req(req); 2769 2656 2770 2657 if (req->task != task) { 2771 2658 if (task) ··· 2790 2647 } 2791 2648 task_refs++; 2792 2649 node = req->comp_list.next; 2793 - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 2650 + io_req_add_to_cache(req, ctx); 2794 2651 } while (node); 2795 2652 2796 2653 if (task) ··· 2810 2667 comp_list); 2811 2668 2812 2669 if (!(req->flags & REQ_F_CQE_SKIP)) 2813 - __io_fill_cqe_req(req, req->result, req->cflags); 2814 - if ((req->flags & REQ_F_POLLED) && req->apoll) { 2815 - struct async_poll *apoll = req->apoll; 2816 - 2817 - if (apoll->double_poll) 2818 - kfree(apoll->double_poll); 2819 - list_add(&apoll->poll.wait.entry, 2820 - &ctx->apoll_cache); 2821 - req->flags &= ~REQ_F_POLLED; 2822 - } 2670 + __io_fill_cqe_req_filled(ctx, req); 2823 2671 } 2824 2672 2825 2673 io_commit_cqring(ctx); ··· 2832 2698 struct io_kiocb *nxt = NULL; 2833 2699 2834 2700 if (req_ref_put_and_test(req)) { 2835 - nxt = io_req_find_next(req); 2836 - __io_free_req(req); 2701 + if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) 2702 + nxt = io_req_find_next(req); 2703 + io_free_req(req); 2837 2704 } 2838 2705 return nxt; 2839 2706 } 2840 2707 2841 2708 static inline void io_put_req(struct io_kiocb *req) 2842 2709 { 2843 - if (req_ref_put_and_test(req)) 2844 
- io_free_req(req); 2845 - } 2846 - 2847 - static inline void io_put_req_deferred(struct io_kiocb *req) 2848 - { 2849 2710 if (req_ref_put_and_test(req)) { 2850 - req->io_task_work.func = io_free_req_work; 2851 - io_req_task_work_add(req, false); 2711 + io_queue_next(req); 2712 + io_free_req(req); 2852 2713 } 2853 2714 } 2854 2715 ··· 2929 2800 nr_events++; 2930 2801 if (unlikely(req->flags & REQ_F_CQE_SKIP)) 2931 2802 continue; 2932 - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); 2803 + __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); 2933 2804 } 2934 2805 2935 2806 if (unlikely(!nr_events)) ··· 2975 2846 { 2976 2847 unsigned int nr_events = 0; 2977 2848 int ret = 0; 2849 + unsigned long check_cq; 2978 2850 2979 - /* 2980 - * We disallow the app entering submit/complete with polling, but we 2981 - * still need to lock the ring to prevent racing with polled issue 2982 - * that got punted to a workqueue. 2983 - */ 2984 - mutex_lock(&ctx->uring_lock); 2985 2851 /* 2986 2852 * Don't enter poll loop if we already have events pending. 2987 2853 * If we do, we can potentially be spinning for commands that 2988 2854 * already triggered a CQE (eg in error). 2989 2855 */ 2990 - if (test_bit(0, &ctx->check_cq_overflow)) 2856 + check_cq = READ_ONCE(ctx->check_cq); 2857 + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 2991 2858 __io_cqring_overflow_flush(ctx, false); 2992 2859 if (io_cqring_events(ctx)) 2993 - goto out; 2860 + return 0; 2861 + 2862 + /* 2863 + * Similarly do not spin if we have not informed the user of any 2864 + * dropped CQE. 
2865 + */ 2866 + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) 2867 + return -EBADR; 2868 + 2994 2869 do { 2995 2870 /* 2996 2871 * If a submit got punted to a workqueue, we can have the ··· 3024 2891 nr_events += ret; 3025 2892 ret = 0; 3026 2893 } while (nr_events < min && !need_resched()); 3027 - out: 3028 - mutex_unlock(&ctx->uring_lock); 2894 + 3029 2895 return ret; 3030 2896 } 3031 2897 ··· 3097 2965 } else { 3098 2966 fsnotify_access(req->file); 3099 2967 } 3100 - if (unlikely(res != req->result)) { 2968 + if (unlikely(res != req->cqe.res)) { 3101 2969 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 3102 2970 io_rw_should_reissue(req)) { 3103 2971 req->flags |= REQ_F_REISSUE; 3104 2972 return true; 3105 2973 } 3106 2974 req_set_fail(req); 3107 - req->result = res; 2975 + req->cqe.res = res; 3108 2976 } 3109 2977 return false; 3110 2978 } 3111 2979 3112 2980 static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) 3113 2981 { 3114 - int res = req->result; 2982 + int res = req->cqe.res; 3115 2983 3116 2984 if (*locked) { 3117 2985 io_req_complete_state(req, res, io_put_kbuf(req, 0)); ··· 3127 2995 { 3128 2996 if (__io_complete_rw_common(req, res)) 3129 2997 return; 3130 - __io_req_complete(req, issue_flags, req->result, 2998 + __io_req_complete(req, issue_flags, req->cqe.res, 3131 2999 io_put_kbuf(req, issue_flags)); 3132 3000 } 3133 3001 ··· 3137 3005 3138 3006 if (__io_complete_rw_common(req, res)) 3139 3007 return; 3140 - req->result = res; 3008 + req->cqe.res = res; 3141 3009 req->io_task_work.func = io_req_task_complete; 3142 3010 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); 3143 3011 } ··· 3148 3016 3149 3017 if (kiocb->ki_flags & IOCB_WRITE) 3150 3018 kiocb_end_write(req); 3151 - if (unlikely(res != req->result)) { 3019 + if (unlikely(res != req->cqe.res)) { 3152 3020 if (res == -EAGAIN && io_rw_should_reissue(req)) { 3153 3021 req->flags |= REQ_F_REISSUE; 3154 3022 return; 3155 3023 } 3156 - 
req->result = res; 3024 + req->cqe.res = res; 3157 3025 } 3158 3026 3159 3027 /* order with io_iopoll_complete() checking ->iopoll_completed */ ··· 3263 3131 res |= FFS_ISREG; 3264 3132 if (__io_file_supports_nowait(file, mode)) 3265 3133 res |= FFS_NOWAIT; 3134 + if (io_file_need_scm(file)) 3135 + res |= FFS_SCM; 3266 3136 return res; 3267 3137 } 3268 3138 ··· 3296 3162 req->rw.addr = READ_ONCE(sqe->addr); 3297 3163 req->rw.len = READ_ONCE(sqe->len); 3298 3164 req->rw.flags = READ_ONCE(sqe->rw_flags); 3165 + /* used for fixed read/write too - just read unconditionally */ 3299 3166 req->buf_index = READ_ONCE(sqe->buf_index); 3300 3167 return 0; 3301 3168 } ··· 3445 3310 return __io_import_fixed(req, rw, iter, imu); 3446 3311 } 3447 3312 3448 - static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3313 + static int io_buffer_add_list(struct io_ring_ctx *ctx, 3314 + struct io_buffer_list *bl, unsigned int bgid) 3449 3315 { 3450 - if (needs_lock) 3451 - mutex_unlock(&ctx->uring_lock); 3452 - } 3453 - 3454 - static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3455 - { 3456 - /* 3457 - * "Normal" inline submissions always hold the uring_lock, since we 3458 - * grab it from the system call. Same is true for the SQPOLL offload. 3459 - * The only exception is when we've detached the request and issue it 3460 - * from an async worker thread, grab the lock for that case. 
3461 - */ 3462 - if (needs_lock) 3463 - mutex_lock(&ctx->uring_lock); 3464 - } 3465 - 3466 - static void io_buffer_add_list(struct io_ring_ctx *ctx, 3467 - struct io_buffer_list *bl, unsigned int bgid) 3468 - { 3469 - struct list_head *list; 3470 - 3471 - list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; 3472 - INIT_LIST_HEAD(&bl->buf_list); 3473 3316 bl->bgid = bgid; 3474 - list_add(&bl->list, list); 3317 + if (bgid < BGID_ARRAY) 3318 + return 0; 3319 + 3320 + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); 3475 3321 } 3476 3322 3477 - static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 3478 - int bgid, unsigned int issue_flags) 3323 + static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, 3324 + struct io_buffer_list *bl, 3325 + unsigned int issue_flags) 3479 3326 { 3480 - struct io_buffer *kbuf = req->kbuf; 3481 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 3327 + struct io_buffer *kbuf; 3328 + 3329 + if (list_empty(&bl->buf_list)) 3330 + return ERR_PTR(-ENOBUFS); 3331 + 3332 + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); 3333 + list_del(&kbuf->list); 3334 + if (*len > kbuf->len) 3335 + *len = kbuf->len; 3336 + req->flags |= REQ_F_BUFFER_SELECTED; 3337 + req->kbuf = kbuf; 3338 + req->buf_index = kbuf->bid; 3339 + io_ring_submit_unlock(req->ctx, issue_flags); 3340 + return u64_to_user_ptr(kbuf->addr); 3341 + } 3342 + 3343 + static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, 3344 + unsigned int issue_flags) 3345 + { 3482 3346 struct io_ring_ctx *ctx = req->ctx; 3483 3347 struct io_buffer_list *bl; 3484 3348 3485 - if (req->flags & REQ_F_BUFFER_SELECTED) 3486 - return kbuf; 3349 + io_ring_submit_lock(req->ctx, issue_flags); 3487 3350 3488 - io_ring_submit_lock(ctx, needs_lock); 3489 - 3490 - lockdep_assert_held(&ctx->uring_lock); 3491 - 3492 - bl = io_buffer_get_list(ctx, bgid); 3493 - if (bl && !list_empty(&bl->buf_list)) { 3494 - kbuf = 
list_first_entry(&bl->buf_list, struct io_buffer, list); 3495 - list_del(&kbuf->list); 3496 - if (*len > kbuf->len) 3497 - *len = kbuf->len; 3498 - req->flags |= REQ_F_BUFFER_SELECTED; 3499 - req->kbuf = kbuf; 3500 - } else { 3501 - kbuf = ERR_PTR(-ENOBUFS); 3351 + bl = io_buffer_get_list(ctx, req->buf_index); 3352 + if (unlikely(!bl)) { 3353 + io_ring_submit_unlock(req->ctx, issue_flags); 3354 + return ERR_PTR(-ENOBUFS); 3502 3355 } 3503 3356 3504 - io_ring_submit_unlock(req->ctx, needs_lock); 3505 - return kbuf; 3506 - } 3507 - 3508 - static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 3509 - unsigned int issue_flags) 3510 - { 3511 - struct io_buffer *kbuf; 3512 - u16 bgid; 3513 - 3514 - bgid = req->buf_index; 3515 - kbuf = io_buffer_select(req, len, bgid, issue_flags); 3516 - if (IS_ERR(kbuf)) 3517 - return kbuf; 3518 - return u64_to_user_ptr(kbuf->addr); 3357 + /* selection helpers drop the submit lock again, if needed */ 3358 + return io_provided_buffer_select(req, len, bl, issue_flags); 3519 3359 } 3520 3360 3521 3361 #ifdef CONFIG_COMPAT ··· 3500 3390 struct compat_iovec __user *uiov; 3501 3391 compat_ssize_t clen; 3502 3392 void __user *buf; 3503 - ssize_t len; 3393 + size_t len; 3504 3394 3505 3395 uiov = u64_to_user_ptr(req->rw.addr); 3506 3396 if (!access_ok(uiov, sizeof(*uiov))) ··· 3511 3401 return -EINVAL; 3512 3402 3513 3403 len = clen; 3514 - buf = io_rw_buffer_select(req, &len, issue_flags); 3404 + buf = io_buffer_select(req, &len, issue_flags); 3515 3405 if (IS_ERR(buf)) 3516 3406 return PTR_ERR(buf); 3407 + req->rw.addr = (unsigned long) buf; 3517 3408 iov[0].iov_base = buf; 3518 - iov[0].iov_len = (compat_size_t) len; 3409 + req->rw.len = iov[0].iov_len = (compat_size_t) len; 3519 3410 return 0; 3520 3411 } 3521 3412 #endif ··· 3534 3423 len = iov[0].iov_len; 3535 3424 if (len < 0) 3536 3425 return -EINVAL; 3537 - buf = io_rw_buffer_select(req, &len, issue_flags); 3426 + buf = io_buffer_select(req, &len, issue_flags); 3538 
3427 if (IS_ERR(buf)) 3539 3428 return PTR_ERR(buf); 3429 + req->rw.addr = (unsigned long) buf; 3540 3430 iov[0].iov_base = buf; 3541 - iov[0].iov_len = len; 3431 + req->rw.len = iov[0].iov_len = len; 3542 3432 return 0; 3543 3433 } 3544 3434 ··· 3547 3435 unsigned int issue_flags) 3548 3436 { 3549 3437 if (req->flags & REQ_F_BUFFER_SELECTED) { 3550 - struct io_buffer *kbuf = req->kbuf; 3551 - 3552 - iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3553 - iov[0].iov_len = kbuf->len; 3438 + iov[0].iov_base = u64_to_user_ptr(req->rw.addr); 3439 + iov[0].iov_len = req->rw.len; 3554 3440 return 0; 3555 3441 } 3556 3442 if (req->rw.len != 1) ··· 3560 3450 #endif 3561 3451 3562 3452 return __io_iov_buffer_select(req, iov, issue_flags); 3453 + } 3454 + 3455 + static inline bool io_do_buffer_select(struct io_kiocb *req) 3456 + { 3457 + if (!(req->flags & REQ_F_BUFFER_SELECT)) 3458 + return false; 3459 + return !(req->flags & REQ_F_BUFFER_SELECTED); 3563 3460 } 3564 3461 3565 3462 static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ··· 3587 3470 return NULL; 3588 3471 } 3589 3472 3590 - /* buffer index only valid with fixed read/write, or buffer select */ 3591 - if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) 3592 - return ERR_PTR(-EINVAL); 3593 - 3594 3473 buf = u64_to_user_ptr(req->rw.addr); 3595 3474 sqe_len = req->rw.len; 3596 3475 3597 3476 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3598 - if (req->flags & REQ_F_BUFFER_SELECT) { 3599 - buf = io_rw_buffer_select(req, &sqe_len, issue_flags); 3477 + if (io_do_buffer_select(req)) { 3478 + buf = io_buffer_select(req, &sqe_len, issue_flags); 3600 3479 if (IS_ERR(buf)) 3601 3480 return ERR_CAST(buf); 3481 + req->rw.addr = (unsigned long) buf; 3602 3482 req->rw.len = sqe_len; 3603 3483 } 3604 3484 ··· 3950 3836 kfree(iovec); 3951 3837 return ret; 3952 3838 } 3953 - req->result = iov_iter_count(&s->iter); 3839 + req->cqe.res = iov_iter_count(&s->iter); 3954 3840 3955 3841 if 
(force_nonblock) { 3956 3842 /* If the file doesn't support async, just async punt */ ··· 3966 3852 3967 3853 ppos = io_kiocb_update_pos(req); 3968 3854 3969 - ret = rw_verify_area(READ, req->file, ppos, req->result); 3855 + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); 3970 3856 if (unlikely(ret)) { 3971 3857 kfree(iovec); 3972 3858 return ret; ··· 3988 3874 ret = 0; 3989 3875 } else if (ret == -EIOCBQUEUED) { 3990 3876 goto out_free; 3991 - } else if (ret == req->result || ret <= 0 || !force_nonblock || 3877 + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || 3992 3878 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 3993 3879 /* read all, failed, already did sync or don't want to retry */ 3994 3880 goto done; ··· 4078 3964 kfree(iovec); 4079 3965 return ret; 4080 3966 } 4081 - req->result = iov_iter_count(&s->iter); 3967 + req->cqe.res = iov_iter_count(&s->iter); 4082 3968 4083 3969 if (force_nonblock) { 4084 3970 /* If the file doesn't support async, just async punt */ ··· 4098 3984 4099 3985 ppos = io_kiocb_update_pos(req); 4100 3986 4101 - ret = rw_verify_area(WRITE, req->file, ppos, req->result); 3987 + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); 4102 3988 if (unlikely(ret)) 4103 3989 goto out_free; 4104 3990 ··· 4162 4048 struct io_rename *ren = &req->rename; 4163 4049 const char __user *oldf, *newf; 4164 4050 4165 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4166 - return -EINVAL; 4167 - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4051 + if (sqe->buf_index || sqe->splice_fd_in) 4168 4052 return -EINVAL; 4169 4053 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4170 4054 return -EBADF; ··· 4211 4099 struct io_unlink *un = &req->unlink; 4212 4100 const char __user *fname; 4213 4101 4214 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4215 - return -EINVAL; 4216 - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 4217 - sqe->splice_fd_in) 4102 + if (sqe->off || sqe->len || 
sqe->buf_index || sqe->splice_fd_in) 4218 4103 return -EINVAL; 4219 4104 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4220 4105 return -EBADF; ··· 4257 4148 struct io_mkdir *mkd = &req->mkdir; 4258 4149 const char __user *fname; 4259 4150 4260 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4261 - return -EINVAL; 4262 - if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || 4263 - sqe->splice_fd_in) 4151 + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 4264 4152 return -EINVAL; 4265 4153 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4266 4154 return -EBADF; ··· 4297 4191 struct io_symlink *sl = &req->symlink; 4298 4192 const char __user *oldpath, *newpath; 4299 4193 4300 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4301 - return -EINVAL; 4302 - if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || 4303 - sqe->splice_fd_in) 4194 + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 4304 4195 return -EINVAL; 4305 4196 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4306 4197 return -EBADF; ··· 4343 4240 struct io_hardlink *lnk = &req->hardlink; 4344 4241 const char __user *oldf, *newf; 4345 4242 4346 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4347 - return -EINVAL; 4348 - if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 4243 + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 4349 4244 return -EINVAL; 4350 4245 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4351 4246 return -EBADF; ··· 4390 4289 const struct io_uring_sqe *sqe) 4391 4290 { 4392 4291 #if defined(CONFIG_NET) 4393 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4394 - return -EINVAL; 4395 - if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 4292 + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || 4396 4293 sqe->buf_index || sqe->splice_fd_in)) 4397 4294 return -EINVAL; 4398 4295 ··· 4429 4330 { 4430 4331 struct io_splice *sp = &req->splice; 4431 4332 
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 4432 - 4433 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4434 - return -EINVAL; 4435 4333 4436 4334 sp->len = READ_ONCE(sqe->len); 4437 4335 sp->flags = READ_ONCE(sqe->splice_flags); ··· 4528 4432 */ 4529 4433 static int io_nop(struct io_kiocb *req, unsigned int issue_flags) 4530 4434 { 4531 - struct io_ring_ctx *ctx = req->ctx; 4532 - 4533 - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4534 - return -EINVAL; 4535 - 4536 4435 __io_req_complete(req, issue_flags, 0, 0); 4537 4436 return 0; 4538 4437 } ··· 4535 4444 static int io_msg_ring_prep(struct io_kiocb *req, 4536 4445 const struct io_uring_sqe *sqe) 4537 4446 { 4538 - if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || 4539 - sqe->splice_fd_in || sqe->buf_index || sqe->personality)) 4447 + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || 4448 + sqe->buf_index || sqe->personality)) 4540 4449 return -EINVAL; 4541 4450 4542 4451 req->msg.user_data = READ_ONCE(sqe->off); ··· 4577 4486 4578 4487 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4579 4488 { 4580 - struct io_ring_ctx *ctx = req->ctx; 4581 - 4582 - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4583 - return -EINVAL; 4584 - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4585 - sqe->splice_fd_in)) 4489 + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) 4586 4490 return -EINVAL; 4587 4491 4588 4492 req->sync.flags = READ_ONCE(sqe->fsync_flags); ··· 4610 4524 static int io_fallocate_prep(struct io_kiocb *req, 4611 4525 const struct io_uring_sqe *sqe) 4612 4526 { 4613 - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || 4614 - sqe->splice_fd_in) 4615 - return -EINVAL; 4616 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4527 + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) 4617 4528 return -EINVAL; 4618 4529 4619 4530 req->sync.off = READ_ONCE(sqe->off); ··· 4641 4558 const char 
__user *fname; 4642 4559 int ret; 4643 4560 4644 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4645 - return -EINVAL; 4646 - if (unlikely(sqe->ioprio || sqe->buf_index)) 4561 + if (unlikely(sqe->buf_index)) 4647 4562 return -EINVAL; 4648 4563 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4649 4564 return -EBADF; ··· 4773 4692 struct io_provide_buf *p = &req->pbuf; 4774 4693 u64 tmp; 4775 4694 4776 - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4695 + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4777 4696 sqe->splice_fd_in) 4778 4697 return -EINVAL; 4779 4698 ··· 4817 4736 struct io_ring_ctx *ctx = req->ctx; 4818 4737 struct io_buffer_list *bl; 4819 4738 int ret = 0; 4820 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 4821 4739 4822 - io_ring_submit_lock(ctx, needs_lock); 4823 - 4824 - lockdep_assert_held(&ctx->uring_lock); 4740 + io_ring_submit_lock(ctx, issue_flags); 4825 4741 4826 4742 ret = -ENOENT; 4827 4743 bl = io_buffer_get_list(ctx, p->bgid); ··· 4829 4751 4830 4752 /* complete before unlock, IOPOLL may need the lock */ 4831 4753 __io_req_complete(req, issue_flags, ret, 0); 4832 - io_ring_submit_unlock(ctx, needs_lock); 4754 + io_ring_submit_unlock(ctx, issue_flags); 4833 4755 return 0; 4834 4756 } 4835 4757 ··· 4840 4762 struct io_provide_buf *p = &req->pbuf; 4841 4763 u64 tmp; 4842 4764 4843 - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 4765 + if (sqe->rw_flags || sqe->splice_fd_in) 4844 4766 return -EINVAL; 4845 4767 4846 4768 tmp = READ_ONCE(sqe->fd); ··· 4937 4859 return i ? 
0 : -ENOMEM; 4938 4860 } 4939 4861 4862 + static __cold int io_init_bl_list(struct io_ring_ctx *ctx) 4863 + { 4864 + int i; 4865 + 4866 + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), 4867 + GFP_KERNEL); 4868 + if (!ctx->io_bl) 4869 + return -ENOMEM; 4870 + 4871 + for (i = 0; i < BGID_ARRAY; i++) { 4872 + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); 4873 + ctx->io_bl[i].bgid = i; 4874 + } 4875 + 4876 + return 0; 4877 + } 4878 + 4940 4879 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4941 4880 { 4942 4881 struct io_provide_buf *p = &req->pbuf; 4943 4882 struct io_ring_ctx *ctx = req->ctx; 4944 4883 struct io_buffer_list *bl; 4945 4884 int ret = 0; 4946 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 4947 4885 4948 - io_ring_submit_lock(ctx, needs_lock); 4886 + io_ring_submit_lock(ctx, issue_flags); 4949 4887 4950 - lockdep_assert_held(&ctx->uring_lock); 4888 + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { 4889 + ret = io_init_bl_list(ctx); 4890 + if (ret) 4891 + goto err; 4892 + } 4951 4893 4952 4894 bl = io_buffer_get_list(ctx, p->bgid); 4953 4895 if (unlikely(!bl)) { ··· 4976 4878 ret = -ENOMEM; 4977 4879 goto err; 4978 4880 } 4979 - io_buffer_add_list(ctx, bl, p->bgid); 4881 + ret = io_buffer_add_list(ctx, bl, p->bgid); 4882 + if (ret) { 4883 + kfree(bl); 4884 + goto err; 4885 + } 4980 4886 } 4981 4887 4982 4888 ret = io_add_buffers(ctx, p, bl); ··· 4989 4887 req_set_fail(req); 4990 4888 /* complete before unlock, IOPOLL may need the lock */ 4991 4889 __io_req_complete(req, issue_flags, ret, 0); 4992 - io_ring_submit_unlock(ctx, needs_lock); 4890 + io_ring_submit_unlock(ctx, issue_flags); 4993 4891 return 0; 4994 4892 } 4995 4893 ··· 4997 4895 const struct io_uring_sqe *sqe) 4998 4896 { 4999 4897 #if defined(CONFIG_EPOLL) 5000 - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5001 - return -EINVAL; 5002 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4898 + if (sqe->buf_index || 
sqe->splice_fd_in) 5003 4899 return -EINVAL; 5004 4900 5005 4901 req->epoll.epfd = READ_ONCE(sqe->fd); ··· 5041 4941 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5042 4942 { 5043 4943 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 5044 - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) 5045 - return -EINVAL; 5046 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4944 + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) 5047 4945 return -EINVAL; 5048 4946 5049 4947 req->madvise.addr = READ_ONCE(sqe->addr); ··· 5074 4976 5075 4977 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5076 4978 { 5077 - if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) 5078 - return -EINVAL; 5079 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4979 + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) 5080 4980 return -EINVAL; 5081 4981 5082 4982 req->fadvise.offset = READ_ONCE(sqe->off); ··· 5110 5014 { 5111 5015 const char __user *path; 5112 5016 5113 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5114 - return -EINVAL; 5115 - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5017 + if (sqe->buf_index || sqe->splice_fd_in) 5116 5018 return -EINVAL; 5117 5019 if (req->flags & REQ_F_FIXED_FILE) 5118 5020 return -EBADF; ··· 5155 5061 5156 5062 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5157 5063 { 5158 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5159 - return -EINVAL; 5160 - if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 5161 - sqe->rw_flags || sqe->buf_index) 5064 + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) 5162 5065 return -EINVAL; 5163 5066 if (req->flags & REQ_F_FIXED_FILE) 5164 5067 return -EBADF; ··· 5221 5130 5222 5131 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5223 5132 { 5224 - struct io_ring_ctx *ctx = req->ctx; 
5225 - 5226 - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 5227 - return -EINVAL; 5228 - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 5229 - sqe->splice_fd_in)) 5133 + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) 5230 5134 return -EINVAL; 5231 5135 5232 5136 req->sync.off = READ_ONCE(sqe->off); ··· 5247 5161 } 5248 5162 5249 5163 #if defined(CONFIG_NET) 5164 + static bool io_net_retry(struct socket *sock, int flags) 5165 + { 5166 + if (!(flags & MSG_WAITALL)) 5167 + return false; 5168 + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 5169 + } 5170 + 5250 5171 static int io_setup_async_msg(struct io_kiocb *req, 5251 5172 struct io_async_msghdr *kmsg) 5252 5173 { ··· 5299 5206 { 5300 5207 struct io_sr_msg *sr = &req->sr_msg; 5301 5208 5302 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5209 + if (unlikely(sqe->file_index)) 5303 5210 return -EINVAL; 5304 5211 if (unlikely(sqe->addr2 || sqe->file_index)) 5305 5212 return -EINVAL; 5306 5213 5307 5214 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5308 5215 sr->len = READ_ONCE(sqe->len); 5216 + sr->flags = READ_ONCE(sqe->addr2); 5217 + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) 5218 + return -EINVAL; 5309 5219 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 5310 5220 if (sr->msg_flags & MSG_DONTWAIT) 5311 5221 req->flags |= REQ_F_NOWAIT; ··· 5317 5221 if (req->ctx->compat) 5318 5222 sr->msg_flags |= MSG_CMSG_COMPAT; 5319 5223 #endif 5224 + sr->done_io = 0; 5320 5225 return 0; 5321 5226 } 5322 5227 5323 5228 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 5324 5229 { 5325 5230 struct io_async_msghdr iomsg, *kmsg; 5231 + struct io_sr_msg *sr = &req->sr_msg; 5326 5232 struct socket *sock; 5327 5233 unsigned flags; 5328 5234 int min_ret = 0; ··· 5343 5245 kmsg = &iomsg; 5344 5246 } 5345 5247 5346 - flags = req->sr_msg.msg_flags; 5248 + if (!(req->flags & REQ_F_POLLED) && 5249 + (sr->flags & IORING_RECVSEND_POLL_FIRST)) 5250 + 
return io_setup_async_msg(req, kmsg); 5251 + 5252 + flags = sr->msg_flags; 5347 5253 if (issue_flags & IO_URING_F_NONBLOCK) 5348 5254 flags |= MSG_DONTWAIT; 5349 5255 if (flags & MSG_WAITALL) ··· 5360 5258 return io_setup_async_msg(req, kmsg); 5361 5259 if (ret == -ERESTARTSYS) 5362 5260 ret = -EINTR; 5261 + if (ret > 0 && io_net_retry(sock, flags)) { 5262 + sr->done_io += ret; 5263 + req->flags |= REQ_F_PARTIAL_IO; 5264 + return io_setup_async_msg(req, kmsg); 5265 + } 5363 5266 req_set_fail(req); 5364 5267 } 5365 5268 /* fast path, check for non-NULL to avoid function call */ 5366 5269 if (kmsg->free_iov) 5367 5270 kfree(kmsg->free_iov); 5368 5271 req->flags &= ~REQ_F_NEED_CLEANUP; 5272 + if (ret >= 0) 5273 + ret += sr->done_io; 5274 + else if (sr->done_io) 5275 + ret = sr->done_io; 5369 5276 __io_req_complete(req, issue_flags, ret, 0); 5370 5277 return 0; 5371 5278 } ··· 5389 5278 int min_ret = 0; 5390 5279 int ret; 5391 5280 5281 + if (!(req->flags & REQ_F_POLLED) && 5282 + (sr->flags & IORING_RECVSEND_POLL_FIRST)) 5283 + return -EAGAIN; 5284 + 5392 5285 sock = sock_from_file(req->file); 5393 5286 if (unlikely(!sock)) 5394 5287 return -ENOTSOCK; ··· 5406 5291 msg.msg_controllen = 0; 5407 5292 msg.msg_namelen = 0; 5408 5293 5409 - flags = req->sr_msg.msg_flags; 5294 + flags = sr->msg_flags; 5410 5295 if (issue_flags & IO_URING_F_NONBLOCK) 5411 5296 flags |= MSG_DONTWAIT; 5412 5297 if (flags & MSG_WAITALL) ··· 5419 5304 return -EAGAIN; 5420 5305 if (ret == -ERESTARTSYS) 5421 5306 ret = -EINTR; 5307 + if (ret > 0 && io_net_retry(sock, flags)) { 5308 + sr->len -= ret; 5309 + sr->buf += ret; 5310 + sr->done_io += ret; 5311 + req->flags |= REQ_F_PARTIAL_IO; 5312 + return -EAGAIN; 5313 + } 5422 5314 req_set_fail(req); 5423 5315 } 5316 + if (ret >= 0) 5317 + ret += sr->done_io; 5318 + else if (sr->done_io) 5319 + ret = sr->done_io; 5424 5320 __io_req_complete(req, issue_flags, ret, 0); 5425 5321 return 0; 5426 5322 } ··· 5523 5397 return __io_recvmsg_copy_hdr(req, 
iomsg); 5524 5398 } 5525 5399 5526 - static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 5527 - unsigned int issue_flags) 5528 - { 5529 - struct io_sr_msg *sr = &req->sr_msg; 5530 - 5531 - return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); 5532 - } 5533 - 5534 5400 static int io_recvmsg_prep_async(struct io_kiocb *req) 5535 5401 { 5536 5402 int ret; ··· 5537 5419 { 5538 5420 struct io_sr_msg *sr = &req->sr_msg; 5539 5421 5540 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5422 + if (unlikely(sqe->file_index)) 5541 5423 return -EINVAL; 5542 5424 if (unlikely(sqe->addr2 || sqe->file_index)) 5543 5425 return -EINVAL; 5544 5426 5545 5427 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5546 5428 sr->len = READ_ONCE(sqe->len); 5547 - sr->bgid = READ_ONCE(sqe->buf_group); 5429 + sr->flags = READ_ONCE(sqe->addr2); 5430 + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) 5431 + return -EINVAL; 5548 5432 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 5549 5433 if (sr->msg_flags & MSG_DONTWAIT) 5550 5434 req->flags |= REQ_F_NOWAIT; ··· 5559 5439 return 0; 5560 5440 } 5561 5441 5562 - static bool io_net_retry(struct socket *sock, int flags) 5563 - { 5564 - if (!(flags & MSG_WAITALL)) 5565 - return false; 5566 - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 5567 - } 5568 - 5569 5442 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 5570 5443 { 5571 5444 struct io_async_msghdr iomsg, *kmsg; 5572 5445 struct io_sr_msg *sr = &req->sr_msg; 5573 5446 struct socket *sock; 5574 - struct io_buffer *kbuf; 5575 5447 unsigned flags; 5576 5448 int ret, min_ret = 0; 5577 5449 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ··· 5581 5469 kmsg = &iomsg; 5582 5470 } 5583 5471 5584 - if (req->flags & REQ_F_BUFFER_SELECT) { 5585 - kbuf = io_recv_buffer_select(req, issue_flags); 5586 - if (IS_ERR(kbuf)) 5587 - return PTR_ERR(kbuf); 5588 - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 5589 - 
kmsg->fast_iov[0].iov_len = req->sr_msg.len; 5590 - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 5591 - 1, req->sr_msg.len); 5472 + if (!(req->flags & REQ_F_POLLED) && 5473 + (sr->flags & IORING_RECVSEND_POLL_FIRST)) 5474 + return io_setup_async_msg(req, kmsg); 5475 + 5476 + if (io_do_buffer_select(req)) { 5477 + void __user *buf; 5478 + 5479 + buf = io_buffer_select(req, &sr->len, issue_flags); 5480 + if (IS_ERR(buf)) 5481 + return PTR_ERR(buf); 5482 + kmsg->fast_iov[0].iov_base = buf; 5483 + kmsg->fast_iov[0].iov_len = sr->len; 5484 + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, 5485 + sr->len); 5592 5486 } 5593 5487 5594 - flags = req->sr_msg.msg_flags; 5488 + flags = sr->msg_flags; 5595 5489 if (force_nonblock) 5596 5490 flags |= MSG_DONTWAIT; 5597 5491 if (flags & MSG_WAITALL) 5598 5492 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 5599 5493 5600 - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 5601 - kmsg->uaddr, flags); 5494 + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); 5602 5495 if (ret < min_ret) { 5603 5496 if (ret == -EAGAIN && force_nonblock) 5604 5497 return io_setup_async_msg(req, kmsg); ··· 5633 5516 5634 5517 static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 5635 5518 { 5636 - struct io_buffer *kbuf; 5637 5519 struct io_sr_msg *sr = &req->sr_msg; 5638 5520 struct msghdr msg; 5639 - void __user *buf = sr->buf; 5640 5521 struct socket *sock; 5641 5522 struct iovec iov; 5642 5523 unsigned flags; 5643 5524 int ret, min_ret = 0; 5644 5525 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5645 5526 5527 + if (!(req->flags & REQ_F_POLLED) && 5528 + (sr->flags & IORING_RECVSEND_POLL_FIRST)) 5529 + return -EAGAIN; 5530 + 5646 5531 sock = sock_from_file(req->file); 5647 5532 if (unlikely(!sock)) 5648 5533 return -ENOTSOCK; 5649 5534 5650 - if (req->flags & REQ_F_BUFFER_SELECT) { 5651 - kbuf = io_recv_buffer_select(req, issue_flags); 5652 - if (IS_ERR(kbuf)) 5653 - 
return PTR_ERR(kbuf); 5654 - buf = u64_to_user_ptr(kbuf->addr); 5535 + if (io_do_buffer_select(req)) { 5536 + void __user *buf; 5537 + 5538 + buf = io_buffer_select(req, &sr->len, issue_flags); 5539 + if (IS_ERR(buf)) 5540 + return PTR_ERR(buf); 5541 + sr->buf = buf; 5655 5542 } 5656 5543 5657 - ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 5544 + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); 5658 5545 if (unlikely(ret)) 5659 5546 goto out_free; 5660 5547 ··· 5669 5548 msg.msg_iocb = NULL; 5670 5549 msg.msg_flags = 0; 5671 5550 5672 - flags = req->sr_msg.msg_flags; 5551 + flags = sr->msg_flags; 5673 5552 if (force_nonblock) 5674 5553 flags |= MSG_DONTWAIT; 5675 5554 if (flags & MSG_WAITALL) ··· 5706 5585 { 5707 5586 struct io_accept *accept = &req->accept; 5708 5587 5709 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5710 - return -EINVAL; 5711 - if (sqe->ioprio || sqe->len || sqe->buf_index) 5588 + if (sqe->len || sqe->buf_index) 5712 5589 return -EINVAL; 5713 5590 5714 5591 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); ··· 5772 5653 { 5773 5654 struct io_connect *conn = &req->connect; 5774 5655 5775 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5776 - return -EINVAL; 5777 - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || 5778 - sqe->splice_fd_in) 5656 + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) 5779 5657 return -EINVAL; 5780 5658 5781 5659 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); ··· 5895 5779 struct io_ring_ctx *ctx = req->ctx; 5896 5780 struct hlist_head *list; 5897 5781 5898 - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5782 + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; 5899 5783 hlist_add_head(&req->hash_node, list); 5900 5784 } 5901 5785 ··· 5960 5844 * 5961 5845 * Returns a negative error on failure. 
>0 when no action require, which is 5962 5846 * either spurious wakeup or multishot CQE is served. 0 when it's done with 5963 - * the request, then the mask is stored in req->result. 5847 + * the request, then the mask is stored in req->cqe.res. 5964 5848 */ 5965 5849 static int io_poll_check_events(struct io_kiocb *req, bool locked) 5966 5850 { ··· 5969 5853 5970 5854 /* req->task == current here, checking PF_EXITING is safe */ 5971 5855 if (unlikely(req->task->flags & PF_EXITING)) 5972 - io_poll_mark_cancelled(req); 5856 + return -ECANCELED; 5973 5857 5974 5858 do { 5975 5859 v = atomic_read(&req->poll_refs); ··· 5980 5864 if (v & IO_POLL_CANCEL_FLAG) 5981 5865 return -ECANCELED; 5982 5866 5983 - if (!req->result) { 5867 + if (!req->cqe.res) { 5984 5868 struct poll_table_struct pt = { ._key = req->apoll_events }; 5985 5869 unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; 5986 5870 5987 5871 if (unlikely(!io_assign_file(req, flags))) 5988 5872 return -EBADF; 5989 - req->result = vfs_poll(req->file, &pt) & req->apoll_events; 5873 + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; 5990 5874 } 5991 5875 5992 5876 /* multishot, just fill an CQE and proceed */ 5993 - if (req->result && !(req->apoll_events & EPOLLONESHOT)) { 5994 - __poll_t mask = mangle_poll(req->result & req->apoll_events); 5877 + if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) { 5878 + __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); 5995 5879 bool filled; 5996 5880 5997 5881 spin_lock(&ctx->completion_lock); 5998 - filled = io_fill_cqe_aux(ctx, req->user_data, mask, 5882 + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask, 5999 5883 IORING_CQE_F_MORE); 6000 5884 io_commit_cqring(ctx); 6001 5885 spin_unlock(&ctx->completion_lock); 6002 5886 if (unlikely(!filled)) 6003 5887 return -ECANCELED; 6004 5888 io_cqring_ev_posted(ctx); 6005 - } else if (req->result) { 5889 + } else if (req->cqe.res) { 6006 5890 return 0; 6007 5891 } 6008 5892 ··· 6025 5909 return; 
6026 5910 6027 5911 if (!ret) { 6028 - req->result = mangle_poll(req->result & req->poll.events); 5912 + req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); 6029 5913 } else { 6030 - req->result = ret; 5914 + req->cqe.res = ret; 6031 5915 req_set_fail(req); 6032 5916 } 6033 5917 6034 5918 io_poll_remove_entries(req); 6035 5919 spin_lock(&ctx->completion_lock); 6036 5920 hash_del(&req->hash_node); 6037 - __io_req_complete_post(req, req->result, 0); 5921 + __io_req_complete_post(req, req->cqe.res, 0); 6038 5922 io_commit_cqring(ctx); 6039 5923 spin_unlock(&ctx->completion_lock); 6040 5924 io_cqring_ev_posted(ctx); ··· 6062 5946 6063 5947 static void __io_poll_execute(struct io_kiocb *req, int mask, int events) 6064 5948 { 6065 - req->result = mask; 5949 + req->cqe.res = mask; 6066 5950 /* 6067 5951 * This is useful for poll that is armed on behalf of another 6068 5952 * request, and where the wakeup path could be on a different ··· 6075 5959 else 6076 5960 req->io_task_work.func = io_apoll_task_func; 6077 5961 6078 - trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); 5962 + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); 6079 5963 io_req_task_work_add(req, false); 6080 5964 } 6081 5965 ··· 6214 6098 int v; 6215 6099 6216 6100 INIT_HLIST_NODE(&req->hash_node); 6101 + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); 6217 6102 io_init_poll_iocb(poll, mask, io_poll_wake); 6218 6103 poll->file = req->file; 6219 6104 ··· 6290 6173 6291 6174 if (!def->pollin && !def->pollout) 6292 6175 return IO_APOLL_ABORTED; 6293 - if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) 6176 + if (!file_can_poll(req->file)) 6177 + return IO_APOLL_ABORTED; 6178 + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) 6294 6179 return IO_APOLL_ABORTED; 6295 6180 6296 6181 if (def->pollin) { ··· 6307 6188 } 6308 6189 if (def->poll_exclusive) 6309 6190 mask |= EPOLLEXCLUSIVE; 6310 - if (!(issue_flags & 
IO_URING_F_UNLOCKED) && 6311 - !list_empty(&ctx->apoll_cache)) { 6191 + if (req->flags & REQ_F_POLLED) { 6192 + apoll = req->apoll; 6193 + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && 6194 + !list_empty(&ctx->apoll_cache)) { 6312 6195 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, 6313 6196 poll.wait.entry); 6314 6197 list_del_init(&apoll->poll.wait.entry); ··· 6330 6209 if (ret || ipt.error) 6331 6210 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; 6332 6211 6333 - trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, 6212 + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, 6334 6213 mask, apoll->poll.events); 6335 6214 return IO_APOLL_OK; 6336 6215 } ··· 6363 6242 return found; 6364 6243 } 6365 6244 6366 - static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 6367 - bool poll_only) 6245 + static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, 6246 + struct io_cancel_data *cd) 6368 6247 __must_hold(&ctx->completion_lock) 6369 6248 { 6370 6249 struct hlist_head *list; 6371 6250 struct io_kiocb *req; 6372 6251 6373 - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 6252 + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; 6374 6253 hlist_for_each_entry(req, list, hash_node) { 6375 - if (sqe_addr != req->user_data) 6254 + if (cd->data != req->cqe.user_data) 6376 6255 continue; 6377 6256 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 6378 6257 continue; 6258 + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { 6259 + if (cd->seq == req->work.cancel_seq) 6260 + continue; 6261 + req->work.cancel_seq = cd->seq; 6262 + } 6379 6263 return req; 6264 + } 6265 + return NULL; 6266 + } 6267 + 6268 + static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, 6269 + struct io_cancel_data *cd) 6270 + __must_hold(&ctx->completion_lock) 6271 + { 6272 + struct io_kiocb *req; 6273 + int i; 6274 + 6275 + for (i = 0; i < (1U << 
ctx->cancel_hash_bits); i++) { 6276 + struct hlist_head *list; 6277 + 6278 + list = &ctx->cancel_hash[i]; 6279 + hlist_for_each_entry(req, list, hash_node) { 6280 + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 6281 + req->file != cd->file) 6282 + continue; 6283 + if (cd->seq == req->work.cancel_seq) 6284 + continue; 6285 + req->work.cancel_seq = cd->seq; 6286 + return req; 6287 + } 6380 6288 } 6381 6289 return NULL; 6382 6290 } ··· 6420 6270 return true; 6421 6271 } 6422 6272 6423 - static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 6424 - bool poll_only) 6273 + static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) 6425 6274 __must_hold(&ctx->completion_lock) 6426 6275 { 6427 - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); 6276 + struct io_kiocb *req; 6428 6277 6278 + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) 6279 + req = io_poll_file_find(ctx, cd); 6280 + else 6281 + req = io_poll_find(ctx, false, cd); 6429 6282 if (!req) 6430 6283 return -ENOENT; 6431 6284 io_poll_cancel_req(req); ··· 6455 6302 struct io_poll_update *upd = &req->poll_update; 6456 6303 u32 flags; 6457 6304 6458 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6459 - return -EINVAL; 6460 - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 6305 + if (sqe->buf_index || sqe->splice_fd_in) 6461 6306 return -EINVAL; 6462 6307 flags = READ_ONCE(sqe->len); 6463 6308 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | ··· 6485 6334 struct io_poll_iocb *poll = &req->poll; 6486 6335 u32 flags; 6487 6336 6488 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6489 - return -EINVAL; 6490 - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 6337 + if (sqe->buf_index || sqe->off || sqe->addr) 6491 6338 return -EINVAL; 6492 6339 flags = READ_ONCE(sqe->len); 6493 6340 if (flags & ~IORING_POLL_ADD_MULTI) ··· 6515 6366 6516 6367 static int io_poll_update(struct io_kiocb *req, unsigned int 
issue_flags) 6517 6368 { 6369 + struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; 6518 6370 struct io_ring_ctx *ctx = req->ctx; 6519 6371 struct io_kiocb *preq; 6520 6372 int ret2, ret = 0; 6521 6373 bool locked; 6522 6374 6523 6375 spin_lock(&ctx->completion_lock); 6524 - preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 6376 + preq = io_poll_find(ctx, true, &cd); 6525 6377 if (!preq || !io_poll_disarm(preq)) { 6526 6378 spin_unlock(&ctx->completion_lock); 6527 6379 ret = preq ? -EALREADY : -ENOENT; ··· 6538 6388 preq->poll.events |= IO_POLL_UNMASK; 6539 6389 } 6540 6390 if (req->poll_update.update_user_data) 6541 - preq->user_data = req->poll_update.new_user_data; 6391 + preq->cqe.user_data = req->poll_update.new_user_data; 6542 6392 6543 6393 ret2 = io_poll_add(preq, issue_flags); 6544 6394 /* successfully updated, don't complete poll request */ ··· 6547 6397 } 6548 6398 6549 6399 req_set_fail(preq); 6550 - preq->result = -ECANCELED; 6400 + preq->cqe.res = -ECANCELED; 6551 6401 locked = !(issue_flags & IO_URING_F_UNLOCKED); 6552 6402 io_req_task_complete(preq, &locked); 6553 6403 out: ··· 6575 6425 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) 6576 6426 req_set_fail(req); 6577 6427 6578 - req->result = -ETIME; 6428 + req->cqe.res = -ETIME; 6579 6429 req->io_task_work.func = io_req_task_complete; 6580 6430 io_req_task_work_add(req, false); 6581 6431 return HRTIMER_NORESTART; 6582 6432 } 6583 6433 6584 6434 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 6585 - __u64 user_data) 6435 + struct io_cancel_data *cd) 6586 6436 __must_hold(&ctx->timeout_lock) 6587 6437 { 6588 6438 struct io_timeout_data *io; ··· 6590 6440 bool found = false; 6591 6441 6592 6442 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 6593 - found = user_data == req->user_data; 6594 - if (found) 6595 - break; 6443 + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 6444 + cd->data != req->cqe.user_data) 6445 + continue; 6446 + 
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 6447 + if (cd->seq == req->work.cancel_seq) 6448 + continue; 6449 + req->work.cancel_seq = cd->seq; 6450 + } 6451 + found = true; 6452 + break; 6596 6453 } 6597 6454 if (!found) 6598 6455 return ERR_PTR(-ENOENT); ··· 6611 6454 return req; 6612 6455 } 6613 6456 6614 - static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 6457 + static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) 6615 6458 __must_hold(&ctx->completion_lock) 6616 - __must_hold(&ctx->timeout_lock) 6617 6459 { 6618 - struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6460 + struct io_kiocb *req; 6461 + 6462 + spin_lock_irq(&ctx->timeout_lock); 6463 + req = io_timeout_extract(ctx, cd); 6464 + spin_unlock_irq(&ctx->timeout_lock); 6619 6465 6620 6466 if (IS_ERR(req)) 6621 6467 return PTR_ERR(req); ··· 6651 6491 bool found = false; 6652 6492 6653 6493 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 6654 - found = user_data == req->user_data; 6494 + found = user_data == req->cqe.user_data; 6655 6495 if (found) 6656 6496 break; 6657 6497 } ··· 6671 6511 struct timespec64 *ts, enum hrtimer_mode mode) 6672 6512 __must_hold(&ctx->timeout_lock) 6673 6513 { 6674 - struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6514 + struct io_cancel_data cd = { .data = user_data, }; 6515 + struct io_kiocb *req = io_timeout_extract(ctx, &cd); 6675 6516 struct io_timeout_data *data; 6676 6517 6677 6518 if (IS_ERR(req)) ··· 6692 6531 { 6693 6532 struct io_timeout_rem *tr = &req->timeout_rem; 6694 6533 6695 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6696 - return -EINVAL; 6697 6534 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6698 6535 return -EINVAL; 6699 - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) 6536 + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) 6700 6537 return -EINVAL; 6701 6538 6702 6539 tr->ltimeout = false; 
··· 6735 6576 int ret; 6736 6577 6737 6578 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6579 + struct io_cancel_data cd = { .data = tr->addr, }; 6580 + 6738 6581 spin_lock(&ctx->completion_lock); 6739 - spin_lock_irq(&ctx->timeout_lock); 6740 - ret = io_timeout_cancel(ctx, tr->addr); 6741 - spin_unlock_irq(&ctx->timeout_lock); 6582 + ret = io_timeout_cancel(ctx, &cd); 6742 6583 spin_unlock(&ctx->completion_lock); 6743 6584 } else { 6744 6585 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); ··· 6764 6605 unsigned flags; 6765 6606 u32 off = READ_ONCE(sqe->off); 6766 6607 6767 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6768 - return -EINVAL; 6769 - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || 6770 - sqe->splice_fd_in) 6608 + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) 6771 6609 return -EINVAL; 6772 6610 if (off && is_timeout_link) 6773 6611 return -EINVAL; ··· 6863 6707 return 0; 6864 6708 } 6865 6709 6866 - struct io_cancel_data { 6867 - struct io_ring_ctx *ctx; 6868 - u64 user_data; 6869 - }; 6870 - 6871 6710 static bool io_cancel_cb(struct io_wq_work *work, void *data) 6872 6711 { 6873 6712 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6874 6713 struct io_cancel_data *cd = data; 6875 6714 6876 - return req->ctx == cd->ctx && req->user_data == cd->user_data; 6715 + if (req->ctx != cd->ctx) 6716 + return false; 6717 + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { 6718 + ; 6719 + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { 6720 + if (req->file != cd->file) 6721 + return false; 6722 + } else { 6723 + if (req->cqe.user_data != cd->data) 6724 + return false; 6725 + } 6726 + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 6727 + if (cd->seq == req->work.cancel_seq) 6728 + return false; 6729 + req->work.cancel_seq = cd->seq; 6730 + } 6731 + return true; 6877 6732 } 6878 6733 6879 - static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 6880 - struct 
io_ring_ctx *ctx) 6734 + static int io_async_cancel_one(struct io_uring_task *tctx, 6735 + struct io_cancel_data *cd) 6881 6736 { 6882 - struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 6883 6737 enum io_wq_cancel cancel_ret; 6884 6738 int ret = 0; 6739 + bool all; 6885 6740 6886 6741 if (!tctx || !tctx->io_wq) 6887 6742 return -ENOENT; 6888 6743 6889 - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6744 + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); 6745 + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); 6890 6746 switch (cancel_ret) { 6891 6747 case IO_WQ_CANCEL_OK: 6892 6748 ret = 0; ··· 6914 6746 return ret; 6915 6747 } 6916 6748 6917 - static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6749 + static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) 6918 6750 { 6919 6751 struct io_ring_ctx *ctx = req->ctx; 6920 6752 int ret; 6921 6753 6922 6754 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 6923 6755 6924 - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6756 + ret = io_async_cancel_one(req->task->io_uring, cd); 6925 6757 /* 6926 6758 * Fall-through even for -EALREADY, as we may have poll armed 6927 6759 * that need unarming. 
··· 6930 6762 return 0; 6931 6763 6932 6764 spin_lock(&ctx->completion_lock); 6933 - ret = io_poll_cancel(ctx, sqe_addr, false); 6765 + ret = io_poll_cancel(ctx, cd); 6934 6766 if (ret != -ENOENT) 6935 6767 goto out; 6936 - 6937 - spin_lock_irq(&ctx->timeout_lock); 6938 - ret = io_timeout_cancel(ctx, sqe_addr); 6939 - spin_unlock_irq(&ctx->timeout_lock); 6768 + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) 6769 + ret = io_timeout_cancel(ctx, cd); 6940 6770 out: 6941 6771 spin_unlock(&ctx->completion_lock); 6942 6772 return ret; 6943 6773 } 6944 6774 6775 + #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ 6776 + IORING_ASYNC_CANCEL_ANY) 6777 + 6945 6778 static int io_async_cancel_prep(struct io_kiocb *req, 6946 6779 const struct io_uring_sqe *sqe) 6947 6780 { 6948 - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6781 + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) 6949 6782 return -EINVAL; 6950 - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6951 - return -EINVAL; 6952 - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 6953 - sqe->splice_fd_in) 6783 + if (sqe->off || sqe->len || sqe->splice_fd_in) 6954 6784 return -EINVAL; 6955 6785 6956 6786 req->cancel.addr = READ_ONCE(sqe->addr); 6787 + req->cancel.flags = READ_ONCE(sqe->cancel_flags); 6788 + if (req->cancel.flags & ~CANCEL_FLAGS) 6789 + return -EINVAL; 6790 + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { 6791 + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) 6792 + return -EINVAL; 6793 + req->cancel.fd = READ_ONCE(sqe->fd); 6794 + } 6795 + 6957 6796 return 0; 6958 6797 } 6959 6798 6960 - static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6799 + static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, 6800 + unsigned int issue_flags) 6961 6801 { 6962 - struct io_ring_ctx *ctx = req->ctx; 6963 - u64 sqe_addr = req->cancel.addr; 6964 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 6802 + 
bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); 6803 + struct io_ring_ctx *ctx = cd->ctx; 6965 6804 struct io_tctx_node *node; 6966 - int ret; 6805 + int ret, nr = 0; 6967 6806 6968 - ret = io_try_cancel_userdata(req, sqe_addr); 6969 - if (ret != -ENOENT) 6970 - goto done; 6807 + do { 6808 + ret = io_try_cancel(req, cd); 6809 + if (ret == -ENOENT) 6810 + break; 6811 + if (!all) 6812 + return ret; 6813 + nr++; 6814 + } while (1); 6971 6815 6972 6816 /* slow path, try all io-wq's */ 6973 - io_ring_submit_lock(ctx, needs_lock); 6817 + io_ring_submit_lock(ctx, issue_flags); 6974 6818 ret = -ENOENT; 6975 6819 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 6976 6820 struct io_uring_task *tctx = node->task->io_uring; 6977 6821 6978 - ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 6979 - if (ret != -ENOENT) 6980 - break; 6822 + ret = io_async_cancel_one(tctx, cd); 6823 + if (ret != -ENOENT) { 6824 + if (!all) 6825 + break; 6826 + nr++; 6827 + } 6981 6828 } 6982 - io_ring_submit_unlock(ctx, needs_lock); 6829 + io_ring_submit_unlock(ctx, issue_flags); 6830 + return all ? 
nr : ret; 6831 + } 6832 + 6833 + static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6834 + { 6835 + struct io_cancel_data cd = { 6836 + .ctx = req->ctx, 6837 + .data = req->cancel.addr, 6838 + .flags = req->cancel.flags, 6839 + .seq = atomic_inc_return(&req->ctx->cancel_seq), 6840 + }; 6841 + int ret; 6842 + 6843 + if (cd.flags & IORING_ASYNC_CANCEL_FD) { 6844 + if (req->flags & REQ_F_FIXED_FILE) 6845 + req->file = io_file_get_fixed(req, req->cancel.fd, 6846 + issue_flags); 6847 + else 6848 + req->file = io_file_get_normal(req, req->cancel.fd); 6849 + if (!req->file) { 6850 + ret = -EBADF; 6851 + goto done; 6852 + } 6853 + cd.file = req->file; 6854 + } 6855 + 6856 + ret = __io_async_cancel(&cd, req, issue_flags); 6983 6857 done: 6984 6858 if (ret < 0) 6985 6859 req_set_fail(req); ··· 7034 6824 { 7035 6825 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 7036 6826 return -EINVAL; 7037 - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 6827 + if (sqe->rw_flags || sqe->splice_fd_in) 7038 6828 return -EINVAL; 7039 6829 7040 6830 req->rsrc_update.offset = READ_ONCE(sqe->off); ··· 7048 6838 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 7049 6839 { 7050 6840 struct io_ring_ctx *ctx = req->ctx; 7051 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 7052 6841 struct io_uring_rsrc_update2 up; 7053 6842 int ret; 7054 6843 ··· 7058 6849 up.resv = 0; 7059 6850 up.resv2 = 0; 7060 6851 7061 - io_ring_submit_lock(ctx, needs_lock); 6852 + io_ring_submit_lock(ctx, issue_flags); 7062 6853 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 7063 6854 &up, req->rsrc_update.nr_args); 7064 - io_ring_submit_unlock(ctx, needs_lock); 6855 + io_ring_submit_unlock(ctx, issue_flags); 7065 6856 7066 6857 if (ret < 0) 7067 6858 req_set_fail(req); ··· 7160 6951 7161 6952 /* assign early for deferred execution for non-fixed file */ 7162 6953 if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) 7163 - 
req->file = io_file_get_normal(req, req->fd); 6954 + req->file = io_file_get_normal(req, req->cqe.fd); 7164 6955 if (!def->needs_async_setup) 7165 6956 return 0; 7166 6957 if (WARN_ON_ONCE(req_has_async_data(req))) ··· 7188 6979 static u32 io_get_sequence(struct io_kiocb *req) 7189 6980 { 7190 6981 u32 seq = req->ctx->cached_sq_head; 6982 + struct io_kiocb *cur; 7191 6983 7192 6984 /* need original cached_sq_head, but it was increased for each req */ 7193 - io_for_each_link(req, req) 6985 + io_for_each_link(cur, req) 7194 6986 seq--; 7195 6987 return seq; 7196 6988 } ··· 7234 7024 goto queue; 7235 7025 } 7236 7026 7237 - trace_io_uring_defer(ctx, req, req->user_data, req->opcode); 7027 + trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); 7238 7028 de->req = req; 7239 7029 de->seq = seq; 7240 7030 list_add_tail(&de->list, &ctx->defer_list); ··· 7318 7108 return true; 7319 7109 7320 7110 if (req->flags & REQ_F_FIXED_FILE) 7321 - req->file = io_file_get_fixed(req, req->fd, issue_flags); 7111 + req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); 7322 7112 else 7323 - req->file = io_file_get_normal(req, req->fd); 7324 - if (req->file) 7325 - return true; 7113 + req->file = io_file_get_normal(req, req->cqe.fd); 7326 7114 7327 - req_set_fail(req); 7328 - req->result = -EBADF; 7329 - return false; 7115 + return !!req->file; 7330 7116 } 7331 7117 7332 7118 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ··· 7485 7279 const struct io_op_def *def = &io_op_defs[req->opcode]; 7486 7280 unsigned int issue_flags = IO_URING_F_UNLOCKED; 7487 7281 bool needs_poll = false; 7488 - struct io_kiocb *timeout; 7489 7282 int ret = 0, err = -ECANCELED; 7490 7283 7491 7284 /* one will be dropped by ->io_free_work() after returning to io-wq */ ··· 7493 7288 else 7494 7289 req_ref_get(req); 7495 7290 7496 - timeout = io_prep_linked_timeout(req); 7497 - if (timeout) 7498 - io_queue_linked_timeout(timeout); 7499 - 7291 + io_arm_ltimeout(req); 7500 
7292 7501 7293 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 7502 7294 if (work->flags & IO_WQ_WORK_CANCEL) { ··· 7571 7369 struct file *file = NULL; 7572 7370 unsigned long file_ptr; 7573 7371 7574 - if (issue_flags & IO_URING_F_UNLOCKED) 7575 - mutex_lock(&ctx->uring_lock); 7372 + io_ring_submit_lock(ctx, issue_flags); 7576 7373 7577 7374 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 7578 7375 goto out; ··· 7583 7382 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); 7584 7383 io_req_set_rsrc_node(req, ctx, 0); 7585 7384 out: 7586 - if (issue_flags & IO_URING_F_UNLOCKED) 7587 - mutex_unlock(&ctx->uring_lock); 7385 + io_ring_submit_unlock(ctx, issue_flags); 7588 7386 return file; 7589 7387 } 7590 7388 ··· 7604 7404 { 7605 7405 struct file *file = fget(fd); 7606 7406 7607 - trace_io_uring_file_get(req->ctx, req, req->user_data, fd); 7407 + trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); 7608 7408 7609 7409 /* we don't allow fixed io_uring files */ 7610 7410 if (file && file->f_op == &io_uring_fops) ··· 7618 7418 int ret = -ENOENT; 7619 7419 7620 7420 if (prev) { 7621 - if (!(req->task->flags & PF_EXITING)) 7622 - ret = io_try_cancel_userdata(req, prev->user_data); 7421 + if (!(req->task->flags & PF_EXITING)) { 7422 + struct io_cancel_data cd = { 7423 + .ctx = req->ctx, 7424 + .data = prev->cqe.user_data, 7425 + }; 7426 + 7427 + ret = io_try_cancel(req, &cd); 7428 + } 7623 7429 io_req_complete_post(req, ret ?: -ETIME, 0); 7624 7430 io_put_req(prev); 7625 7431 } else { ··· 7685 7479 io_put_req(req); 7686 7480 } 7687 7481 7688 - static void io_queue_sqe_arm_apoll(struct io_kiocb *req) 7482 + static void io_queue_async(struct io_kiocb *req, int ret) 7689 7483 __must_hold(&req->ctx->uring_lock) 7690 7484 { 7691 - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); 7485 + struct io_kiocb *linked_timeout; 7486 + 7487 + if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { 7488 + io_req_complete_failed(req, ret); 
7489 + return; 7490 + } 7491 + 7492 + linked_timeout = io_prep_linked_timeout(req); 7692 7493 7693 7494 switch (io_arm_poll_handler(req, 0)) { 7694 7495 case IO_APOLL_READY: ··· 7706 7493 * Queued up for async execution, worker will release 7707 7494 * submit reference when the iocb is actually submitted. 7708 7495 */ 7709 - io_queue_async_work(req, NULL); 7496 + io_queue_iowq(req, NULL); 7710 7497 break; 7711 7498 case IO_APOLL_OK: 7712 7499 break; ··· 7716 7503 io_queue_linked_timeout(linked_timeout); 7717 7504 } 7718 7505 7719 - static inline void __io_queue_sqe(struct io_kiocb *req) 7506 + static inline void io_queue_sqe(struct io_kiocb *req) 7720 7507 __must_hold(&req->ctx->uring_lock) 7721 7508 { 7722 - struct io_kiocb *linked_timeout; 7723 7509 int ret; 7724 7510 7725 7511 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); ··· 7731 7519 * We async punt it if the file wasn't marked NOWAIT, or if the file 7732 7520 * doesn't support non-blocking read/write attempts 7733 7521 */ 7734 - if (likely(!ret)) { 7735 - linked_timeout = io_prep_linked_timeout(req); 7736 - if (linked_timeout) 7737 - io_queue_linked_timeout(linked_timeout); 7738 - } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 7739 - io_queue_sqe_arm_apoll(req); 7740 - } else { 7741 - io_req_complete_failed(req, ret); 7742 - } 7522 + if (likely(!ret)) 7523 + io_arm_ltimeout(req); 7524 + else 7525 + io_queue_async(req, ret); 7743 7526 } 7744 7527 7745 7528 static void io_queue_sqe_fallback(struct io_kiocb *req) 7746 7529 __must_hold(&req->ctx->uring_lock) 7747 7530 { 7748 - if (req->flags & REQ_F_FAIL) { 7749 - io_req_complete_fail_submit(req); 7531 + if (unlikely(req->flags & REQ_F_FAIL)) { 7532 + /* 7533 + * We don't submit, fail them all, for that replace hardlinks 7534 + * with normal links. Extra REQ_F_LINK is tolerated. 
7535 + */ 7536 + req->flags &= ~REQ_F_HARDLINK; 7537 + req->flags |= REQ_F_LINK; 7538 + io_req_complete_failed(req, req->cqe.res); 7750 7539 } else if (unlikely(req->ctx->drain_active)) { 7751 7540 io_drain_req(req); 7752 7541 } else { ··· 7756 7543 if (unlikely(ret)) 7757 7544 io_req_complete_failed(req, ret); 7758 7545 else 7759 - io_queue_async_work(req, NULL); 7546 + io_queue_iowq(req, NULL); 7760 7547 } 7761 - } 7762 - 7763 - static inline void io_queue_sqe(struct io_kiocb *req) 7764 - __must_hold(&req->ctx->uring_lock) 7765 - { 7766 - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) 7767 - __io_queue_sqe(req); 7768 - else 7769 - io_queue_sqe_fallback(req); 7770 7548 } 7771 7549 7772 7550 /* ··· 7814 7610 req->opcode = opcode = READ_ONCE(sqe->opcode); 7815 7611 /* same numerical values with corresponding REQ_F_*, safe to copy */ 7816 7612 req->flags = sqe_flags = READ_ONCE(sqe->flags); 7817 - req->user_data = READ_ONCE(sqe->user_data); 7613 + req->cqe.user_data = READ_ONCE(sqe->user_data); 7818 7614 req->file = NULL; 7819 - req->fixed_rsrc_refs = NULL; 7615 + req->rsrc_node = NULL; 7820 7616 req->task = current; 7821 7617 7822 7618 if (unlikely(opcode >= IORING_OP_LAST)) { ··· 7827 7623 /* enforce forwards compatibility on users */ 7828 7624 if (sqe_flags & ~SQE_VALID_FLAGS) 7829 7625 return -EINVAL; 7830 - if ((sqe_flags & IOSQE_BUFFER_SELECT) && 7831 - !io_op_defs[opcode].buffer_select) 7832 - return -EOPNOTSUPP; 7626 + if (sqe_flags & IOSQE_BUFFER_SELECT) { 7627 + if (!io_op_defs[opcode].buffer_select) 7628 + return -EOPNOTSUPP; 7629 + req->buf_index = READ_ONCE(sqe->buf_group); 7630 + } 7833 7631 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) 7834 7632 ctx->drain_disabled = true; 7835 7633 if (sqe_flags & IOSQE_IO_DRAIN) { ··· 7854 7648 } 7855 7649 } 7856 7650 7651 + if (!io_op_defs[opcode].ioprio && sqe->ioprio) 7652 + return -EINVAL; 7653 + if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) 7654 + return -EINVAL; 7655 + 7857 7656 
if (io_op_defs[opcode].needs_file) { 7858 7657 struct io_submit_state *state = &ctx->submit_state; 7859 7658 7860 - req->fd = READ_ONCE(sqe->fd); 7659 + req->cqe.fd = READ_ONCE(sqe->fd); 7861 7660 7862 7661 /* 7863 7662 * Plug now if we have more than 2 IO left after this, and the ··· 7894 7683 return io_req_prep(req, sqe); 7895 7684 } 7896 7685 7897 - static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7686 + static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, 7687 + struct io_kiocb *req, int ret) 7688 + { 7689 + struct io_ring_ctx *ctx = req->ctx; 7690 + struct io_submit_link *link = &ctx->submit_state.link; 7691 + struct io_kiocb *head = link->head; 7692 + 7693 + trace_io_uring_req_failed(sqe, ctx, req, ret); 7694 + 7695 + /* 7696 + * Avoid breaking links in the middle as it renders links with SQPOLL 7697 + * unusable. Instead of failing eagerly, continue assembling the link if 7698 + * applicable and mark the head with REQ_F_FAIL. The link flushing code 7699 + * should find the flag and handle the rest. 
7700 + */ 7701 + req_fail_link_node(req, ret); 7702 + if (head && !(head->flags & REQ_F_FAIL)) 7703 + req_fail_link_node(head, -ECANCELED); 7704 + 7705 + if (!(req->flags & IO_REQ_LINK_FLAGS)) { 7706 + if (head) { 7707 + link->last->link = req; 7708 + link->head = NULL; 7709 + req = head; 7710 + } 7711 + io_queue_sqe_fallback(req); 7712 + return ret; 7713 + } 7714 + 7715 + if (head) 7716 + link->last->link = req; 7717 + else 7718 + link->head = req; 7719 + link->last = req; 7720 + return 0; 7721 + } 7722 + 7723 + static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7898 7724 const struct io_uring_sqe *sqe) 7899 7725 __must_hold(&ctx->uring_lock) 7900 7726 { ··· 7939 7691 int ret; 7940 7692 7941 7693 ret = io_init_req(ctx, req, sqe); 7942 - if (unlikely(ret)) { 7943 - trace_io_uring_req_failed(sqe, ctx, req, ret); 7944 - 7945 - /* fail even hard links since we don't submit */ 7946 - if (link->head) { 7947 - /* 7948 - * we can judge a link req is failed or cancelled by if 7949 - * REQ_F_FAIL is set, but the head is an exception since 7950 - * it may be set REQ_F_FAIL because of other req's failure 7951 - * so let's leverage req->result to distinguish if a head 7952 - * is set REQ_F_FAIL because of its failure or other req's 7953 - * failure so that we can set the correct ret code for it. 7954 - * init result here to avoid affecting the normal path. 7955 - */ 7956 - if (!(link->head->flags & REQ_F_FAIL)) 7957 - req_fail_link_node(link->head, -ECANCELED); 7958 - } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7959 - /* 7960 - * the current req is a normal req, we should return 7961 - * error and thus break the submittion loop. 
7962 - */ 7963 - io_req_complete_failed(req, ret); 7964 - return ret; 7965 - } 7966 - req_fail_link_node(req, ret); 7967 - } 7694 + if (unlikely(ret)) 7695 + return io_submit_fail_init(sqe, req, ret); 7968 7696 7969 7697 /* don't need @sqe from now on */ 7970 - trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, 7698 + trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, 7971 7699 req->flags, true, 7972 7700 ctx->flags & IORING_SETUP_SQPOLL); 7973 7701 ··· 7954 7730 * submitted sync once the chain is complete. If none of those 7955 7731 * conditions are true (normal request), then just queue it. 7956 7732 */ 7957 - if (link->head) { 7958 - struct io_kiocb *head = link->head; 7733 + if (unlikely(link->head)) { 7734 + ret = io_req_prep_async(req); 7735 + if (unlikely(ret)) 7736 + return io_submit_fail_init(sqe, req, ret); 7959 7737 7960 - if (!(req->flags & REQ_F_FAIL)) { 7961 - ret = io_req_prep_async(req); 7962 - if (unlikely(ret)) { 7963 - req_fail_link_node(req, ret); 7964 - if (!(head->flags & REQ_F_FAIL)) 7965 - req_fail_link_node(head, -ECANCELED); 7966 - } 7967 - } 7968 - trace_io_uring_link(ctx, req, head); 7738 + trace_io_uring_link(ctx, req, link->head); 7969 7739 link->last->link = req; 7970 7740 link->last = req; 7971 7741 7972 - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 7742 + if (req->flags & IO_REQ_LINK_FLAGS) 7973 7743 return 0; 7974 - /* last request of a link, enqueue the link */ 7744 + /* last request of the link, flush it */ 7745 + req = link->head; 7975 7746 link->head = NULL; 7976 - req = head; 7977 - } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 7978 - link->head = req; 7979 - link->last = req; 7747 + if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) 7748 + goto fallback; 7749 + 7750 + } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | 7751 + REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { 7752 + if (req->flags & IO_REQ_LINK_FLAGS) { 7753 + link->head = req; 7754 + link->last = req; 7755 + } else { 7756 + 
fallback: 7757 + io_queue_sqe_fallback(req); 7758 + } 7980 7759 return 0; 7981 7760 } 7982 7761 ··· 7994 7767 { 7995 7768 struct io_submit_state *state = &ctx->submit_state; 7996 7769 7997 - if (state->link.head) 7998 - io_queue_sqe(state->link.head); 7770 + if (unlikely(state->link.head)) 7771 + io_queue_sqe_fallback(state->link.head); 7999 7772 /* flush only after queuing links as they can generate completions */ 8000 7773 io_submit_flush_completions(ctx); 8001 7774 if (state->plug_started) ··· 8063 7836 __must_hold(&ctx->uring_lock) 8064 7837 { 8065 7838 unsigned int entries = io_sqring_entries(ctx); 8066 - int submitted = 0; 7839 + unsigned int left; 7840 + int ret; 8067 7841 8068 7842 if (unlikely(!entries)) 8069 7843 return 0; 8070 7844 /* make sure SQ entry isn't read before tail */ 8071 - nr = min3(nr, ctx->sq_entries, entries); 8072 - io_get_task_refs(nr); 7845 + ret = left = min3(nr, ctx->sq_entries, entries); 7846 + io_get_task_refs(left); 7847 + io_submit_state_start(&ctx->submit_state, left); 8073 7848 8074 - io_submit_state_start(&ctx->submit_state, nr); 8075 7849 do { 8076 7850 const struct io_uring_sqe *sqe; 8077 7851 struct io_kiocb *req; 8078 7852 8079 - if (unlikely(!io_alloc_req_refill(ctx))) { 8080 - if (!submitted) 8081 - submitted = -EAGAIN; 7853 + if (unlikely(!io_alloc_req_refill(ctx))) 8082 7854 break; 8083 - } 8084 7855 req = io_alloc_req(ctx); 8085 7856 sqe = io_get_sqe(ctx); 8086 7857 if (unlikely(!sqe)) { 8087 - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 7858 + io_req_add_to_cache(req, ctx); 8088 7859 break; 8089 7860 } 8090 - /* will complete beyond this point, count as submitted */ 8091 - submitted++; 8092 - if (io_submit_sqe(ctx, req, sqe)) { 8093 - /* 8094 - * Continue submitting even for sqe failure if the 8095 - * ring was setup with IORING_SETUP_SUBMIT_ALL 8096 - */ 8097 - if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) 8098 - break; 7861 + 7862 + /* 7863 + * Continue submitting even for sqe failure if the 
7864 + * ring was setup with IORING_SETUP_SUBMIT_ALL 7865 + */ 7866 + if (unlikely(io_submit_sqe(ctx, req, sqe)) && 7867 + !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { 7868 + left--; 7869 + break; 8099 7870 } 8100 - } while (submitted < nr); 7871 + } while (--left); 8101 7872 8102 - if (unlikely(submitted != nr)) { 8103 - int ref_used = (submitted == -EAGAIN) ? 0 : submitted; 8104 - int unused = nr - ref_used; 8105 - 8106 - current->io_uring->cached_refs += unused; 7873 + if (unlikely(left)) { 7874 + ret -= left; 7875 + /* try again if it submitted nothing and can't allocate a req */ 7876 + if (!ret && io_req_cache_empty(ctx)) 7877 + ret = -EAGAIN; 7878 + current->io_uring->cached_refs += left; 8107 7879 } 8108 7880 8109 7881 io_submit_state_end(ctx); 8110 7882 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 8111 7883 io_commit_sqring(ctx); 8112 - 8113 - return submitted; 7884 + return ret; 8114 7885 } 8115 7886 8116 7887 static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 8117 7888 { 8118 7889 return READ_ONCE(sqd->state); 8119 - } 8120 - 8121 - static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 8122 - { 8123 - /* Tell userspace we may need a wakeup call */ 8124 - spin_lock(&ctx->completion_lock); 8125 - WRITE_ONCE(ctx->rings->sq_flags, 8126 - ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); 8127 - spin_unlock(&ctx->completion_lock); 8128 - } 8129 - 8130 - static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 8131 - { 8132 - spin_lock(&ctx->completion_lock); 8133 - WRITE_ONCE(ctx->rings->sq_flags, 8134 - ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); 8135 - spin_unlock(&ctx->completion_lock); 8136 7890 } 8137 7891 8138 7892 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) ··· 8231 8023 bool needs_sched = true; 8232 8024 8233 8025 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 8234 - io_ring_set_wakeup_flag(ctx); 8235 - 8026 + atomic_or(IORING_SQ_NEED_WAKEUP, 8027 + 
&ctx->rings->sq_flags); 8236 8028 if ((ctx->flags & IORING_SETUP_IOPOLL) && 8237 8029 !wq_list_empty(&ctx->iopoll_list)) { 8238 8030 needs_sched = false; ··· 8243 8035 * Ensure the store of the wakeup flag is not 8244 8036 * reordered with the load of the SQ tail 8245 8037 */ 8246 - smp_mb(); 8038 + smp_mb__after_atomic(); 8247 8039 8248 8040 if (io_sqring_entries(ctx)) { 8249 8041 needs_sched = false; ··· 8257 8049 mutex_lock(&sqd->lock); 8258 8050 } 8259 8051 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 8260 - io_ring_clear_wakeup_flag(ctx); 8052 + atomic_andnot(IORING_SQ_NEED_WAKEUP, 8053 + &ctx->rings->sq_flags); 8261 8054 } 8262 8055 8263 8056 finish_wait(&sqd->wait, &wait); ··· 8268 8059 io_uring_cancel_generic(true, sqd); 8269 8060 sqd->thread = NULL; 8270 8061 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 8271 - io_ring_set_wakeup_flag(ctx); 8062 + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); 8272 8063 io_run_task_work(); 8273 8064 mutex_unlock(&sqd->lock); 8274 8065 ··· 8308 8099 * Cannot safely flush overflowed CQEs from here, ensure we wake up 8309 8100 * the task, and the next invocation will do it. 
8310 8101 */ 8311 - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 8102 + if (io_should_wake(iowq) || 8103 + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) 8312 8104 return autoremove_wake_function(curr, mode, wake_flags, key); 8313 8105 return -1; 8314 8106 } ··· 8331 8121 ktime_t timeout) 8332 8122 { 8333 8123 int ret; 8124 + unsigned long check_cq; 8334 8125 8335 8126 /* make sure we run task_work before checking for signals */ 8336 8127 ret = io_run_task_work_sig(); 8337 8128 if (ret || io_should_wake(iowq)) 8338 8129 return ret; 8130 + check_cq = READ_ONCE(ctx->check_cq); 8339 8131 /* let the caller flush overflows, retry */ 8340 - if (test_bit(0, &ctx->check_cq_overflow)) 8132 + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 8341 8133 return 1; 8342 - 8134 + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) 8135 + return -EBADR; 8343 8136 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) 8344 8137 return -ETIME; 8345 8138 return 1; ··· 8407 8194 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 8408 8195 TASK_INTERRUPTIBLE); 8409 8196 ret = io_cqring_wait_schedule(ctx, &iowq, timeout); 8410 - finish_wait(&ctx->cq_wait, &iowq.wq); 8411 8197 cond_resched(); 8412 8198 } while (ret > 0); 8413 8199 8200 + finish_wait(&ctx->cq_wait, &iowq.wq); 8414 8201 restore_saved_sigmask_unless(ret == -EINTR); 8415 8202 8416 8203 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; ··· 8659 8446 8660 8447 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 8661 8448 { 8449 + #if !defined(IO_URING_SCM_ALL) 8450 + int i; 8451 + 8452 + for (i = 0; i < ctx->nr_user_files; i++) { 8453 + struct file *file = io_file_from_index(ctx, i); 8454 + 8455 + if (!file) 8456 + continue; 8457 + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) 8458 + continue; 8459 + fput(file); 8460 + } 8461 + #endif 8462 + 8662 8463 #if defined(CONFIG_UNIX) 8663 8464 if (ctx->ring_sock) { 8664 8465 struct sock *sock = ctx->ring_sock->sk; ··· 8680 8453 8681 8454 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 8682 8455 kfree_skb(skb); 8683 - } 8684 - #else 8685 - int i; 8686 - 8687 - for (i = 0; i < ctx->nr_user_files; i++) { 8688 - struct file *file; 8689 - 8690 - file = io_file_from_index(ctx, i); 8691 - if (file) 8692 - fput(file); 8693 8456 } 8694 8457 #endif 8695 8458 io_free_file_tables(&ctx->file_table); ··· 8825 8608 return sqd; 8826 8609 } 8827 8610 8828 - #if defined(CONFIG_UNIX) 8829 8611 /* 8830 8612 * Ensure the UNIX gc is aware of our file set, so we are certain that 8831 8613 * the io_uring can be safely unregistered on process exit, even if we have 8832 - * loops in the file referencing. 8614 + * loops in the file referencing. We account only files that can hold other 8615 + * files because otherwise they can't form a loop and so are not interesting 8616 + * for GC. 
8833 8617 */ 8834 - static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 8618 + static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) 8835 8619 { 8620 + #if defined(CONFIG_UNIX) 8836 8621 struct sock *sk = ctx->ring_sock->sk; 8622 + struct sk_buff_head *head = &sk->sk_receive_queue; 8837 8623 struct scm_fp_list *fpl; 8838 8624 struct sk_buff *skb; 8839 - int i, nr_files; 8840 8625 8841 - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8842 - if (!fpl) 8843 - return -ENOMEM; 8844 - 8845 - skb = alloc_skb(0, GFP_KERNEL); 8846 - if (!skb) { 8847 - kfree(fpl); 8848 - return -ENOMEM; 8849 - } 8850 - 8851 - skb->sk = sk; 8852 - 8853 - nr_files = 0; 8854 - fpl->user = get_uid(current_user()); 8855 - for (i = 0; i < nr; i++) { 8856 - struct file *file = io_file_from_index(ctx, i + offset); 8857 - 8858 - if (!file) 8859 - continue; 8860 - fpl->fp[nr_files] = get_file(file); 8861 - unix_inflight(fpl->user, fpl->fp[nr_files]); 8862 - nr_files++; 8863 - } 8864 - 8865 - if (nr_files) { 8866 - fpl->max = SCM_MAX_FD; 8867 - fpl->count = nr_files; 8868 - UNIXCB(skb).fp = fpl; 8869 - skb->destructor = unix_destruct_scm; 8870 - refcount_add(skb->truesize, &sk->sk_wmem_alloc); 8871 - skb_queue_head(&sk->sk_receive_queue, skb); 8872 - 8873 - for (i = 0; i < nr; i++) { 8874 - struct file *file = io_file_from_index(ctx, i + offset); 8875 - 8876 - if (file) 8877 - fput(file); 8878 - } 8879 - } else { 8880 - kfree_skb(skb); 8881 - free_uid(fpl->user); 8882 - kfree(fpl); 8883 - } 8884 - 8885 - return 0; 8886 - } 8887 - 8888 - /* 8889 - * If UNIX sockets are enabled, fd passing can cause a reference cycle which 8890 - * causes regular reference counting to break down. We rely on the UNIX 8891 - * garbage collection to take care of this problem for us. 
8892 - */ 8893 - static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8894 - { 8895 - unsigned left, total; 8896 - int ret = 0; 8897 - 8898 - total = 0; 8899 - left = ctx->nr_user_files; 8900 - while (left) { 8901 - unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 8902 - 8903 - ret = __io_sqe_files_scm(ctx, this_files, total); 8904 - if (ret) 8905 - break; 8906 - left -= this_files; 8907 - total += this_files; 8908 - } 8909 - 8910 - if (!ret) 8626 + if (likely(!io_file_need_scm(file))) 8911 8627 return 0; 8912 8628 8913 - while (total < ctx->nr_user_files) { 8914 - struct file *file = io_file_from_index(ctx, total); 8629 + /* 8630 + * See if we can merge this file into an existing skb SCM_RIGHTS 8631 + * file set. If there's no room, fall back to allocating a new skb 8632 + * and filling it in. 8633 + */ 8634 + spin_lock_irq(&head->lock); 8635 + skb = skb_peek(head); 8636 + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) 8637 + __skb_unlink(skb, head); 8638 + else 8639 + skb = NULL; 8640 + spin_unlock_irq(&head->lock); 8915 8641 8916 - if (file) 8917 - fput(file); 8918 - total++; 8642 + if (!skb) { 8643 + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8644 + if (!fpl) 8645 + return -ENOMEM; 8646 + 8647 + skb = alloc_skb(0, GFP_KERNEL); 8648 + if (!skb) { 8649 + kfree(fpl); 8650 + return -ENOMEM; 8651 + } 8652 + 8653 + fpl->user = get_uid(current_user()); 8654 + fpl->max = SCM_MAX_FD; 8655 + fpl->count = 0; 8656 + 8657 + UNIXCB(skb).fp = fpl; 8658 + skb->sk = sk; 8659 + skb->destructor = unix_destruct_scm; 8660 + refcount_add(skb->truesize, &sk->sk_wmem_alloc); 8919 8661 } 8920 8662 8921 - return ret; 8922 - } 8923 - #else 8924 - static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8925 - { 8663 + fpl = UNIXCB(skb).fp; 8664 + fpl->fp[fpl->count++] = get_file(file); 8665 + unix_inflight(fpl->user, file); 8666 + skb_queue_head(head, skb); 8667 + fput(file); 8668 + #endif 8926 8669 return 0; 8927 8670 } 8928 - #endif 8929 8671 8930 8672 static void 
io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8931 8673 { ··· 8894 8718 struct sk_buff_head list, *head = &sock->sk_receive_queue; 8895 8719 struct sk_buff *skb; 8896 8720 int i; 8721 + 8722 + if (!io_file_need_scm(file)) { 8723 + fput(file); 8724 + return; 8725 + } 8897 8726 8898 8727 __skb_queue_head_init(&list); 8899 8728 ··· 8964 8783 list_del(&prsrc->list); 8965 8784 8966 8785 if (prsrc->tag) { 8967 - bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8786 + if (ctx->flags & IORING_SETUP_IOPOLL) 8787 + mutex_lock(&ctx->uring_lock); 8968 8788 8969 - io_ring_submit_lock(ctx, lock_ring); 8970 8789 spin_lock(&ctx->completion_lock); 8971 8790 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); 8972 8791 io_commit_cqring(ctx); 8973 8792 spin_unlock(&ctx->completion_lock); 8974 8793 io_cqring_ev_posted(ctx); 8975 - io_ring_submit_unlock(ctx, lock_ring); 8794 + 8795 + if (ctx->flags & IORING_SETUP_IOPOLL) 8796 + mutex_unlock(&ctx->uring_lock); 8976 8797 } 8977 8798 8978 8799 rsrc_data->do_put(ctx, prsrc); ··· 9028 8845 if (ret) 9029 8846 return ret; 9030 8847 9031 - ret = -ENOMEM; 9032 - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 9033 - goto out_free; 8848 + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { 8849 + io_rsrc_data_free(ctx->file_data); 8850 + ctx->file_data = NULL; 8851 + return -ENOMEM; 8852 + } 9034 8853 9035 8854 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8855 + struct io_fixed_file *file_slot; 8856 + 9036 8857 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 9037 8858 ret = -EFAULT; 9038 - goto out_fput; 8859 + goto fail; 9039 8860 } 9040 8861 /* allow sparse sets */ 9041 8862 if (fd == -1) { 9042 8863 ret = -EINVAL; 9043 8864 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 9044 - goto out_fput; 8865 + goto fail; 9045 8866 continue; 9046 8867 } 9047 8868 9048 8869 file = fget(fd); 9049 8870 ret = -EBADF; 9050 8871 if (unlikely(!file)) 9051 - goto out_fput; 8872 + goto fail; 9052 8873 9053 8874 /* 9054 8875 * 
Don't allow io_uring instances to be registered. If UNIX ··· 9063 8876 */ 9064 8877 if (file->f_op == &io_uring_fops) { 9065 8878 fput(file); 9066 - goto out_fput; 8879 + goto fail; 9067 8880 } 9068 - io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 9069 - } 9070 - 9071 - ret = io_sqe_files_scm(ctx); 9072 - if (ret) { 9073 - __io_sqe_files_unregister(ctx); 9074 - return ret; 8881 + ret = io_scm_file_account(ctx, file); 8882 + if (ret) { 8883 + fput(file); 8884 + goto fail; 8885 + } 8886 + file_slot = io_fixed_file_slot(&ctx->file_table, i); 8887 + io_fixed_file_set(file_slot, file); 9075 8888 } 9076 8889 9077 8890 io_rsrc_node_switch(ctx, NULL); 9078 - return ret; 9079 - out_fput: 9080 - for (i = 0; i < ctx->nr_user_files; i++) { 9081 - file = io_file_from_index(ctx, i); 9082 - if (file) 9083 - fput(file); 9084 - } 9085 - io_free_file_tables(&ctx->file_table); 9086 - ctx->nr_user_files = 0; 9087 - out_free: 9088 - io_rsrc_data_free(ctx->file_data); 9089 - ctx->file_data = NULL; 9090 - return ret; 9091 - } 9092 - 9093 - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 9094 - int index) 9095 - { 9096 - #if defined(CONFIG_UNIX) 9097 - struct sock *sock = ctx->ring_sock->sk; 9098 - struct sk_buff_head *head = &sock->sk_receive_queue; 9099 - struct sk_buff *skb; 9100 - 9101 - /* 9102 - * See if we can merge this file into an existing skb SCM_RIGHTS 9103 - * file set. If there's no room, fall back to allocating a new skb 9104 - * and filling it in. 
9105 - */ 9106 - spin_lock_irq(&head->lock); 9107 - skb = skb_peek(head); 9108 - if (skb) { 9109 - struct scm_fp_list *fpl = UNIXCB(skb).fp; 9110 - 9111 - if (fpl->count < SCM_MAX_FD) { 9112 - __skb_unlink(skb, head); 9113 - spin_unlock_irq(&head->lock); 9114 - fpl->fp[fpl->count] = get_file(file); 9115 - unix_inflight(fpl->user, fpl->fp[fpl->count]); 9116 - fpl->count++; 9117 - spin_lock_irq(&head->lock); 9118 - __skb_queue_head(head, skb); 9119 - } else { 9120 - skb = NULL; 9121 - } 9122 - } 9123 - spin_unlock_irq(&head->lock); 9124 - 9125 - if (skb) { 9126 - fput(file); 9127 - return 0; 9128 - } 9129 - 9130 - return __io_sqe_files_scm(ctx, 1, index); 9131 - #else 9132 8891 return 0; 9133 - #endif 8892 + fail: 8893 + __io_sqe_files_unregister(ctx); 8894 + return ret; 9134 8895 } 9135 8896 9136 8897 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, ··· 9102 8967 unsigned int issue_flags, u32 slot_index) 9103 8968 { 9104 8969 struct io_ring_ctx *ctx = req->ctx; 9105 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 9106 8970 bool needs_switch = false; 9107 8971 struct io_fixed_file *file_slot; 9108 8972 int ret = -EBADF; 9109 8973 9110 - io_ring_submit_lock(ctx, needs_lock); 8974 + io_ring_submit_lock(ctx, issue_flags); 9111 8975 if (file->f_op == &io_uring_fops) 9112 8976 goto err; 9113 8977 ret = -ENXIO; ··· 9135 9001 needs_switch = true; 9136 9002 } 9137 9003 9138 - *io_get_tag_slot(ctx->file_data, slot_index) = 0; 9139 - io_fixed_file_set(file_slot, file); 9140 - ret = io_sqe_file_register(ctx, file, slot_index); 9141 - if (ret) { 9142 - file_slot->file_ptr = 0; 9143 - goto err; 9004 + ret = io_scm_file_account(ctx, file); 9005 + if (!ret) { 9006 + *io_get_tag_slot(ctx->file_data, slot_index) = 0; 9007 + io_fixed_file_set(file_slot, file); 9144 9008 } 9145 - 9146 - ret = 0; 9147 9009 err: 9148 9010 if (needs_switch) 9149 9011 io_rsrc_node_switch(ctx, ctx->file_data); 9150 - io_ring_submit_unlock(ctx, needs_lock); 9012 + 
io_ring_submit_unlock(ctx, issue_flags); 9151 9013 if (ret) 9152 9014 fput(file); 9153 9015 return ret; ··· 9153 9023 { 9154 9024 unsigned int offset = req->close.file_slot - 1; 9155 9025 struct io_ring_ctx *ctx = req->ctx; 9156 - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 9157 9026 struct io_fixed_file *file_slot; 9158 9027 struct file *file; 9159 9028 int ret; 9160 9029 9161 - io_ring_submit_lock(ctx, needs_lock); 9030 + io_ring_submit_lock(ctx, issue_flags); 9162 9031 ret = -ENXIO; 9163 9032 if (unlikely(!ctx->file_data)) 9164 9033 goto out; ··· 9183 9054 io_rsrc_node_switch(ctx, ctx->file_data); 9184 9055 ret = 0; 9185 9056 out: 9186 - io_ring_submit_unlock(ctx, needs_lock); 9057 + io_ring_submit_unlock(ctx, issue_flags); 9187 9058 return ret; 9188 9059 } 9189 9060 ··· 9250 9121 err = -EBADF; 9251 9122 break; 9252 9123 } 9253 - *io_get_tag_slot(data, i) = tag; 9254 - io_fixed_file_set(file_slot, file); 9255 - err = io_sqe_file_register(ctx, file, i); 9124 + err = io_scm_file_account(ctx, file); 9256 9125 if (err) { 9257 - file_slot->file_ptr = 0; 9258 9126 fput(file); 9259 9127 break; 9260 9128 } 9129 + *io_get_tag_slot(data, i) = tag; 9130 + io_fixed_file_set(file_slot, file); 9261 9131 } 9262 9132 } 9263 9133 ··· 9983 9855 9984 9856 static void io_destroy_buffers(struct io_ring_ctx *ctx) 9985 9857 { 9858 + struct io_buffer_list *bl; 9859 + unsigned long index; 9986 9860 int i; 9987 9861 9988 - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { 9989 - struct list_head *list = &ctx->io_buffers[i]; 9862 + for (i = 0; i < BGID_ARRAY; i++) { 9863 + if (!ctx->io_bl) 9864 + break; 9865 + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); 9866 + } 9990 9867 9991 - while (!list_empty(list)) { 9992 - struct io_buffer_list *bl; 9993 - 9994 - bl = list_first_entry(list, struct io_buffer_list, list); 9995 - __io_remove_buffers(ctx, bl, -1U); 9996 - list_del(&bl->list); 9997 - kfree(bl); 9998 - } 9868 + xa_for_each(&ctx->io_bl_xa, index, bl) { 9869 + 
xa_erase(&ctx->io_bl_xa, bl->bgid); 9870 + __io_remove_buffers(ctx, bl, -1U); 9999 9871 } 10000 9872 10001 9873 while (!list_empty(&ctx->io_buffers_pages)) { ··· 10015 9887 mutex_lock(&ctx->uring_lock); 10016 9888 io_flush_cached_locked_reqs(ctx, state); 10017 9889 10018 - while (state->free_list.next) { 9890 + while (!io_req_cache_empty(ctx)) { 10019 9891 struct io_wq_work_node *node; 10020 9892 struct io_kiocb *req; 10021 9893 ··· 10104 9976 io_wq_put_hash(ctx->hash_map); 10105 9977 kfree(ctx->cancel_hash); 10106 9978 kfree(ctx->dummy_ubuf); 10107 - kfree(ctx->io_buffers); 9979 + kfree(ctx->io_bl); 9980 + xa_destroy(&ctx->io_bl_xa); 10108 9981 kfree(ctx); 10109 9982 } 10110 9983 ··· 10136 10007 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 10137 10008 * pushs them to do the flush. 10138 10009 */ 10139 - if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 10010 + if (io_cqring_events(ctx) || 10011 + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 10140 10012 mask |= EPOLLIN | EPOLLRDNORM; 10141 10013 10142 10014 return mask; ··· 10269 10139 } 10270 10140 } 10271 10141 spin_unlock_irq(&ctx->timeout_lock); 10272 - if (canceled != 0) 10273 - io_commit_cqring(ctx); 10142 + io_commit_cqring(ctx); 10274 10143 spin_unlock(&ctx->completion_lock); 10275 10144 if (canceled != 0) 10276 10145 io_cqring_ev_posted(ctx); ··· 10289 10160 io_unregister_personality(ctx, index); 10290 10161 mutex_unlock(&ctx->uring_lock); 10291 10162 10292 - io_kill_timeouts(ctx, NULL, true); 10293 - io_poll_remove_all(ctx, NULL, true); 10294 - 10295 - /* if we failed setting up the ctx, we might not have any rings */ 10296 - io_iopoll_try_reap_events(ctx); 10163 + /* failed during ring init, it couldn't have issued any requests */ 10164 + if (ctx->rings) { 10165 + io_kill_timeouts(ctx, NULL, true); 10166 + io_poll_remove_all(ctx, NULL, true); 10167 + /* if we failed setting up the ctx, we might not have any rings */ 10168 + io_iopoll_try_reap_events(ctx); 10169 
+ } 10297 10170 10298 10171 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 10299 10172 /* ··· 10386 10255 { 10387 10256 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 10388 10257 struct io_uring_task *tctx = task ? task->io_uring : NULL; 10258 + 10259 + /* failed during ring init, it couldn't have issued any requests */ 10260 + if (!ctx->rings) 10261 + return; 10389 10262 10390 10263 while (1) { 10391 10264 enum io_wq_cancel cret; ··· 10836 10701 return 0; 10837 10702 } 10838 10703 10704 + static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) 10705 + { 10706 + if (flags & IORING_ENTER_EXT_ARG) { 10707 + struct io_uring_getevents_arg arg; 10708 + 10709 + if (argsz != sizeof(arg)) 10710 + return -EINVAL; 10711 + if (copy_from_user(&arg, argp, sizeof(arg))) 10712 + return -EFAULT; 10713 + } 10714 + return 0; 10715 + } 10716 + 10839 10717 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 10840 10718 struct __kernel_timespec __user **ts, 10841 10719 const sigset_t __user **sig) ··· 10886 10738 size_t, argsz) 10887 10739 { 10888 10740 struct io_ring_ctx *ctx; 10889 - int submitted = 0; 10890 10741 struct fd f; 10891 10742 long ret; 10892 10743 ··· 10948 10801 if (ret) 10949 10802 goto out; 10950 10803 } 10951 - submitted = to_submit; 10804 + ret = to_submit; 10952 10805 } else if (to_submit) { 10953 10806 ret = io_uring_add_tctx_node(ctx); 10954 10807 if (unlikely(ret)) 10955 10808 goto out; 10956 - mutex_lock(&ctx->uring_lock); 10957 - submitted = io_submit_sqes(ctx, to_submit); 10958 - mutex_unlock(&ctx->uring_lock); 10959 10809 10960 - if (submitted != to_submit) 10810 + mutex_lock(&ctx->uring_lock); 10811 + ret = io_submit_sqes(ctx, to_submit); 10812 + if (ret != to_submit) { 10813 + mutex_unlock(&ctx->uring_lock); 10961 10814 goto out; 10815 + } 10816 + if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) 10817 + goto iopoll_locked; 10818 + mutex_unlock(&ctx->uring_lock); 
10962 10819 } 10963 10820 if (flags & IORING_ENTER_GETEVENTS) { 10964 - const sigset_t __user *sig; 10965 - struct __kernel_timespec __user *ts; 10966 - 10967 - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 10968 - if (unlikely(ret)) 10969 - goto out; 10970 - 10971 - min_complete = min(min_complete, ctx->cq_entries); 10972 - 10973 - /* 10974 - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 10975 - * space applications don't need to do io completion events 10976 - * polling again, they can rely on io_sq_thread to do polling 10977 - * work, which can reduce cpu usage and uring_lock contention. 10978 - */ 10979 - if (ctx->flags & IORING_SETUP_IOPOLL && 10980 - !(ctx->flags & IORING_SETUP_SQPOLL)) { 10981 - ret = io_iopoll_check(ctx, min_complete); 10821 + int ret2; 10822 + if (ctx->syscall_iopoll) { 10823 + /* 10824 + * We disallow the app entering submit/complete with 10825 + * polling, but we still need to lock the ring to 10826 + * prevent racing with polled issue that got punted to 10827 + * a workqueue. 10828 + */ 10829 + mutex_lock(&ctx->uring_lock); 10830 + iopoll_locked: 10831 + ret2 = io_validate_ext_arg(flags, argp, argsz); 10832 + if (likely(!ret2)) { 10833 + min_complete = min(min_complete, 10834 + ctx->cq_entries); 10835 + ret2 = io_iopoll_check(ctx, min_complete); 10836 + } 10837 + mutex_unlock(&ctx->uring_lock); 10982 10838 } else { 10983 - ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 10839 + const sigset_t __user *sig; 10840 + struct __kernel_timespec __user *ts; 10841 + 10842 + ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 10843 + if (likely(!ret2)) { 10844 + min_complete = min(min_complete, 10845 + ctx->cq_entries); 10846 + ret2 = io_cqring_wait(ctx, min_complete, sig, 10847 + argsz, ts); 10848 + } 10849 + } 10850 + 10851 + if (!ret) { 10852 + ret = ret2; 10853 + 10854 + /* 10855 + * EBADR indicates that one or more CQE were dropped. 
10856 + * Once the user has been informed we can clear the bit 10857 + * as they are obviously ok with those drops. 10858 + */ 10859 + if (unlikely(ret2 == -EBADR)) 10860 + clear_bit(IO_CHECK_CQ_DROPPED_BIT, 10861 + &ctx->check_cq); 10984 10862 } 10985 10863 } 10986 10864 ··· 11014 10842 out_fput: 11015 10843 if (!(flags & IORING_ENTER_REGISTERED_RING)) 11016 10844 fdput(f); 11017 - return submitted ? submitted : ret; 10845 + return ret; 11018 10846 } 11019 10847 11020 10848 #ifdef CONFIG_PROC_FS ··· 11331 11159 ctx = io_ring_ctx_alloc(p); 11332 11160 if (!ctx) 11333 11161 return -ENOMEM; 11162 + 11163 + /* 11164 + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 11165 + * space applications don't need to do io completion events 11166 + * polling again, they can rely on io_sq_thread to do polling 11167 + * work, which can reduce cpu usage and uring_lock contention. 11168 + */ 11169 + if (ctx->flags & IORING_SETUP_IOPOLL && 11170 + !(ctx->flags & IORING_SETUP_SQPOLL)) 11171 + ctx->syscall_iopoll = 1; 11172 + 11334 11173 ctx->compat = in_compat_syscall(); 11335 11174 if (!capable(CAP_IPC_LOCK)) 11336 11175 ctx->user = get_uid(current_user()); 11176 + 11177 + /* 11178 + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if 11179 + * COOP_TASKRUN is set, then IPIs are never needed by the app. 11180 + */ 11181 + ret = -EINVAL; 11182 + if (ctx->flags & IORING_SETUP_SQPOLL) { 11183 + /* IPI related flags don't make sense with SQPOLL */ 11184 + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | 11185 + IORING_SETUP_TASKRUN_FLAG)) 11186 + goto err; 11187 + ctx->notify_method = TWA_SIGNAL_NO_IPI; 11188 + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { 11189 + ctx->notify_method = TWA_SIGNAL_NO_IPI; 11190 + } else { 11191 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 11192 + goto err; 11193 + ctx->notify_method = TWA_SIGNAL; 11194 + } 11337 11195 11338 11196 /* 11339 11197 * This is just grabbed for accounting purposes. 
When a process exits, ··· 11462 11260 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 11463 11261 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 11464 11262 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 11465 - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) 11263 + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | 11264 + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG)) 11466 11265 return -EINVAL; 11467 11266 11468 - return io_uring_create(entries, &p, params); 11267 + return io_uring_create(entries, &p, params); 11469 11268 } 11470 11269 11471 11270 SYSCALL_DEFINE2(io_uring_setup, u32, entries, ··· 12033 11830 12034 11831 /* ->buf_index is u16 */ 12035 11832 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 11833 + BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); 12036 11834 12037 11835 /* should fit into one byte */ 12038 11836 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); ··· 12042 11838 12043 11839 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); 12044 11840 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); 11841 + 11842 + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); 12045 11843 12046 11844 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 12047 11845 SLAB_ACCOUNT);
+11 -2
include/linux/sched/signal.h
··· 356 356 } 357 357 358 358 /* 359 + * Returns 'true' if kick_process() is needed to force a transition from 360 + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. 361 + */ 362 + static inline bool __set_notify_signal(struct task_struct *task) 363 + { 364 + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && 365 + !wake_up_state(task, TASK_INTERRUPTIBLE); 366 + } 367 + 368 + /* 359 369 * Called to break out of interruptible wait loops, and enter the 360 370 * exit_to_user_mode_loop(). 361 371 */ 362 372 static inline void set_notify_signal(struct task_struct *task) 363 373 { 364 - if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && 365 - !wake_up_state(task, TASK_INTERRUPTIBLE)) 374 + if (__set_notify_signal(task)) 366 375 kick_process(task); 367 376 } 368 377
+1
include/linux/task_work.h
··· 17 17 TWA_NONE, 18 18 TWA_RESUME, 19 19 TWA_SIGNAL, 20 + TWA_SIGNAL_NO_IPI, 20 21 }; 21 22 22 23 static inline bool task_work_pending(struct task_struct *task)
+41 -1
include/trace/events/io_uring.h
··· 530 530 ), 531 531 532 532 TP_printk("ring %p, req %p, user_data 0x%llx, " 533 - "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " 533 + "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " 534 534 "len=%u, rw_flags=0x%x, buf_index=%d, " 535 535 "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", 536 536 __entry->ctx, __entry->req, __entry->user_data, ··· 541 541 __entry->buf_index, __entry->personality, __entry->file_index, 542 542 (unsigned long long) __entry->pad1, 543 543 (unsigned long long) __entry->pad2, __entry->error) 544 + ); 545 + 546 + 547 + /* 548 + * io_uring_cqe_overflow - a CQE overflowed 549 + * 550 + * @ctx: pointer to a ring context structure 551 + * @user_data: user data associated with the request 552 + * @res: CQE result 553 + * @cflags: CQE flags 554 + * @ocqe: pointer to the overflow cqe (if available) 555 + * 556 + */ 557 + TRACE_EVENT(io_uring_cqe_overflow, 558 + 559 + TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, 560 + void *ocqe), 561 + 562 + TP_ARGS(ctx, user_data, res, cflags, ocqe), 563 + 564 + TP_STRUCT__entry ( 565 + __field( void *, ctx ) 566 + __field( unsigned long long, user_data ) 567 + __field( s32, res ) 568 + __field( u32, cflags ) 569 + __field( void *, ocqe ) 570 + ), 571 + 572 + TP_fast_assign( 573 + __entry->ctx = ctx; 574 + __entry->user_data = user_data; 575 + __entry->res = res; 576 + __entry->cflags = cflags; 577 + __entry->ocqe = ocqe; 578 + ), 579 + 580 + TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " 581 + "overflow_cqe %p", 582 + __entry->ctx, __entry->user_data, __entry->res, 583 + __entry->cflags, __entry->ocqe) 544 584 ); 545 585 546 586 #endif /* _TRACE_IO_URING_H */
+37
include/uapi/linux/io_uring.h
··· 102 102 #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ 103 103 #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ 104 104 #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ 105 + /* 106 + * Cooperative task running. When requests complete, they often require 107 + * forcing the submitter to transition to the kernel to complete. If this 108 + * flag is set, work will be done when the task transitions anyway, rather 109 + * than force an inter-processor interrupt reschedule. This avoids interrupting 110 + * a task running in userspace, and saves an IPI. 111 + */ 112 + #define IORING_SETUP_COOP_TASKRUN (1U << 8) 113 + /* 114 + * If COOP_TASKRUN is set, get notified if task work is available for 115 + * running and a kernel transition would be needed to run it. This sets 116 + * IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN. 117 + */ 118 + #define IORING_SETUP_TASKRUN_FLAG (1U << 9) 105 119 106 120 enum { 107 121 IORING_OP_NOP, ··· 202 188 #define IORING_POLL_UPDATE_USER_DATA (1U << 2) 203 189 204 190 /* 191 + * ASYNC_CANCEL flags. 192 + * 193 + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key 194 + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the 195 + * request 'user_data' 196 + * IORING_ASYNC_CANCEL_ANY Match any request 197 + */ 198 + #define IORING_ASYNC_CANCEL_ALL (1U << 0) 199 + #define IORING_ASYNC_CANCEL_FD (1U << 1) 200 + #define IORING_ASYNC_CANCEL_ANY (1U << 2) 201 + 202 + /* 203 + * send/sendmsg and recv/recvmsg flags (sqe->addr2) 204 + * 205 + * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send 206 + * or receive and arm poll if that yields an 207 + * -EAGAIN result, arm poll upfront and skip 208 + * the initial transfer attempt. 
209 + */ 210 + #define IORING_RECVSEND_POLL_FIRST (1U << 0) 211 + 212 + /* 205 213 * IO completion data structure (Completion Queue Entry) 206 214 */ 207 215 struct io_uring_cqe { ··· 272 236 */ 273 237 #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ 274 238 #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ 239 + #define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */ 275 240 276 241 struct io_cqring_offsets { 277 242 __u32 head;
+19 -6
kernel/task_work.c
··· 12 12 * @notify: how to notify the targeted task 13 13 * 14 14 * Queue @work for task_work_run() below and notify the @task if @notify 15 - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the 16 - * it will interrupt the targeted task and run the task_work. @TWA_RESUME 17 - * work is run only when the task exits the kernel and returns to user mode, 18 - * or before entering guest mode. Fails if the @task is exiting/exited and thus 19 - * it can't process this @work. Otherwise @work->func() will be called when the 20 - * @task goes through one of the aforementioned transitions, or exits. 15 + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. 16 + * 17 + * @TWA_SIGNAL works like signals, in that it will interrupt the targeted 18 + * task and run the task_work, regardless of whether the task is currently 19 + * running in the kernel or userspace. 20 + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a 21 + * reschedule IPI to force the targeted task to reschedule and run task_work. 22 + * This can be advantageous if there's no strict requirement that the 23 + * task_work be run as soon as possible, just whenever the task enters the 24 + * kernel anyway. 25 + * @TWA_RESUME work is run only when the task exits the kernel and returns to 26 + * user mode, or before entering guest mode. 27 + * 28 + * Fails if the @task is exiting/exited and thus it can't process this @work. 29 + * Otherwise @work->func() will be called when the @task goes through one of 30 + * the aforementioned transitions, or exits. 21 31 * 22 32 * If the targeted task is exiting, then an error is returned and the work item 23 33 * is not queued. It's up to the caller to arrange for an alternative mechanism ··· 62 52 break; 63 53 case TWA_SIGNAL: 64 54 set_notify_signal(task); 55 + break; 56 + case TWA_SIGNAL_NO_IPI: 57 + __set_notify_signal(task); 65 58 break; 66 59 default: 67 60 WARN_ON_ONCE(1);