Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

io_uring: make ctx->timeout_lock a raw spinlock

Chase reports that their tester complains about a locking context
mismatch:

=============================
[ BUG: Invalid wait context ]
6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted
-----------------------------
syz.1.25198/182604 is trying to lock:
ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq
include/linux/spinlock.h:376 [inline]
ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at:
io_match_task_safe io_uring/io_uring.c:218 [inline]
ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at:
io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204
other info that might help us debug this:
context-{5:5}
1 lock held by syz.1.25198/182604:
#0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at:
io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049
stack backtrace:
CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted
6.13.0-rc1-gf137f14b7ccb-dirty #9
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:94 [inline]
dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120
print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline]
check_wait_context kernel/locking/lockdep.c:4898 [inline]
__lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176
lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849
__raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline]
_raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170
spin_lock_irq include/linux/spinlock.h:376 [inline]
io_match_task_safe io_uring/io_uring.c:218 [inline]
io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204
io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052
io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline]
io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112
io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062
io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140
io_uring_files_cancel include/linux/io_uring.h:20 [inline]
do_exit+0x494/0x27a0 kernel/exit.c:894
do_group_exit+0xb3/0x250 kernel/exit.c:1087
get_signal+0x1d77/0x1ef0 kernel/signal.c:3017
arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337
exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline]
__syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218
do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89
entry_SYSCALL_64_after_hwframe+0x77/0x7f

which is because io_uring has ctx->timeout_lock nesting inside the
io-wq acct lock, the latter of which is used from inside the scheduler
and hence is a raw spinlock, while the former is a "normal" spinlock
and can hence be sleeping on PREEMPT_RT.

Change ctx->timeout_lock to be a raw spinlock to solve this nesting
dependency on PREEMPT_RT=y.

Reported-by: chase xd <sl1589472800@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

+26 -26
+1 -1
include/linux/io_uring_types.h
··· 345 345 346 346 /* timeouts */ 347 347 struct { 348 - spinlock_t timeout_lock; 348 + raw_spinlock_t timeout_lock; 349 349 struct list_head timeout_list; 350 350 struct list_head ltimeout_list; 351 351 unsigned cq_last_tm_flush;
+5 -5
io_uring/io_uring.c
··· 215 215 struct io_ring_ctx *ctx = head->ctx; 216 216 217 217 /* protect against races with linked timeouts */ 218 - spin_lock_irq(&ctx->timeout_lock); 218 + raw_spin_lock_irq(&ctx->timeout_lock); 219 219 matched = io_match_linked(head); 220 - spin_unlock_irq(&ctx->timeout_lock); 220 + raw_spin_unlock_irq(&ctx->timeout_lock); 221 221 } else { 222 222 matched = io_match_linked(head); 223 223 } ··· 333 333 init_waitqueue_head(&ctx->cq_wait); 334 334 init_waitqueue_head(&ctx->poll_wq); 335 335 spin_lock_init(&ctx->completion_lock); 336 - spin_lock_init(&ctx->timeout_lock); 336 + raw_spin_lock_init(&ctx->timeout_lock); 337 337 INIT_WQ_LIST(&ctx->iopoll_list); 338 338 INIT_LIST_HEAD(&ctx->io_buffers_comp); 339 339 INIT_LIST_HEAD(&ctx->defer_list); ··· 498 498 if (req->flags & REQ_F_LINK_TIMEOUT) { 499 499 struct io_ring_ctx *ctx = req->ctx; 500 500 501 - spin_lock_irq(&ctx->timeout_lock); 501 + raw_spin_lock_irq(&ctx->timeout_lock); 502 502 io_for_each_link(cur, req) 503 503 io_prep_async_work(cur); 504 - spin_unlock_irq(&ctx->timeout_lock); 504 + raw_spin_unlock_irq(&ctx->timeout_lock); 505 505 } else { 506 506 io_for_each_link(cur, req) 507 507 io_prep_async_work(cur);
+20 -20
io_uring/timeout.c
··· 74 74 if (!io_timeout_finish(timeout, data)) { 75 75 if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { 76 76 /* re-arm timer */ 77 - spin_lock_irq(&ctx->timeout_lock); 77 + raw_spin_lock_irq(&ctx->timeout_lock); 78 78 list_add(&timeout->list, ctx->timeout_list.prev); 79 79 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 80 - spin_unlock_irq(&ctx->timeout_lock); 80 + raw_spin_unlock_irq(&ctx->timeout_lock); 81 81 return; 82 82 } 83 83 } ··· 109 109 u32 seq; 110 110 struct io_timeout *timeout, *tmp; 111 111 112 - spin_lock_irq(&ctx->timeout_lock); 112 + raw_spin_lock_irq(&ctx->timeout_lock); 113 113 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 114 114 115 115 list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { ··· 134 134 io_kill_timeout(req, 0); 135 135 } 136 136 ctx->cq_last_tm_flush = seq; 137 - spin_unlock_irq(&ctx->timeout_lock); 137 + raw_spin_unlock_irq(&ctx->timeout_lock); 138 138 } 139 139 140 140 static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) ··· 200 200 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 201 201 struct io_ring_ctx *ctx = req->ctx; 202 202 203 - spin_lock_irq(&ctx->timeout_lock); 203 + raw_spin_lock_irq(&ctx->timeout_lock); 204 204 link = io_disarm_linked_timeout(req); 205 - spin_unlock_irq(&ctx->timeout_lock); 205 + raw_spin_unlock_irq(&ctx->timeout_lock); 206 206 if (link) 207 207 io_req_queue_tw_complete(link, -ECANCELED); 208 208 } ··· 238 238 struct io_ring_ctx *ctx = req->ctx; 239 239 unsigned long flags; 240 240 241 - spin_lock_irqsave(&ctx->timeout_lock, flags); 241 + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); 242 242 list_del_init(&timeout->list); 243 243 atomic_set(&req->ctx->cq_timeouts, 244 244 atomic_read(&req->ctx->cq_timeouts) + 1); 245 - spin_unlock_irqrestore(&ctx->timeout_lock, flags); 245 + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); 246 246 247 247 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) 248 248 
req_set_fail(req); ··· 285 285 { 286 286 struct io_kiocb *req; 287 287 288 - spin_lock_irq(&ctx->timeout_lock); 288 + raw_spin_lock_irq(&ctx->timeout_lock); 289 289 req = io_timeout_extract(ctx, cd); 290 - spin_unlock_irq(&ctx->timeout_lock); 290 + raw_spin_unlock_irq(&ctx->timeout_lock); 291 291 292 292 if (IS_ERR(req)) 293 293 return PTR_ERR(req); ··· 330 330 struct io_ring_ctx *ctx = req->ctx; 331 331 unsigned long flags; 332 332 333 - spin_lock_irqsave(&ctx->timeout_lock, flags); 333 + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); 334 334 prev = timeout->head; 335 335 timeout->head = NULL; 336 336 ··· 345 345 } 346 346 list_del(&timeout->list); 347 347 timeout->prev = prev; 348 - spin_unlock_irqrestore(&ctx->timeout_lock, flags); 348 + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); 349 349 350 350 req->io_task_work.func = io_req_task_link_timeout; 351 351 io_req_task_work_add(req); ··· 472 472 } else { 473 473 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); 474 474 475 - spin_lock_irq(&ctx->timeout_lock); 475 + raw_spin_lock_irq(&ctx->timeout_lock); 476 476 if (tr->ltimeout) 477 477 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 478 478 else 479 479 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 480 - spin_unlock_irq(&ctx->timeout_lock); 480 + raw_spin_unlock_irq(&ctx->timeout_lock); 481 481 } 482 482 483 483 if (ret < 0) ··· 572 572 struct list_head *entry; 573 573 u32 tail, off = timeout->off; 574 574 575 - spin_lock_irq(&ctx->timeout_lock); 575 + raw_spin_lock_irq(&ctx->timeout_lock); 576 576 577 577 /* 578 578 * sqe->off holds how many events that need to occur for this ··· 611 611 list_add(&timeout->list, entry); 612 612 data->timer.function = io_timeout_fn; 613 613 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 614 - spin_unlock_irq(&ctx->timeout_lock); 614 + raw_spin_unlock_irq(&ctx->timeout_lock); 615 615 return IOU_ISSUE_SKIP_COMPLETE; 616 616 } 617 617 ··· 620 620 struct 
io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 621 621 struct io_ring_ctx *ctx = req->ctx; 622 622 623 - spin_lock_irq(&ctx->timeout_lock); 623 + raw_spin_lock_irq(&ctx->timeout_lock); 624 624 /* 625 625 * If the back reference is NULL, then our linked request finished 626 626 * before we got a chance to setup the timer ··· 633 633 data->mode); 634 634 list_add_tail(&timeout->list, &ctx->ltimeout_list); 635 635 } 636 - spin_unlock_irq(&ctx->timeout_lock); 636 + raw_spin_unlock_irq(&ctx->timeout_lock); 637 637 /* drop submission reference */ 638 638 io_put_req(req); 639 639 } ··· 668 668 * timeout_lockfirst to keep locking ordering. 669 669 */ 670 670 spin_lock(&ctx->completion_lock); 671 - spin_lock_irq(&ctx->timeout_lock); 671 + raw_spin_lock_irq(&ctx->timeout_lock); 672 672 list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { 673 673 struct io_kiocb *req = cmd_to_io_kiocb(timeout); 674 674 ··· 676 676 io_kill_timeout(req, -ECANCELED)) 677 677 canceled++; 678 678 } 679 - spin_unlock_irq(&ctx->timeout_lock); 679 + raw_spin_unlock_irq(&ctx->timeout_lock); 680 680 spin_unlock(&ctx->completion_lock); 681 681 return canceled != 0; 682 682 }