Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

block: Remove queue freezing from several sysfs store callbacks

Freezing the request queue from inside sysfs store callbacks may cause a
deadlock in combination with the dm-multipath driver and the
queue_if_no_path option. Additionally, freezing the request queue slows
down system boot on systems where sysfs attributes are set synchronously.

Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue()
calls from the store callbacks that do not strictly need these callbacks.
Add the __data_racy annotation to request_queue.rq_timeout to suppress
KCSAN data race reports about the rq_timeout reads.

This patch may cause a small delay in applying the new settings.

For all the attributes affected by this patch, I/O will complete
correctly whether the old or the new value of the attribute is used.

This patch affects the following sysfs attributes:
* io_poll_delay
* io_timeout
* nomerges
* read_ahead_kb
* rq_affinity

Here is an example of a deadlock triggered by running test srp/002
if this patch is not applied:

task:multipathd
Call Trace:
<TASK>
__schedule+0x8c1/0x1bf0
schedule+0xdd/0x270
schedule_preempt_disabled+0x1c/0x30
__mutex_lock+0xb89/0x1650
mutex_lock_nested+0x1f/0x30
dm_table_set_restrictions+0x823/0xdf0
__bind+0x166/0x590
dm_swap_table+0x2a7/0x490
do_resume+0x1b1/0x610
dev_suspend+0x55/0x1a0
ctl_ioctl+0x3a5/0x7e0
dm_ctl_ioctl+0x12/0x20
__x64_sys_ioctl+0x127/0x1a0
x64_sys_call+0xe2b/0x17d0
do_syscall_64+0x96/0x3a0
entry_SYSCALL_64_after_hwframe+0x4b/0x53
</TASK>
task:(udev-worker)
Call Trace:
<TASK>
__schedule+0x8c1/0x1bf0
schedule+0xdd/0x270
blk_mq_freeze_queue_wait+0xf2/0x140
blk_mq_freeze_queue_nomemsave+0x23/0x30
queue_ra_store+0x14e/0x290
queue_attr_store+0x23e/0x2c0
sysfs_kf_write+0xde/0x140
kernfs_fop_write_iter+0x3b2/0x630
vfs_write+0x4fd/0x1390
ksys_write+0xfd/0x230
__x64_sys_write+0x76/0xc0
x64_sys_call+0x276/0x17d0
do_syscall_64+0x96/0x3a0
entry_SYSCALL_64_after_hwframe+0x4b/0x53
</TASK>

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Nilay Shroff <nilay@linux.ibm.com>
Cc: Martin Wilck <mwilck@suse.com>
Cc: Benjamin Marzinski <bmarzins@redhat.com>
Cc: stable@vger.kernel.org
Fixes: af2814149883 ("block: freeze the queue in queue_attr_store")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Bart Van Assche and committed by
Jens Axboe
935a20d1 42adb2d4

+9 -19
+8 -18
block/blk-sysfs.c
··· 143 143 { 144 144 unsigned long ra_kb; 145 145 ssize_t ret; 146 - unsigned int memflags; 147 146 struct request_queue *q = disk->queue; 148 147 149 148 ret = queue_var_store(&ra_kb, page, count); 150 149 if (ret < 0) 151 150 return ret; 152 151 /* 153 - * ->ra_pages is protected by ->limits_lock because it is usually 154 - * calculated from the queue limits by queue_limits_commit_update. 152 + * The ->ra_pages change below is protected by ->limits_lock because it 153 + * is usually calculated from the queue limits by 154 + * queue_limits_commit_update(). 155 + * 156 + * bdi->ra_pages reads are not serialized against bdi->ra_pages writes. 157 + * Use WRITE_ONCE() to write bdi->ra_pages once. 155 158 */ 156 159 mutex_lock(&q->limits_lock); 157 - memflags = blk_mq_freeze_queue(q); 158 - disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); 160 + WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10)); 159 161 mutex_unlock(&q->limits_lock); 160 - blk_mq_unfreeze_queue(q, memflags); 161 162 162 163 return ret; 163 164 } ··· 376 375 size_t count) 377 376 { 378 377 unsigned long nm; 379 - unsigned int memflags; 380 378 struct request_queue *q = disk->queue; 381 379 ssize_t ret = queue_var_store(&nm, page, count); 382 380 383 381 if (ret < 0) 384 382 return ret; 385 383 386 - memflags = blk_mq_freeze_queue(q); 387 384 blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); 388 385 blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); 389 386 if (nm == 2) 390 387 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); 391 388 else if (nm) 392 389 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 393 - blk_mq_unfreeze_queue(q, memflags); 394 390 395 391 return ret; 396 392 } ··· 407 409 #ifdef CONFIG_SMP 408 410 struct request_queue *q = disk->queue; 409 411 unsigned long val; 410 - unsigned int memflags; 411 412 412 413 ret = queue_var_store(&val, page, count); 413 414 if (ret < 0) ··· 418 421 * are accessed individually using atomic test_bit operation. So we 419 422 * don't grab any lock while updating these flags. 420 423 */ 421 - memflags = blk_mq_freeze_queue(q); 422 424 if (val == 2) { 423 425 blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 424 426 blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); ··· 428 432 blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 429 433 blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 430 434 } 431 - blk_mq_unfreeze_queue(q, memflags); 432 435 #endif 433 436 return ret; 434 437 } ··· 441 446 static ssize_t queue_poll_store(struct gendisk *disk, const char *page, 442 447 size_t count) 443 448 { 444 - unsigned int memflags; 445 449 ssize_t ret = count; 446 450 struct request_queue *q = disk->queue; 447 451 448 - memflags = blk_mq_freeze_queue(q); 449 452 if (!(q->limits.features & BLK_FEAT_POLL)) { 450 453 ret = -EINVAL; 451 454 goto out; ··· 452 459 pr_info_ratelimited("writes to the poll attribute are ignored.\n"); 453 460 pr_info_ratelimited("please use driver specific parameters instead.\n"); 454 461 out: 455 - blk_mq_unfreeze_queue(q, memflags); 456 462 return ret; 457 463 } 458 464 ··· 464 472 static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, 465 473 size_t count) 466 474 { 467 - unsigned int val, memflags; 475 + unsigned int val; 468 476 int err; 469 477 struct request_queue *q = disk->queue; 470 478 ··· 472 480 if (err || val == 0) 473 481 return -EINVAL; 474 482 475 - memflags = blk_mq_freeze_queue(q); 476 483 blk_queue_rq_timeout(q, msecs_to_jiffies(val)); 477 - blk_mq_unfreeze_queue(q, memflags); 478 484 479 485 return count; 480 486 }
+1 -1
include/linux/blkdev.h
··· 495 495 */ 496 496 unsigned long queue_flags; 497 497 498 - unsigned int rq_timeout; 498 + unsigned int __data_racy rq_timeout; 499 499 500 500 unsigned int queue_depth; 501 501