Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

blk-mq: drain I/O when all CPUs in a hctx are offline

Most of blk-mq drivers depend on managed IRQ's auto-affinity to set
up queue mapping. Thomas mentioned the following point[1]:

"That was the constraint of managed interrupts from the very beginning:

The driver/subsystem has to quiesce the interrupt line and the associated
queue _before_ it gets shutdown in CPU unplug and not fiddle with it
until it's restarted by the core when the CPU is plugged in again."

However, current blk-mq implementation doesn't quiesce hw queue before
the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is a
cpuhp state handled after the CPU is down, so there isn't any chance to
quiesce the hctx before shutting down the CPU.

Add new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq hctxs
where the last CPU goes away, and wait for completion of in-flight
requests. This guarantees that there is no inflight I/O before shutting
down the managed IRQ.

Add a BLK_MQ_F_STACKING and set it for dm-rq and loop, so we don't need
to wait for completion of in-flight requests from these drivers to avoid
a potential deadlock. It is safe to do this for stacking drivers as those
do not use interrupts at all and their I/O completions are triggered by
the underlying devices' I/O completion.

[1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/

[hch: different retry mechanism, merged two patches, minor cleanups]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Ming Lei and committed by
Jens Axboe
bf0beec0 602380d2

+133 -4
+2
block/blk-mq-debugfs.c
··· 213 213 HCTX_STATE_NAME(STOPPED), 214 214 HCTX_STATE_NAME(TAG_ACTIVE), 215 215 HCTX_STATE_NAME(SCHED_RESTART), 216 + HCTX_STATE_NAME(INACTIVE), 216 217 }; 217 218 #undef HCTX_STATE_NAME 218 219 ··· 240 239 HCTX_FLAG_NAME(TAG_SHARED), 241 240 HCTX_FLAG_NAME(BLOCKING), 242 241 HCTX_FLAG_NAME(NO_SCHED), 242 + HCTX_FLAG_NAME(STACKING), 243 243 }; 244 244 #undef HCTX_FLAG_NAME 245 245
+8
block/blk-mq-tag.c
··· 180 180 sbitmap_finish_wait(bt, ws, &wait); 181 181 182 182 found_tag: 183 + /* 184 + * Give up this allocation if the hctx is inactive. The caller will 185 + * retry on an active hctx. 186 + */ 187 + if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { 188 + blk_mq_put_tag(tags, data->ctx, tag + tag_offset); 189 + return BLK_MQ_NO_TAG; 190 + } 183 191 return tag + tag_offset; 184 192 } 185 193
+110 -2
block/blk-mq.c
··· 375 375 e->type->ops.limit_depth(data->cmd_flags, data); 376 376 } 377 377 378 + retry: 378 379 data->ctx = blk_mq_get_ctx(q); 379 380 data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); 380 381 if (!(data->flags & BLK_MQ_REQ_INTERNAL)) 381 382 blk_mq_tag_busy(data->hctx); 382 383 384 + /* 385 + * Waiting allocations only fail because of an inactive hctx. In that 386 + * case just retry the hctx assignment and tag allocation as CPU hotplug 387 + * should have migrated us to an online CPU by now. 388 + */ 383 389 tag = blk_mq_get_tag(data); 384 - if (tag == BLK_MQ_NO_TAG) 385 - return NULL; 390 + if (tag == BLK_MQ_NO_TAG) { 391 + if (data->flags & BLK_MQ_REQ_NOWAIT) 392 + return NULL; 393 + 394 + /* 395 + * Give up the CPU and sleep for a random short time to ensure 396 + * that thread using a realtime scheduling class are migrated 397 + * off the the CPU, and thus off the hctx that is going away. 398 + */ 399 + msleep(3); 400 + goto retry; 401 + } 386 402 return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); 387 403 } 388 404 ··· 2351 2335 return -ENOMEM; 2352 2336 } 2353 2337 2338 + struct rq_iter_data { 2339 + struct blk_mq_hw_ctx *hctx; 2340 + bool has_rq; 2341 + }; 2342 + 2343 + static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) 2344 + { 2345 + struct rq_iter_data *iter_data = data; 2346 + 2347 + if (rq->mq_hctx != iter_data->hctx) 2348 + return true; 2349 + iter_data->has_rq = true; 2350 + return false; 2351 + } 2352 + 2353 + static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) 2354 + { 2355 + struct blk_mq_tags *tags = hctx->sched_tags ? 
2356 + hctx->sched_tags : hctx->tags; 2357 + struct rq_iter_data data = { 2358 + .hctx = hctx, 2359 + }; 2360 + 2361 + blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); 2362 + return data.has_rq; 2363 + } 2364 + 2365 + static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, 2366 + struct blk_mq_hw_ctx *hctx) 2367 + { 2368 + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) 2369 + return false; 2370 + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) 2371 + return false; 2372 + return true; 2373 + } 2374 + 2375 + static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) 2376 + { 2377 + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 2378 + struct blk_mq_hw_ctx, cpuhp_online); 2379 + 2380 + if (!cpumask_test_cpu(cpu, hctx->cpumask) || 2381 + !blk_mq_last_cpu_in_hctx(cpu, hctx)) 2382 + return 0; 2383 + 2384 + /* 2385 + * Prevent new request from being allocated on the current hctx. 2386 + * 2387 + * The smp_mb__after_atomic() Pairs with the implied barrier in 2388 + * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is 2389 + * seen once we return from the tag allocator. 2390 + */ 2391 + set_bit(BLK_MQ_S_INACTIVE, &hctx->state); 2392 + smp_mb__after_atomic(); 2393 + 2394 + /* 2395 + * Try to grab a reference to the queue and wait for any outstanding 2396 + * requests. If we could not grab a reference the queue has been 2397 + * frozen and there are no requests. 
2398 + */ 2399 + if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { 2400 + while (blk_mq_hctx_has_requests(hctx)) 2401 + msleep(5); 2402 + percpu_ref_put(&hctx->queue->q_usage_counter); 2403 + } 2404 + 2405 + return 0; 2406 + } 2407 + 2408 + static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) 2409 + { 2410 + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 2411 + struct blk_mq_hw_ctx, cpuhp_online); 2412 + 2413 + if (cpumask_test_cpu(cpu, hctx->cpumask)) 2414 + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); 2415 + return 0; 2416 + } 2417 + 2354 2418 /* 2355 2419 * 'cpu' is going away. splice any existing rq_list entries from this 2356 2420 * software queue to the hw queue dispatch list, and ensure that it ··· 2444 2348 enum hctx_type type; 2445 2349 2446 2350 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2351 + if (!cpumask_test_cpu(cpu, hctx->cpumask)) 2352 + return 0; 2353 + 2447 2354 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2448 2355 type = hctx->type; 2449 2356 ··· 2470 2371 2471 2372 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 2472 2373 { 2374 + if (!(hctx->flags & BLK_MQ_F_STACKING)) 2375 + cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 2376 + &hctx->cpuhp_online); 2473 2377 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 2474 2378 &hctx->cpuhp_dead); 2475 2379 } ··· 2532 2430 { 2533 2431 hctx->queue_num = hctx_idx; 2534 2432 2433 + if (!(hctx->flags & BLK_MQ_F_STACKING)) 2434 + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 2435 + &hctx->cpuhp_online); 2535 2436 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 2536 2437 2537 2438 hctx->tags = set->tags[hctx_idx]; ··· 3789 3684 { 3790 3685 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 3791 3686 blk_mq_hctx_notify_dead); 3687 + cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", 3688 + blk_mq_hctx_notify_online, 3689 + blk_mq_hctx_notify_offline); 3792 
3690 return 0; 3793 3691 } 3794 3692 subsys_initcall(blk_mq_init);
+1 -1
drivers/block/loop.c
··· 2037 2037 lo->tag_set.queue_depth = 128; 2038 2038 lo->tag_set.numa_node = NUMA_NO_NODE; 2039 2039 lo->tag_set.cmd_size = sizeof(struct loop_cmd); 2040 - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 2040 + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; 2041 2041 lo->tag_set.driver_data = lo; 2042 2042 2043 2043 err = blk_mq_alloc_tag_set(&lo->tag_set);
+1 -1
drivers/md/dm-rq.c
··· 547 547 md->tag_set->ops = &dm_mq_ops; 548 548 md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); 549 549 md->tag_set->numa_node = md->numa_node_id; 550 - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; 550 + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; 551 551 md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); 552 552 md->tag_set->driver_data = md; 553 553
+10
include/linux/blk-mq.h
··· 140 140 */ 141 141 atomic_t nr_active; 142 142 143 + /** @cpuhp_online: List to store request if CPU is going to die */ 144 + struct hlist_node cpuhp_online; 143 145 /** @cpuhp_dead: List to store request if some CPU die. */ 144 146 struct hlist_node cpuhp_dead; 145 147 /** @kobj: Kernel object for sysfs. */ ··· 393 391 enum { 394 392 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 395 393 BLK_MQ_F_TAG_SHARED = 1 << 1, 394 + /* 395 + * Set when this device requires underlying blk-mq device for 396 + * completing IO: 397 + */ 398 + BLK_MQ_F_STACKING = 1 << 2, 396 399 BLK_MQ_F_BLOCKING = 1 << 5, 397 400 BLK_MQ_F_NO_SCHED = 1 << 6, 398 401 BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, ··· 406 399 BLK_MQ_S_STOPPED = 0, 407 400 BLK_MQ_S_TAG_ACTIVE = 1, 408 401 BLK_MQ_S_SCHED_RESTART = 2, 402 + 403 + /* hw queue is inactive after all its CPUs become offline */ 404 + BLK_MQ_S_INACTIVE = 3, 409 405 410 406 BLK_MQ_MAX_DEPTH = 10240, 411 407
+1
include/linux/cpuhotplug.h
··· 152 152 CPUHP_AP_SMPBOOT_THREADS, 153 153 CPUHP_AP_X86_VDSO_VMA_ONLINE, 154 154 CPUHP_AP_IRQ_AFFINITY_ONLINE, 155 + CPUHP_AP_BLK_MQ_ONLINE, 155 156 CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, 156 157 CPUHP_AP_X86_INTEL_EPB_ONLINE, 157 158 CPUHP_AP_PERF_ONLINE,