Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

blk-mq: improve support for shared tags maps

This adds support for active queue tracking, meaning that the
blk-mq tagging maintains a count of active users of a tag set.
This allows us to maintain a notion of fairness between users,
so that we can distribute the tag depth evenly, without starving
some users while others claim unfairly deep queues.

If sharing of a tag set is detected, each hardware queue will
track the depth of its own queue. If that depth exceeds the total
depth divided by the number of active queues, the user is actively
throttled down.

The active queue count is maintained lazily to avoid bouncing that
data between submitter and completer. Each hardware queue gets marked
active when it allocates its first tag, and gets marked inactive
when 1) the last tag is cleared, and 2) the queue timeout grace
period has passed.

Signed-off-by: Jens Axboe <axboe@fb.com>

+236 -27
+10
block/blk-mq-sysfs.c
··· 208 208 return blk_mq_tag_sysfs_show(hctx->tags, page); 209 209 } 210 210 211 + static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) 212 + { 213 + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); 214 + } 215 + 211 216 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) 212 217 { 213 218 unsigned int i, first = 1; ··· 272 267 .attr = {.name = "dispatched", .mode = S_IRUGO }, 273 268 .show = blk_mq_hw_sysfs_dispatched_show, 274 269 }; 270 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { 271 + .attr = {.name = "active", .mode = S_IRUGO }, 272 + .show = blk_mq_hw_sysfs_active_show, 273 + }; 275 274 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 276 275 .attr = {.name = "pending", .mode = S_IRUGO }, 277 276 .show = blk_mq_hw_sysfs_rq_list_show, ··· 296 287 &blk_mq_hw_sysfs_pending.attr, 297 288 &blk_mq_hw_sysfs_tags.attr, 298 289 &blk_mq_hw_sysfs_cpus.attr, 290 + &blk_mq_hw_sysfs_active.attr, 299 291 NULL, 300 292 }; 301 293
+95 -17
block/blk-mq-tag.c
··· 7 7 #include "blk-mq.h" 8 8 #include "blk-mq-tag.h" 9 9 10 - void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, 11 - bool reserved) 10 + void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved) 12 11 { 13 12 int tag, zero = 0; 14 13 15 - tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved); 16 - blk_mq_put_tag(tags, tag, &zero); 14 + tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved); 15 + blk_mq_put_tag(hctx, tag, &zero); 17 16 } 18 17 19 18 static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) ··· 37 38 return true; 38 39 39 40 return bt_has_free_tags(&tags->bitmap_tags); 41 + } 42 + 43 + static inline void bt_index_inc(unsigned int *index) 44 + { 45 + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); 46 + } 47 + 48 + /* 49 + * If a previously inactive queue goes active, bump the active user count. 50 + */ 51 + bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 52 + { 53 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && 54 + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 55 + atomic_inc(&hctx->tags->active_queues); 56 + 57 + return true; 58 + } 59 + 60 + /* 61 + * If a previously busy queue goes inactive, potential waiters could now 62 + * be allowed to queue. Wake them up and check. 
63 + */ 64 + void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) 65 + { 66 + struct blk_mq_tags *tags = hctx->tags; 67 + struct blk_mq_bitmap_tags *bt; 68 + int i, wake_index; 69 + 70 + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 71 + return; 72 + 73 + atomic_dec(&tags->active_queues); 74 + 75 + /* 76 + * Will only throttle depth on non-reserved tags 77 + */ 78 + bt = &tags->bitmap_tags; 79 + wake_index = bt->wake_index; 80 + for (i = 0; i < BT_WAIT_QUEUES; i++) { 81 + struct bt_wait_state *bs = &bt->bs[wake_index]; 82 + 83 + if (waitqueue_active(&bs->wait)) 84 + wake_up(&bs->wait); 85 + 86 + bt_index_inc(&wake_index); 87 + } 88 + } 89 + 90 + /* 91 + * For shared tag users, we track the number of currently active users 92 + * and attempt to provide a fair share of the tag depth for each of them. 93 + */ 94 + static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, 95 + struct blk_mq_bitmap_tags *bt) 96 + { 97 + unsigned int depth, users; 98 + 99 + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) 100 + return true; 101 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 102 + return true; 103 + 104 + /* 105 + * Don't try dividing an ant 106 + */ 107 + if (bt->depth == 1) 108 + return true; 109 + 110 + users = atomic_read(&hctx->tags->active_queues); 111 + if (!users) 112 + return true; 113 + 114 + /* 115 + * Allow at least some tags 116 + */ 117 + depth = max((bt->depth + users - 1) / users, 4U); 118 + return atomic_read(&hctx->nr_active) < depth; 40 119 } 41 120 42 121 static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) ··· 155 78 * multiple users will tend to stick to different cachelines, at least 156 79 * until the map is exhausted. 
157 80 */ 158 - static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache) 81 + static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, 82 + unsigned int *tag_cache) 159 83 { 160 84 unsigned int last_tag, org_last_tag; 161 85 int index, i, tag; 86 + 87 + if (!hctx_may_queue(hctx, bt)) 88 + return -1; 162 89 163 90 last_tag = org_last_tag = *tag_cache; 164 91 index = TAG_TO_INDEX(bt, last_tag); ··· 198 117 return tag; 199 118 } 200 119 201 - static inline void bt_index_inc(unsigned int *index) 202 - { 203 - *index = (*index + 1) & (BT_WAIT_QUEUES - 1); 204 - } 205 - 206 120 static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, 207 121 struct blk_mq_hw_ctx *hctx) 208 122 { ··· 218 142 DEFINE_WAIT(wait); 219 143 int tag; 220 144 221 - tag = __bt_get(bt, last_tag); 145 + tag = __bt_get(hctx, bt, last_tag); 222 146 if (tag != -1) 223 147 return tag; 224 148 ··· 232 156 was_empty = list_empty(&wait.task_list); 233 157 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); 234 158 235 - tag = __bt_get(bt, last_tag); 159 + tag = __bt_get(hctx, bt, last_tag); 236 160 if (tag != -1) 237 161 break; 238 162 ··· 276 200 return tag; 277 201 } 278 202 279 - unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, 280 - struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, 203 + unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, 281 204 gfp_t gfp, bool reserved) 282 205 { 283 206 if (!reserved) 284 - return __blk_mq_get_tag(tags, hctx, last_tag, gfp); 207 + return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); 285 208 286 - return __blk_mq_get_reserved_tag(tags, gfp); 209 + return __blk_mq_get_reserved_tag(hctx->tags, gfp); 287 210 } 288 211 289 212 static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) ··· 340 265 bt_clear_tag(&tags->breserved_tags, tag); 341 266 } 342 267 343 - void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, 268 + void blk_mq_put_tag(struct 
blk_mq_hw_ctx *hctx, unsigned int tag, 344 269 unsigned int *last_tag) 345 270 { 271 + struct blk_mq_tags *tags = hctx->tags; 272 + 346 273 if (tag >= tags->nr_reserved_tags) { 347 274 const int real_tag = tag - tags->nr_reserved_tags; 348 275 ··· 542 465 res = bt_unused_tags(&tags->breserved_tags); 543 466 544 467 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); 468 + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); 545 469 546 470 return page - orig_page; 547 471 }
+24 -3
block/blk-mq-tag.h
··· 38 38 unsigned int nr_tags; 39 39 unsigned int nr_reserved_tags; 40 40 41 + atomic_t active_queues; 42 + 41 43 struct blk_mq_bitmap_tags bitmap_tags; 42 44 struct blk_mq_bitmap_tags breserved_tags; 43 45 ··· 51 49 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 52 50 extern void blk_mq_free_tags(struct blk_mq_tags *tags); 53 51 54 - extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); 55 - extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved); 56 - extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag); 52 + extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); 53 + extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved); 54 + extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); 57 55 extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); 58 56 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 59 57 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); ··· 69 67 BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, 70 68 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 71 69 }; 70 + 71 + extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); 72 + extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); 73 + 74 + static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 75 + { 76 + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) 77 + return false; 78 + 79 + return __blk_mq_tag_busy(hctx); 80 + } 81 + 82 + static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) 83 + { 84 + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) 85 + return; 86 + 87 + __blk_mq_tag_idle(hctx); 88 + } 72 89 73 90 #endif
+79 -6
block/blk-mq.c
··· 80 80 struct request *rq; 81 81 unsigned int tag; 82 82 83 - tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved); 83 + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); 84 84 if (tag != BLK_MQ_TAG_FAIL) { 85 85 rq = hctx->tags->rqs[tag]; 86 + 87 + rq->cmd_flags = 0; 88 + if (blk_mq_tag_busy(hctx)) { 89 + rq->cmd_flags = REQ_MQ_INFLIGHT; 90 + atomic_inc(&hctx->nr_active); 91 + } 92 + 86 93 rq->tag = tag; 87 94 return rq; 88 95 } ··· 197 190 /* csd/requeue_work/fifo_time is initialized before use */ 198 191 rq->q = q; 199 192 rq->mq_ctx = ctx; 200 - rq->cmd_flags = rw_flags; 193 + rq->cmd_flags |= rw_flags; 201 194 rq->cmd_type = 0; 202 195 /* do not touch atomic flags, it needs atomic ops against the timer */ 203 196 rq->cpu = -1; ··· 269 262 break; 270 263 } 271 264 272 - blk_mq_wait_for_tags(hctx->tags, hctx, reserved); 265 + blk_mq_wait_for_tags(hctx, reserved); 273 266 } while (1); 274 267 275 268 return rq; ··· 310 303 const int tag = rq->tag; 311 304 struct request_queue *q = rq->q; 312 305 306 + if (rq->cmd_flags & REQ_MQ_INFLIGHT) 307 + atomic_dec(&hctx->nr_active); 308 + 313 309 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 314 - blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag); 310 + blk_mq_put_tag(hctx, tag, &ctx->last_tag); 315 311 blk_mq_queue_exit(q); 316 312 } 317 313 ··· 581 571 queue_for_each_hw_ctx(q, hctx, i) 582 572 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 583 573 584 - if (next_set) 585 - mod_timer(&q->timeout, round_jiffies_up(next)); 574 + if (next_set) { 575 + next = blk_rq_timeout(round_jiffies_up(next)); 576 + mod_timer(&q->timeout, next); 577 + } else { 578 + queue_for_each_hw_ctx(q, hctx, i) 579 + blk_mq_tag_idle(hctx); 580 + } 586 581 } 587 582 588 583 /* ··· 1454 1439 } 1455 1440 } 1456 1441 1442 + static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) 1443 + { 1444 + struct blk_mq_hw_ctx *hctx; 1445 + struct request_queue *q; 1446 + bool shared; 1447 + int i; 1448 + 1449 + if 
(set->tag_list.next == set->tag_list.prev) 1450 + shared = false; 1451 + else 1452 + shared = true; 1453 + 1454 + list_for_each_entry(q, &set->tag_list, tag_set_list) { 1455 + blk_mq_freeze_queue(q); 1456 + 1457 + queue_for_each_hw_ctx(q, hctx, i) { 1458 + if (shared) 1459 + hctx->flags |= BLK_MQ_F_TAG_SHARED; 1460 + else 1461 + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1462 + } 1463 + blk_mq_unfreeze_queue(q); 1464 + } 1465 + } 1466 + 1467 + static void blk_mq_del_queue_tag_set(struct request_queue *q) 1468 + { 1469 + struct blk_mq_tag_set *set = q->tag_set; 1470 + 1471 + blk_mq_freeze_queue(q); 1472 + 1473 + mutex_lock(&set->tag_list_lock); 1474 + list_del_init(&q->tag_set_list); 1475 + blk_mq_update_tag_set_depth(set); 1476 + mutex_unlock(&set->tag_list_lock); 1477 + 1478 + blk_mq_unfreeze_queue(q); 1479 + } 1480 + 1481 + static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1482 + struct request_queue *q) 1483 + { 1484 + q->tag_set = set; 1485 + 1486 + mutex_lock(&set->tag_list_lock); 1487 + list_add_tail(&q->tag_set_list, &set->tag_list); 1488 + blk_mq_update_tag_set_depth(set); 1489 + mutex_unlock(&set->tag_list_lock); 1490 + } 1491 + 1457 1492 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1458 1493 { 1459 1494 struct blk_mq_hw_ctx **hctxs; ··· 1529 1464 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) 1530 1465 goto err_hctxs; 1531 1466 1467 + atomic_set(&hctxs[i]->nr_active, 0); 1532 1468 hctxs[i]->numa_node = NUMA_NO_NODE; 1533 1469 hctxs[i]->queue_num = i; 1534 1470 } ··· 1582 1516 list_add_tail(&q->all_q_node, &all_q_list); 1583 1517 mutex_unlock(&all_q_mutex); 1584 1518 1519 + blk_mq_add_queue_tag_set(set, q); 1520 + 1585 1521 return q; 1586 1522 1587 1523 err_flush_rq: ··· 1610 1542 { 1611 1543 struct blk_mq_hw_ctx *hctx; 1612 1544 int i; 1545 + 1546 + blk_mq_del_queue_tag_set(q); 1613 1547 1614 1548 queue_for_each_hw_ctx(q, hctx, i) { 1615 1549 kfree(hctx->ctx_map); ··· 1704 1634 if (!set->tags[i]) 1705 1635 goto 
out_unwind; 1706 1636 } 1637 + 1638 + mutex_init(&set->tag_list_lock); 1639 + INIT_LIST_HEAD(&set->tag_list); 1707 1640 1708 1641 return 0; 1709 1642
+12 -1
block/blk-timeout.c
··· 166 166 } 167 167 EXPORT_SYMBOL_GPL(blk_abort_request); 168 168 169 + unsigned long blk_rq_timeout(unsigned long timeout) 170 + { 171 + unsigned long maxt; 172 + 173 + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); 174 + if (time_after(timeout, maxt)) 175 + timeout = maxt; 176 + 177 + return timeout; 178 + } 179 + 169 180 /** 170 181 * blk_add_timer - Start timeout timer for a single request 171 182 * @req: request that is about to start running. ··· 211 200 * than an existing one, modify the timer. Round up to next nearest 212 201 * second. 213 202 */ 214 - expiry = round_jiffies_up(req->deadline); 203 + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); 215 204 216 205 if (!timer_pending(&q->timeout) || 217 206 time_before(expiry, q->timeout.expires)) {
+4
block/blk.h
··· 9 9 /* Number of requests a "batching" process may submit */ 10 10 #define BLK_BATCH_REQ 32 11 11 12 + /* Max future timer expiry for timeouts */ 13 + #define BLK_MAX_TIMEOUT (5 * HZ) 14 + 12 15 extern struct kmem_cache *blk_requestq_cachep; 13 16 extern struct kmem_cache *request_cachep; 14 17 extern struct kobj_type blk_queue_ktype; ··· 40 37 void blk_rq_timed_out_timer(unsigned long data); 41 38 void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 42 39 unsigned int *next_set); 40 + unsigned long blk_rq_timeout(unsigned long timeout); 43 41 void blk_add_timer(struct request *req); 44 42 void blk_delete_timer(struct request *); 45 43
+7
include/linux/blk-mq.h
··· 48 48 unsigned int numa_node; 49 49 unsigned int cmd_size; /* per-request extra data */ 50 50 51 + atomic_t nr_active; 52 + 51 53 struct blk_mq_cpu_notifier cpu_notifier; 52 54 struct kobject kobj; 53 55 }; ··· 66 64 void *driver_data; 67 65 68 66 struct blk_mq_tags **tags; 67 + 68 + struct mutex tag_list_lock; 69 + struct list_head tag_list; 69 70 }; 70 71 71 72 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); ··· 131 126 132 127 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 133 128 BLK_MQ_F_SHOULD_SORT = 1 << 1, 129 + BLK_MQ_F_TAG_SHARED = 1 << 2, 134 130 135 131 BLK_MQ_S_STOPPED = 0, 132 + BLK_MQ_S_TAG_ACTIVE = 1, 136 133 137 134 BLK_MQ_MAX_DEPTH = 2048, 138 135
+2
include/linux/blk_types.h
··· 190 190 __REQ_PM, /* runtime pm request */ 191 191 __REQ_END, /* last of chain of requests */ 192 192 __REQ_HASHED, /* on IO scheduler merge hash */ 193 + __REQ_MQ_INFLIGHT, /* track inflight for MQ */ 193 194 __REQ_NR_BITS, /* stops here */ 194 195 }; 195 196 ··· 244 243 #define REQ_PM (1ULL << __REQ_PM) 245 244 #define REQ_END (1ULL << __REQ_END) 246 245 #define REQ_HASHED (1ULL << __REQ_HASHED) 246 + #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) 247 247 248 248 #endif /* __LINUX_BLK_TYPES_H */
+3
include/linux/blkdev.h
··· 481 481 wait_queue_head_t mq_freeze_wq; 482 482 struct percpu_counter mq_usage_counter; 483 483 struct list_head all_q_node; 484 + 485 + struct blk_mq_tag_set *tag_set; 486 + struct list_head tag_set_list; 484 487 }; 485 488 486 489 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */