Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

blk-mq: improve support for shared tags maps

This adds support for active queue tracking, meaning that the
blk-mq tagging maintains a count of active users of a tag set.
This allows us to maintain a notion of fairness between users,
so that we can distribute the tag depth evenly, without starving
some users while others claim unfairly deep queues.

If sharing of a tag set is detected, each hardware queue will
track the depth of its own queue. If that depth exceeds the total
depth divided by the number of active queues, the user is actively
throttled down.

The active queue count is maintained lazily to avoid bouncing that
data between submitter and completer. Each hardware queue gets marked
active when it allocates its first tag, and gets marked inactive
when 1) the last tag is cleared, and 2) the queue timeout grace
period has passed.

Signed-off-by: Jens Axboe <axboe@fb.com>

+236 -27
+10
block/blk-mq-sysfs.c
··· 208 208 return blk_mq_tag_sysfs_show(hctx->tags, page); 209 209 } 210 210 211 + static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) 212 + { 213 + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); 214 + } 215 + 211 216 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) 212 217 { 213 218 unsigned int i, first = 1; ··· 272 267 .attr = {.name = "dispatched", .mode = S_IRUGO }, 273 268 .show = blk_mq_hw_sysfs_dispatched_show, 274 269 }; 270 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { 271 + .attr = {.name = "active", .mode = S_IRUGO }, 272 + .show = blk_mq_hw_sysfs_active_show, 273 + }; 275 274 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 276 275 .attr = {.name = "pending", .mode = S_IRUGO }, 277 276 .show = blk_mq_hw_sysfs_rq_list_show, ··· 296 287 &blk_mq_hw_sysfs_pending.attr, 297 288 &blk_mq_hw_sysfs_tags.attr, 298 289 &blk_mq_hw_sysfs_cpus.attr, 290 + &blk_mq_hw_sysfs_active.attr, 299 291 NULL, 300 292 }; 301 293
+95 -17
block/blk-mq-tag.c
··· 7 7 #include "blk-mq.h" 8 8 #include "blk-mq-tag.h" 9 9 10 - void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, 11 - bool reserved) 10 + void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved) 12 11 { 13 12 int tag, zero = 0; 14 13 15 - tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved); 16 - blk_mq_put_tag(tags, tag, &zero); 14 + tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved); 15 + blk_mq_put_tag(hctx, tag, &zero); 17 16 } 18 17 19 18 static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) ··· 37 38 return true; 38 39 39 40 return bt_has_free_tags(&tags->bitmap_tags); 41 + } 42 + 43 + static inline void bt_index_inc(unsigned int *index) 44 + { 45 + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); 46 + } 47 + 48 + /* 49 + * If a previously inactive queue goes active, bump the active user count. 50 + */ 51 + bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 52 + { 53 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && 54 + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 55 + atomic_inc(&hctx->tags->active_queues); 56 + 57 + return true; 58 + } 59 + 60 + /* 61 + * If a previously busy queue goes inactive, potential waiters could now 62 + * be allowed to queue. Wake them up and check. 
63 + */ 64 + void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) 65 + { 66 + struct blk_mq_tags *tags = hctx->tags; 67 + struct blk_mq_bitmap_tags *bt; 68 + int i, wake_index; 69 + 70 + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 71 + return; 72 + 73 + atomic_dec(&tags->active_queues); 74 + 75 + /* 76 + * Will only throttle depth on non-reserved tags 77 + */ 78 + bt = &tags->bitmap_tags; 79 + wake_index = bt->wake_index; 80 + for (i = 0; i < BT_WAIT_QUEUES; i++) { 81 + struct bt_wait_state *bs = &bt->bs[wake_index]; 82 + 83 + if (waitqueue_active(&bs->wait)) 84 + wake_up(&bs->wait); 85 + 86 + bt_index_inc(&wake_index); 87 + } 88 + } 89 + 90 + /* 91 + * For shared tag users, we track the number of currently active users 92 + * and attempt to provide a fair share of the tag depth for each of them. 93 + */ 94 + static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, 95 + struct blk_mq_bitmap_tags *bt) 96 + { 97 + unsigned int depth, users; 98 + 99 + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) 100 + return true; 101 + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) 102 + return true; 103 + 104 + /* 105 + * Don't try dividing an ant 106 + */ 107 + if (bt->depth == 1) 108 + return true; 109 + 110 + users = atomic_read(&hctx->tags->active_queues); 111 + if (!users) 112 + return true; 113 + 114 + /* 115 + * Allow at least some tags 116 + */ 117 + depth = max((bt->depth + users - 1) / users, 4U); 118 + return atomic_read(&hctx->nr_active) < depth; 40 119 } 41 120 42 121 static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag) ··· 155 78 * multiple users will tend to stick to different cachelines, at least 156 79 * until the map is exhausted. 
157 80 */ 158 - static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache) 81 + static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, 82 + unsigned int *tag_cache) 159 83 { 160 84 unsigned int last_tag, org_last_tag; 161 85 int index, i, tag; 86 + 87 + if (!hctx_may_queue(hctx, bt)) 88 + return -1; 162 89 163 90 last_tag = org_last_tag = *tag_cache; 164 91 index = TAG_TO_INDEX(bt, last_tag); ··· 198 117 return tag; 199 118 } 200 119 201 - static inline void bt_index_inc(unsigned int *index) 202 - { 203 - *index = (*index + 1) & (BT_WAIT_QUEUES - 1); 204 - } 205 - 206 120 static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, 207 121 struct blk_mq_hw_ctx *hctx) 208 122 { ··· 218 142 DEFINE_WAIT(wait); 219 143 int tag; 220 144 221 - tag = __bt_get(bt, last_tag); 145 + tag = __bt_get(hctx, bt, last_tag); 222 146 if (tag != -1) 223 147 return tag; 224 148 ··· 232 156 was_empty = list_empty(&wait.task_list); 233 157 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); 234 158 235 - tag = __bt_get(bt, last_tag); 159 + tag = __bt_get(hctx, bt, last_tag); 236 160 if (tag != -1) 237 161 break; 238 162 ··· 276 200 return tag; 277 201 } 278 202 279 - unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, 280 - struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, 203 + unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, 281 204 gfp_t gfp, bool reserved) 282 205 { 283 206 if (!reserved) 284 - return __blk_mq_get_tag(tags, hctx, last_tag, gfp); 207 + return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); 285 208 286 - return __blk_mq_get_reserved_tag(tags, gfp); 209 + return __blk_mq_get_reserved_tag(hctx->tags, gfp); 287 210 } 288 211 289 212 static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) ··· 340 265 bt_clear_tag(&tags->breserved_tags, tag); 341 266 } 342 267 343 - void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, 268 + void blk_mq_put_tag(struct 
blk_mq_hw_ctx *hctx, unsigned int tag, 344 269 unsigned int *last_tag) 345 270 { 271 + struct blk_mq_tags *tags = hctx->tags; 272 + 346 273 if (tag >= tags->nr_reserved_tags) { 347 274 const int real_tag = tag - tags->nr_reserved_tags; 348 275 ··· 542 465 res = bt_unused_tags(&tags->breserved_tags); 543 466 544 467 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); 468 + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); 545 469 546 470 return page - orig_page; 547 471 }
+24 -3
block/blk-mq-tag.h
··· 38 38 unsigned int nr_tags; 39 39 unsigned int nr_reserved_tags; 40 40 41 + atomic_t active_queues; 42 + 41 43 struct blk_mq_bitmap_tags bitmap_tags; 42 44 struct blk_mq_bitmap_tags breserved_tags; 43 45 ··· 51 49 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 52 50 extern void blk_mq_free_tags(struct blk_mq_tags *tags); 53 51 54 - extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); 55 - extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved); 56 - extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag); 52 + extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); 53 + extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved); 54 + extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); 57 55 extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); 58 56 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 59 57 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); ··· 69 67 BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, 70 68 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 71 69 }; 70 + 71 + extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); 72 + extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); 73 + 74 + static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 75 + { 76 + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) 77 + return false; 78 + 79 + return __blk_mq_tag_busy(hctx); 80 + } 81 + 82 + static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) 83 + { 84 + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) 85 + return; 86 + 87 + __blk_mq_tag_idle(hctx); 88 + } 72 89 73 90 #endif
+79 -6
block/blk-mq.c
··· 80 80 struct request *rq; 81 81 unsigned int tag; 82 82 83 - tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved); 83 + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); 84 84 if (tag != BLK_MQ_TAG_FAIL) { 85 85 rq = hctx->tags->rqs[tag]; 86 + 87 + rq->cmd_flags = 0; 88 + if (blk_mq_tag_busy(hctx)) { 89 + rq->cmd_flags = REQ_MQ_INFLIGHT; 90 + atomic_inc(&hctx->nr_active); 91 + } 92 + 86 93 rq->tag = tag; 87 94 return rq; 88 95 } ··· 197 190 /* csd/requeue_work/fifo_time is initialized before use */ 198 191 rq->q = q; 199 192 rq->mq_ctx = ctx; 200 - rq->cmd_flags = rw_flags; 193 + rq->cmd_flags |= rw_flags; 201 194 rq->cmd_type = 0; 202 195 /* do not touch atomic flags, it needs atomic ops against the timer */ 203 196 rq->cpu = -1; ··· 269 262 break; 270 263 } 271 264 272 - blk_mq_wait_for_tags(hctx->tags, hctx, reserved); 265 + blk_mq_wait_for_tags(hctx, reserved); 273 266 } while (1); 274 267 275 268 return rq; ··· 310 303 const int tag = rq->tag; 311 304 struct request_queue *q = rq->q; 312 305 306 + if (rq->cmd_flags & REQ_MQ_INFLIGHT) 307 + atomic_dec(&hctx->nr_active); 308 + 313 309 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 314 - blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag); 310 + blk_mq_put_tag(hctx, tag, &ctx->last_tag); 315 311 blk_mq_queue_exit(q); 316 312 } 317 313 ··· 581 571 queue_for_each_hw_ctx(q, hctx, i) 582 572 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 583 573 584 - if (next_set) 585 - mod_timer(&q->timeout, round_jiffies_up(next)); 574 + if (next_set) { 575 + next = blk_rq_timeout(round_jiffies_up(next)); 576 + mod_timer(&q->timeout, next); 577 + } else { 578 + queue_for_each_hw_ctx(q, hctx, i) 579 + blk_mq_tag_idle(hctx); 580 + } 586 581 } 587 582 588 583 /* ··· 1454 1439 } 1455 1440 } 1456 1441 1442 + static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) 1443 + { 1444 + struct blk_mq_hw_ctx *hctx; 1445 + struct request_queue *q; 1446 + bool shared; 1447 + int i; 1448 + 1449 + if 
(set->tag_list.next == set->tag_list.prev) 1450 + shared = false; 1451 + else 1452 + shared = true; 1453 + 1454 + list_for_each_entry(q, &set->tag_list, tag_set_list) { 1455 + blk_mq_freeze_queue(q); 1456 + 1457 + queue_for_each_hw_ctx(q, hctx, i) { 1458 + if (shared) 1459 + hctx->flags |= BLK_MQ_F_TAG_SHARED; 1460 + else 1461 + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1462 + } 1463 + blk_mq_unfreeze_queue(q); 1464 + } 1465 + } 1466 + 1467 + static void blk_mq_del_queue_tag_set(struct request_queue *q) 1468 + { 1469 + struct blk_mq_tag_set *set = q->tag_set; 1470 + 1471 + blk_mq_freeze_queue(q); 1472 + 1473 + mutex_lock(&set->tag_list_lock); 1474 + list_del_init(&q->tag_set_list); 1475 + blk_mq_update_tag_set_depth(set); 1476 + mutex_unlock(&set->tag_list_lock); 1477 + 1478 + blk_mq_unfreeze_queue(q); 1479 + } 1480 + 1481 + static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1482 + struct request_queue *q) 1483 + { 1484 + q->tag_set = set; 1485 + 1486 + mutex_lock(&set->tag_list_lock); 1487 + list_add_tail(&q->tag_set_list, &set->tag_list); 1488 + blk_mq_update_tag_set_depth(set); 1489 + mutex_unlock(&set->tag_list_lock); 1490 + } 1491 + 1457 1492 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1458 1493 { 1459 1494 struct blk_mq_hw_ctx **hctxs; ··· 1529 1464 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) 1530 1465 goto err_hctxs; 1531 1466 1467 + atomic_set(&hctxs[i]->nr_active, 0); 1532 1468 hctxs[i]->numa_node = NUMA_NO_NODE; 1533 1469 hctxs[i]->queue_num = i; 1534 1470 } ··· 1582 1516 list_add_tail(&q->all_q_node, &all_q_list); 1583 1517 mutex_unlock(&all_q_mutex); 1584 1518 1519 + blk_mq_add_queue_tag_set(set, q); 1520 + 1585 1521 return q; 1586 1522 1587 1523 err_flush_rq: ··· 1610 1542 { 1611 1543 struct blk_mq_hw_ctx *hctx; 1612 1544 int i; 1545 + 1546 + blk_mq_del_queue_tag_set(q); 1613 1547 1614 1548 queue_for_each_hw_ctx(q, hctx, i) { 1615 1549 kfree(hctx->ctx_map); ··· 1704 1634 if (!set->tags[i]) 1705 1635 goto 
out_unwind; 1706 1636 } 1637 + 1638 + mutex_init(&set->tag_list_lock); 1639 + INIT_LIST_HEAD(&set->tag_list); 1707 1640 1708 1641 return 0; 1709 1642
+12 -1
block/blk-timeout.c
··· 166 166 } 167 167 EXPORT_SYMBOL_GPL(blk_abort_request); 168 168 169 + unsigned long blk_rq_timeout(unsigned long timeout) 170 + { 171 + unsigned long maxt; 172 + 173 + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); 174 + if (time_after(timeout, maxt)) 175 + timeout = maxt; 176 + 177 + return timeout; 178 + } 179 + 169 180 /** 170 181 * blk_add_timer - Start timeout timer for a single request 171 182 * @req: request that is about to start running. ··· 211 200 * than an existing one, modify the timer. Round up to next nearest 212 201 * second. 213 202 */ 214 - expiry = round_jiffies_up(req->deadline); 203 + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); 215 204 216 205 if (!timer_pending(&q->timeout) || 217 206 time_before(expiry, q->timeout.expires)) {
+4
block/blk.h
··· 9 9 /* Number of requests a "batching" process may submit */ 10 10 #define BLK_BATCH_REQ 32 11 11 12 + /* Max future timer expiry for timeouts */ 13 + #define BLK_MAX_TIMEOUT (5 * HZ) 14 + 12 15 extern struct kmem_cache *blk_requestq_cachep; 13 16 extern struct kmem_cache *request_cachep; 14 17 extern struct kobj_type blk_queue_ktype; ··· 40 37 void blk_rq_timed_out_timer(unsigned long data); 41 38 void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 42 39 unsigned int *next_set); 40 + unsigned long blk_rq_timeout(unsigned long timeout); 43 41 void blk_add_timer(struct request *req); 44 42 void blk_delete_timer(struct request *); 45 43
+7
include/linux/blk-mq.h
··· 48 48 unsigned int numa_node; 49 49 unsigned int cmd_size; /* per-request extra data */ 50 50 51 + atomic_t nr_active; 52 + 51 53 struct blk_mq_cpu_notifier cpu_notifier; 52 54 struct kobject kobj; 53 55 }; ··· 66 64 void *driver_data; 67 65 68 66 struct blk_mq_tags **tags; 67 + 68 + struct mutex tag_list_lock; 69 + struct list_head tag_list; 69 70 }; 70 71 71 72 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); ··· 131 126 132 127 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 133 128 BLK_MQ_F_SHOULD_SORT = 1 << 1, 129 + BLK_MQ_F_TAG_SHARED = 1 << 2, 134 130 135 131 BLK_MQ_S_STOPPED = 0, 132 + BLK_MQ_S_TAG_ACTIVE = 1, 136 133 137 134 BLK_MQ_MAX_DEPTH = 2048, 138 135
+2
include/linux/blk_types.h
··· 190 190 __REQ_PM, /* runtime pm request */ 191 191 __REQ_END, /* last of chain of requests */ 192 192 __REQ_HASHED, /* on IO scheduler merge hash */ 193 + __REQ_MQ_INFLIGHT, /* track inflight for MQ */ 193 194 __REQ_NR_BITS, /* stops here */ 194 195 }; 195 196 ··· 244 243 #define REQ_PM (1ULL << __REQ_PM) 245 244 #define REQ_END (1ULL << __REQ_END) 246 245 #define REQ_HASHED (1ULL << __REQ_HASHED) 246 + #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) 247 247 248 248 #endif /* __LINUX_BLK_TYPES_H */
+3
include/linux/blkdev.h
··· 481 481 wait_queue_head_t mq_freeze_wq; 482 482 struct percpu_counter mq_usage_counter; 483 483 struct list_head all_q_node; 484 + 485 + struct blk_mq_tag_set *tag_set; 486 + struct list_head tag_set_list; 484 487 }; 485 488 486 489 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */