
blk-mq: new multi-queue block IO queueing mechanism

Linux currently has two models for block devices:

- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.

- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.

With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS were rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.

The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and with that we get back all the problems that the
shared approach had solved.

This commit introduces blk-mq, block multi-queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into some number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
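
For the common case where thread siblings need no special handling,
the per-cpu software queues are simply spread evenly over the
hardware queues. The helper below is lifted from the
block/blk-mq-cpumap.c hunk further down, with a worked example added
in the comment:

        /* From block/blk-mq-cpumap.c: which hardware queue serves a given CPU. */
        static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
                                      const int cpu)
        {
                /*
                 * Divide the CPUs into nr_queues contiguous groups. E.g. with
                 * nr_cpus = 8 and nr_queues = 2: CPUs 0-3 -> queue 0, CPUs 4-7
                 * -> queue 1. With nr_queues >= nr_cpus this degenerates to a
                 * 1:1 mapping.
                 */
                return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
        }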

blk-mq provides various helper functions, which include:

- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.

- Timeout handling without tracking requests on a per-device
basis. Basically, the driver should be able to get a notification
if a request happens to fail.

- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.

- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver-private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it (a rough driver-side
sketch follows this list).

- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
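
As a rough driver-side sketch of how the per-request payload and
tagging come together in a ->queue_rq() handler (return codes and
helpers as introduced by this patch; the command layout and the
my_hw_issue() doorbell stub are purely illustrative):

        #include <linux/blkdev.h>
        #include <linux/blk-mq.h>

        /*
         * Illustrative driver-private command; its size is what the driver
         * passes to blk-mq as the per-request cmd_size at init time.
         */
        struct my_cmd {
                u32 hw_tag;
                u64 lba;
        };

        /*
         * Placeholder for ringing the hardware doorbell; returns false if
         * the device cannot take more work right now.
         */
        static bool my_hw_issue(struct my_cmd *cmd, struct request *rq)
        {
                return true;
        }

        static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
        {
                /* blk-mq reserved cmd_size bytes of per-request memory for us */
                struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);

                cmd->hw_tag = rq->tag;          /* tag handed out by blk-mq's allocator */
                cmd->lba = blk_rq_pos(rq);

                if (!my_hw_issue(cmd, rq))
                        return BLK_MQ_RQ_QUEUE_BUSY;    /* blk-mq requeues and retries */

                /* on completion the driver later calls blk_mq_end_io(rq, rq->errors) */
                return BLK_MQ_RQ_QUEUE_OK;
        }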

For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked models. That would get us back to having just one real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
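
Because the classic entry points now branch on q->mq_ops (see the
blk-core.c and blk-exec.c hunks below), existing submitters keep
working unchanged on either model. A hedged sketch of such a caller;
the function name and the setup details are illustrative only:

        #include <linux/blkdev.h>
        #include <linux/blk-mq.h>

        /*
         * Allocate, execute and release a request. On a blk-mq queue,
         * blk_get_request()/blk_put_request() route to blk_mq_alloc_request()/
         * blk_mq_free_request(), and blk_execute_rq() (via its _nowait variant)
         * inserts the request with blk_mq_insert_request() instead of the
         * legacy queue_head list.
         */
        static int my_issue_simple(struct request_queue *q, struct gendisk *disk)
        {
                struct request *rq;
                int err;

                rq = blk_get_request(q, READ, GFP_KERNEL);
                if (!rq)
                        return -ENOMEM;

                /* ... set up cmd_type, data and timeout as the driver requires ... */

                err = blk_execute_rq(q, disk, rq, 0);   /* waits for completion */

                blk_put_request(rq);
                return err;
        }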

Contributions in this patch from the following people:

Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

+2890 -109
+3 -2
block/Makefile
··· 5 5 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 6 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 7 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 8 - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ 9 - partition-generic.o partitions/ 8 + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ 9 + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ 10 + genhd.o scsi_ioctl.o partition-generic.o partitions/ 10 11 11 12 obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 12 13 obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
+84 -58
block/blk-core.c
··· 16 16 #include <linux/backing-dev.h> 17 17 #include <linux/bio.h> 18 18 #include <linux/blkdev.h> 19 + #include <linux/blk-mq.h> 19 20 #include <linux/highmem.h> 20 21 #include <linux/mm.h> 21 22 #include <linux/kernel_stat.h> ··· 49 48 /* 50 49 * For the allocated request tables 51 50 */ 52 - static struct kmem_cache *request_cachep; 51 + struct kmem_cache *request_cachep = NULL; 53 52 54 53 /* 55 54 * For queue allocation ··· 60 59 * Controlling structure to kblockd 61 60 */ 62 61 static struct workqueue_struct *kblockd_workqueue; 63 - 64 - static void drive_stat_acct(struct request *rq, int new_io) 65 - { 66 - struct hd_struct *part; 67 - int rw = rq_data_dir(rq); 68 - int cpu; 69 - 70 - if (!blk_do_io_stat(rq)) 71 - return; 72 - 73 - cpu = part_stat_lock(); 74 - 75 - if (!new_io) { 76 - part = rq->part; 77 - part_stat_inc(cpu, part, merges[rw]); 78 - } else { 79 - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 80 - if (!hd_struct_try_get(part)) { 81 - /* 82 - * The partition is already being removed, 83 - * the request will be accounted on the disk only 84 - * 85 - * We take a reference on disk->part0 although that 86 - * partition will never be deleted, so we can treat 87 - * it as any other partition. 88 - */ 89 - part = &rq->rq_disk->part0; 90 - hd_struct_get(part); 91 - } 92 - part_round_stats(cpu, part); 93 - part_inc_in_flight(part, rw); 94 - rq->part = part; 95 - } 96 - 97 - part_stat_unlock(); 98 - } 99 62 100 63 void blk_queue_congestion_threshold(struct request_queue *q) 101 64 { ··· 559 594 if (!q) 560 595 return NULL; 561 596 597 + if (percpu_counter_init(&q->mq_usage_counter, 0)) 598 + goto fail_q; 599 + 562 600 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 563 601 if (q->id < 0) 564 - goto fail_q; 602 + goto fail_c; 565 603 566 604 q->backing_dev_info.ra_pages = 567 605 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; ··· 611 643 q->bypass_depth = 1; 612 644 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); 613 645 646 + init_waitqueue_head(&q->mq_freeze_wq); 647 + 614 648 if (blkcg_init_queue(q)) 615 649 goto fail_id; 616 650 ··· 620 650 621 651 fail_id: 622 652 ida_simple_remove(&blk_queue_ida, q->id); 653 + fail_c: 654 + percpu_counter_destroy(&q->mq_usage_counter); 623 655 fail_q: 624 656 kmem_cache_free(blk_requestq_cachep, q); 625 657 return NULL; ··· 1080 1108 goto retry; 1081 1109 } 1082 1110 1083 - struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1111 + static struct request *blk_old_get_request(struct request_queue *q, int rw, 1112 + gfp_t gfp_mask) 1084 1113 { 1085 1114 struct request *rq; 1086 1115 ··· 1097 1124 /* q->queue_lock is unlocked at this point */ 1098 1125 1099 1126 return rq; 1127 + } 1128 + 1129 + struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1130 + { 1131 + if (q->mq_ops) 1132 + return blk_mq_alloc_request(q, rw, gfp_mask); 1133 + else 1134 + return blk_old_get_request(q, rw, gfp_mask); 1100 1135 } 1101 1136 EXPORT_SYMBOL(blk_get_request); 1102 1137 ··· 1191 1210 static void add_acct_request(struct request_queue *q, struct request *rq, 1192 1211 int where) 1193 1212 { 1194 - drive_stat_acct(rq, 1); 1213 + blk_account_io_start(rq, true); 1195 1214 __elv_add_request(q, rq, where); 1196 1215 } 1197 1216 ··· 1280 1299 1281 1300 void blk_put_request(struct request *req) 1282 1301 { 1283 - unsigned long flags; 1284 1302 struct request_queue *q = req->q; 1285 1303 1286 - spin_lock_irqsave(q->queue_lock, flags); 1287 - __blk_put_request(q, req); 1288 - 
spin_unlock_irqrestore(q->queue_lock, flags); 1304 + if (q->mq_ops) 1305 + blk_mq_free_request(req); 1306 + else { 1307 + unsigned long flags; 1308 + 1309 + spin_lock_irqsave(q->queue_lock, flags); 1310 + __blk_put_request(q, req); 1311 + spin_unlock_irqrestore(q->queue_lock, flags); 1312 + } 1289 1313 } 1290 1314 EXPORT_SYMBOL(blk_put_request); 1291 1315 ··· 1326 1340 } 1327 1341 EXPORT_SYMBOL_GPL(blk_add_request_payload); 1328 1342 1329 - static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1330 - struct bio *bio) 1343 + bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1344 + struct bio *bio) 1331 1345 { 1332 1346 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1333 1347 ··· 1344 1358 req->__data_len += bio->bi_size; 1345 1359 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1346 1360 1347 - drive_stat_acct(req, 0); 1361 + blk_account_io_start(req, false); 1348 1362 return true; 1349 1363 } 1350 1364 1351 - static bool bio_attempt_front_merge(struct request_queue *q, 1352 - struct request *req, struct bio *bio) 1365 + bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 1366 + struct bio *bio) 1353 1367 { 1354 1368 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1355 1369 ··· 1374 1388 req->__data_len += bio->bi_size; 1375 1389 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1376 1390 1377 - drive_stat_acct(req, 0); 1391 + blk_account_io_start(req, false); 1378 1392 return true; 1379 1393 } 1380 1394 1381 1395 /** 1382 - * attempt_plug_merge - try to merge with %current's plugged list 1396 + * blk_attempt_plug_merge - try to merge with %current's plugged list 1383 1397 * @q: request_queue new bio is being queued at 1384 1398 * @bio: new bio being queued 1385 1399 * @request_count: out parameter for number of traversed plugged requests ··· 1395 1409 * reliable access to the elevator outside queue lock. Only check basic 1396 1410 * merging parameters without querying the elevator. 1397 1411 */ 1398 - static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1399 - unsigned int *request_count) 1412 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1413 + unsigned int *request_count) 1400 1414 { 1401 1415 struct blk_plug *plug; 1402 1416 struct request *rq; ··· 1475 1489 * Check if we can merge with the plugged list before grabbing 1476 1490 * any locks. 1477 1491 */ 1478 - if (attempt_plug_merge(q, bio, &request_count)) 1492 + if (blk_attempt_plug_merge(q, bio, &request_count)) 1479 1493 return; 1480 1494 1481 1495 spin_lock_irq(q->queue_lock); ··· 1543 1557 } 1544 1558 } 1545 1559 list_add_tail(&req->queuelist, &plug->list); 1546 - drive_stat_acct(req, 1); 1560 + blk_account_io_start(req, true); 1547 1561 } else { 1548 1562 spin_lock_irq(q->queue_lock); 1549 1563 add_acct_request(q, req, where); ··· 1997 2011 } 1998 2012 EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 1999 2013 2000 - static void blk_account_io_completion(struct request *req, unsigned int bytes) 2014 + void blk_account_io_completion(struct request *req, unsigned int bytes) 2001 2015 { 2002 2016 if (blk_do_io_stat(req)) { 2003 2017 const int rw = rq_data_dir(req); ··· 2011 2025 } 2012 2026 } 2013 2027 2014 - static void blk_account_io_done(struct request *req) 2028 + void blk_account_io_done(struct request *req) 2015 2029 { 2016 2030 /* 2017 2031 * Account IO completion. 
flush_rq isn't accounted as a ··· 2058 2072 return rq; 2059 2073 } 2060 2074 #endif 2075 + 2076 + void blk_account_io_start(struct request *rq, bool new_io) 2077 + { 2078 + struct hd_struct *part; 2079 + int rw = rq_data_dir(rq); 2080 + int cpu; 2081 + 2082 + if (!blk_do_io_stat(rq)) 2083 + return; 2084 + 2085 + cpu = part_stat_lock(); 2086 + 2087 + if (!new_io) { 2088 + part = rq->part; 2089 + part_stat_inc(cpu, part, merges[rw]); 2090 + } else { 2091 + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 2092 + if (!hd_struct_try_get(part)) { 2093 + /* 2094 + * The partition is already being removed, 2095 + * the request will be accounted on the disk only 2096 + * 2097 + * We take a reference on disk->part0 although that 2098 + * partition will never be deleted, so we can treat 2099 + * it as any other partition. 2100 + */ 2101 + part = &rq->rq_disk->part0; 2102 + hd_struct_get(part); 2103 + } 2104 + part_round_stats(cpu, part); 2105 + part_inc_in_flight(part, rw); 2106 + rq->part = part; 2107 + } 2108 + 2109 + part_stat_unlock(); 2110 + } 2061 2111 2062 2112 /** 2063 2113 * blk_peek_request - peek at the top of a request queue ··· 2469 2447 2470 2448 if (req->cmd_flags & REQ_DONTPREP) 2471 2449 blk_unprep_request(req); 2472 - 2473 2450 2474 2451 blk_account_io_done(req); 2475 2452 ··· 2891 2870 2892 2871 plug->magic = PLUG_MAGIC; 2893 2872 INIT_LIST_HEAD(&plug->list); 2873 + INIT_LIST_HEAD(&plug->mq_list); 2894 2874 INIT_LIST_HEAD(&plug->cb_list); 2895 2875 2896 2876 /* ··· 2989 2967 BUG_ON(plug->magic != PLUG_MAGIC); 2990 2968 2991 2969 flush_plug_callbacks(plug, from_schedule); 2970 + 2971 + if (!list_empty(&plug->mq_list)) 2972 + blk_mq_flush_plug_list(plug, from_schedule); 2973 + 2992 2974 if (list_empty(&plug->list)) 2993 2975 return; 2994 2976
+7
block/blk-exec.c
··· 5 5 #include <linux/module.h> 6 6 #include <linux/bio.h> 7 7 #include <linux/blkdev.h> 8 + #include <linux/blk-mq.h> 8 9 #include <linux/sched/sysctl.h> 9 10 10 11 #include "blk.h" ··· 59 58 60 59 rq->rq_disk = bd_disk; 61 60 rq->end_io = done; 61 + 62 + if (q->mq_ops) { 63 + blk_mq_insert_request(q, rq, true); 64 + return; 65 + } 66 + 62 67 /* 63 68 * need to check this before __blk_run_queue(), because rq can 64 69 * be freed before that returns.
+139 -15
block/blk-flush.c
··· 69 69 #include <linux/bio.h> 70 70 #include <linux/blkdev.h> 71 71 #include <linux/gfp.h> 72 + #include <linux/blk-mq.h> 72 73 73 74 #include "blk.h" 75 + #include "blk-mq.h" 74 76 75 77 /* FLUSH/FUA sequences */ 76 78 enum { ··· 126 124 /* make @rq a normal request */ 127 125 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 128 126 rq->end_io = rq->flush.saved_end_io; 127 + 128 + blk_clear_rq_complete(rq); 129 + } 130 + 131 + static void mq_flush_data_run(struct work_struct *work) 132 + { 133 + struct request *rq; 134 + 135 + rq = container_of(work, struct request, mq_flush_data); 136 + 137 + memset(&rq->csd, 0, sizeof(rq->csd)); 138 + blk_mq_run_request(rq, true, false); 139 + } 140 + 141 + static void blk_mq_flush_data_insert(struct request *rq) 142 + { 143 + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); 144 + kblockd_schedule_work(rq->q, &rq->mq_flush_data); 129 145 } 130 146 131 147 /** ··· 156 136 * completion and trigger the next step. 157 137 * 158 138 * CONTEXT: 159 - * spin_lock_irq(q->queue_lock) 139 + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) 160 140 * 161 141 * RETURNS: 162 142 * %true if requests were added to the dispatch queue, %false otherwise. ··· 166 146 { 167 147 struct request_queue *q = rq->q; 168 148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; 169 - bool queued = false; 149 + bool queued = false, kicked; 170 150 171 151 BUG_ON(rq->flush.seq & seq); 172 152 rq->flush.seq |= seq; ··· 187 167 188 168 case REQ_FSEQ_DATA: 189 169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 190 - list_add(&rq->queuelist, &q->queue_head); 191 - queued = true; 170 + if (q->mq_ops) 171 + blk_mq_flush_data_insert(rq); 172 + else { 173 + list_add(&rq->queuelist, &q->queue_head); 174 + queued = true; 175 + } 192 176 break; 193 177 194 178 case REQ_FSEQ_DONE: ··· 205 181 BUG_ON(!list_empty(&rq->queuelist)); 206 182 list_del_init(&rq->flush.list); 207 183 blk_flush_restore_request(rq); 208 - __blk_end_request_all(rq, error); 184 + if (q->mq_ops) 185 + blk_mq_end_io(rq, error); 186 + else 187 + __blk_end_request_all(rq, error); 209 188 break; 210 189 211 190 default: 212 191 BUG(); 213 192 } 214 193 215 - return blk_kick_flush(q) | queued; 194 + kicked = blk_kick_flush(q); 195 + /* blk_mq_run_flush will run queue */ 196 + if (q->mq_ops) 197 + return queued; 198 + return kicked | queued; 216 199 } 217 200 218 201 static void flush_end_io(struct request *flush_rq, int error) 219 202 { 220 203 struct request_queue *q = flush_rq->q; 221 - struct list_head *running = &q->flush_queue[q->flush_running_idx]; 204 + struct list_head *running; 222 205 bool queued = false; 223 206 struct request *rq, *n; 207 + unsigned long flags = 0; 224 208 209 + if (q->mq_ops) { 210 + blk_mq_free_request(flush_rq); 211 + spin_lock_irqsave(&q->mq_flush_lock, flags); 212 + } 213 + running = &q->flush_queue[q->flush_running_idx]; 225 214 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 226 215 227 216 /* account completion of the flush request */ 228 217 q->flush_running_idx ^= 1; 229 - elv_completed_request(q, flush_rq); 218 + 219 + if (!q->mq_ops) 220 + elv_completed_request(q, flush_rq); 230 221 231 222 /* and push the waiting requests to the next stage */ 232 223 list_for_each_entry_safe(rq, n, running, flush.list) { ··· 262 223 * directly into request_fn may confuse the driver. Always use 263 224 * kblockd. 
264 225 */ 265 - if (queued || q->flush_queue_delayed) 266 - blk_run_queue_async(q); 226 + if (queued || q->flush_queue_delayed) { 227 + if (!q->mq_ops) 228 + blk_run_queue_async(q); 229 + else 230 + /* 231 + * This can be optimized to only run queues with requests 232 + * queued if necessary. 233 + */ 234 + blk_mq_run_queues(q, true); 235 + } 267 236 q->flush_queue_delayed = 0; 237 + if (q->mq_ops) 238 + spin_unlock_irqrestore(&q->mq_flush_lock, flags); 239 + } 240 + 241 + static void mq_flush_work(struct work_struct *work) 242 + { 243 + struct request_queue *q; 244 + struct request *rq; 245 + 246 + q = container_of(work, struct request_queue, mq_flush_work); 247 + 248 + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ 249 + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, 250 + __GFP_WAIT|GFP_ATOMIC); 251 + rq->cmd_type = REQ_TYPE_FS; 252 + rq->end_io = flush_end_io; 253 + 254 + blk_mq_run_request(rq, true, false); 255 + } 256 + 257 + /* 258 + * We can't directly use q->flush_rq, because it doesn't have tag and is not in 259 + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, 260 + * so offload the work to workqueue. 261 + * 262 + * Note: we assume a flush request finished in any hardware queue will flush 263 + * the whole disk cache. 264 + */ 265 + static void mq_run_flush(struct request_queue *q) 266 + { 267 + kblockd_schedule_work(q, &q->mq_flush_work); 268 268 } 269 269 270 270 /** ··· 314 236 * Please read the comment at the top of this file for more info. 315 237 * 316 238 * CONTEXT: 317 - * spin_lock_irq(q->queue_lock) 239 + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) 318 240 * 319 241 * RETURNS: 320 242 * %true if flush was issued, %false otherwise. ··· 339 261 * Issue flush and toggle pending_idx. This makes pending_idx 340 262 * different from running_idx, which means flush is in flight. 341 263 */ 264 + q->flush_pending_idx ^= 1; 265 + if (q->mq_ops) { 266 + mq_run_flush(q); 267 + return true; 268 + } 269 + 342 270 blk_rq_init(q, &q->flush_rq); 343 271 q->flush_rq.cmd_type = REQ_TYPE_FS; 344 272 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 345 273 q->flush_rq.rq_disk = first_rq->rq_disk; 346 274 q->flush_rq.end_io = flush_end_io; 347 275 348 - q->flush_pending_idx ^= 1; 349 276 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 350 277 return true; 351 278 } ··· 367 284 blk_run_queue_async(q); 368 285 } 369 286 287 + static void mq_flush_data_end_io(struct request *rq, int error) 288 + { 289 + struct request_queue *q = rq->q; 290 + struct blk_mq_hw_ctx *hctx; 291 + struct blk_mq_ctx *ctx; 292 + unsigned long flags; 293 + 294 + ctx = rq->mq_ctx; 295 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 296 + 297 + /* 298 + * After populating an empty queue, kick it to avoid stall. Read 299 + * the comment in flush_end_io(). 300 + */ 301 + spin_lock_irqsave(&q->mq_flush_lock, flags); 302 + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) 303 + blk_mq_run_hw_queue(hctx, true); 304 + spin_unlock_irqrestore(&q->mq_flush_lock, flags); 305 + } 306 + 370 307 /** 371 308 * blk_insert_flush - insert a new FLUSH/FUA request 372 309 * @rq: request to insert 373 310 * 374 311 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. 312 + * or __blk_mq_run_hw_queue() to dispatch request. 375 313 * @rq is being submitted. Analyze what needs to be done and put it on the 376 314 * right queue. 
377 315 * 378 316 * CONTEXT: 379 - * spin_lock_irq(q->queue_lock) 317 + * spin_lock_irq(q->queue_lock) in !mq case 380 318 */ 381 319 void blk_insert_flush(struct request *rq) 382 320 { ··· 420 316 * complete the request. 421 317 */ 422 318 if (!policy) { 423 - __blk_end_bidi_request(rq, 0, 0, 0); 319 + if (q->mq_ops) 320 + blk_mq_end_io(rq, 0); 321 + else 322 + __blk_end_bidi_request(rq, 0, 0, 0); 424 323 return; 425 324 } 426 325 ··· 436 329 */ 437 330 if ((policy & REQ_FSEQ_DATA) && 438 331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 439 - list_add_tail(&rq->queuelist, &q->queue_head); 332 + if (q->mq_ops) { 333 + blk_mq_run_request(rq, false, true); 334 + } else 335 + list_add_tail(&rq->queuelist, &q->queue_head); 440 336 return; 441 337 } 442 338 ··· 451 341 INIT_LIST_HEAD(&rq->flush.list); 452 342 rq->cmd_flags |= REQ_FLUSH_SEQ; 453 343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 344 + if (q->mq_ops) { 345 + rq->end_io = mq_flush_data_end_io; 346 + 347 + spin_lock_irq(&q->mq_flush_lock); 348 + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 349 + spin_unlock_irq(&q->mq_flush_lock); 350 + return; 351 + } 454 352 rq->end_io = flush_data_end_io; 455 353 456 354 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); ··· 571 453 return ret; 572 454 } 573 455 EXPORT_SYMBOL(blkdev_issue_flush); 456 + 457 + void blk_mq_init_flush(struct request_queue *q) 458 + { 459 + spin_lock_init(&q->mq_flush_lock); 460 + INIT_WORK(&q->mq_flush_work, mq_flush_work); 461 + }
+93
block/blk-mq-cpu.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/init.h> 4 + #include <linux/blkdev.h> 5 + #include <linux/list.h> 6 + #include <linux/llist.h> 7 + #include <linux/smp.h> 8 + #include <linux/cpu.h> 9 + 10 + #include <linux/blk-mq.h> 11 + #include "blk-mq.h" 12 + 13 + static LIST_HEAD(blk_mq_cpu_notify_list); 14 + static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); 15 + 16 + static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, 17 + unsigned long action, void *hcpu) 18 + { 19 + unsigned int cpu = (unsigned long) hcpu; 20 + struct blk_mq_cpu_notifier *notify; 21 + 22 + spin_lock(&blk_mq_cpu_notify_lock); 23 + 24 + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 25 + notify->notify(notify->data, action, cpu); 26 + 27 + spin_unlock(&blk_mq_cpu_notify_lock); 28 + return NOTIFY_OK; 29 + } 30 + 31 + static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, 32 + unsigned int cpu) 33 + { 34 + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 35 + /* 36 + * If the CPU goes away, ensure that we run any pending 37 + * completions. 38 + */ 39 + struct llist_node *node; 40 + struct request *rq; 41 + 42 + local_irq_disable(); 43 + 44 + node = llist_del_all(&per_cpu(ipi_lists, cpu)); 45 + while (node) { 46 + struct llist_node *next = node->next; 47 + 48 + rq = llist_entry(node, struct request, ll_list); 49 + __blk_mq_end_io(rq, rq->errors); 50 + node = next; 51 + } 52 + 53 + local_irq_enable(); 54 + } 55 + } 56 + 57 + static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { 58 + .notifier_call = blk_mq_main_cpu_notify, 59 + }; 60 + 61 + void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 62 + { 63 + BUG_ON(!notifier->notify); 64 + 65 + spin_lock(&blk_mq_cpu_notify_lock); 66 + list_add_tail(&notifier->list, &blk_mq_cpu_notify_list); 67 + spin_unlock(&blk_mq_cpu_notify_lock); 68 + } 69 + 70 + void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 71 + { 72 + spin_lock(&blk_mq_cpu_notify_lock); 73 + list_del(&notifier->list); 74 + spin_unlock(&blk_mq_cpu_notify_lock); 75 + } 76 + 77 + void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 78 + void (*fn)(void *, unsigned long, unsigned int), 79 + void *data) 80 + { 81 + notifier->notify = fn; 82 + notifier->data = data; 83 + } 84 + 85 + static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { 86 + .notify = blk_mq_cpu_notify, 87 + }; 88 + 89 + void __init blk_mq_cpu_init(void) 90 + { 91 + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); 92 + blk_mq_register_cpu_notifier(&cpu_notifier); 93 + }
+108
block/blk-mq-cpumap.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/threads.h> 3 + #include <linux/module.h> 4 + #include <linux/mm.h> 5 + #include <linux/smp.h> 6 + #include <linux/cpu.h> 7 + 8 + #include <linux/blk-mq.h> 9 + #include "blk.h" 10 + #include "blk-mq.h" 11 + 12 + static void show_map(unsigned int *map, unsigned int nr) 13 + { 14 + int i; 15 + 16 + pr_info("blk-mq: CPU -> queue map\n"); 17 + for_each_online_cpu(i) 18 + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); 19 + } 20 + 21 + static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, 22 + const int cpu) 23 + { 24 + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); 25 + } 26 + 27 + static int get_first_sibling(unsigned int cpu) 28 + { 29 + unsigned int ret; 30 + 31 + ret = cpumask_first(topology_thread_cpumask(cpu)); 32 + if (ret < nr_cpu_ids) 33 + return ret; 34 + 35 + return cpu; 36 + } 37 + 38 + int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) 39 + { 40 + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; 41 + cpumask_var_t cpus; 42 + 43 + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) 44 + return 1; 45 + 46 + cpumask_clear(cpus); 47 + nr_cpus = nr_uniq_cpus = 0; 48 + for_each_online_cpu(i) { 49 + nr_cpus++; 50 + first_sibling = get_first_sibling(i); 51 + if (!cpumask_test_cpu(first_sibling, cpus)) 52 + nr_uniq_cpus++; 53 + cpumask_set_cpu(i, cpus); 54 + } 55 + 56 + queue = 0; 57 + for_each_possible_cpu(i) { 58 + if (!cpu_online(i)) { 59 + map[i] = 0; 60 + continue; 61 + } 62 + 63 + /* 64 + * Easy case - we have equal or more hardware queues. Or 65 + * there are no thread siblings to take into account. Do 66 + * 1:1 if enough, or sequential mapping if less. 67 + */ 68 + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { 69 + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); 70 + queue++; 71 + continue; 72 + } 73 + 74 + /* 75 + * Less then nr_cpus queues, and we have some number of 76 + * threads per cores. Map sibling threads to the same 77 + * queue. 78 + */ 79 + first_sibling = get_first_sibling(i); 80 + if (first_sibling == i) { 81 + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, 82 + queue); 83 + queue++; 84 + } else 85 + map[i] = map[first_sibling]; 86 + } 87 + 88 + show_map(map, nr_cpus); 89 + free_cpumask_var(cpus); 90 + return 0; 91 + } 92 + 93 + unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 94 + { 95 + unsigned int *map; 96 + 97 + /* If cpus are offline, map them to first hctx */ 98 + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 99 + reg->numa_node); 100 + if (!map) 101 + return NULL; 102 + 103 + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 104 + return map; 105 + 106 + kfree(map); 107 + return NULL; 108 + }
+384
block/blk-mq-sysfs.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/backing-dev.h> 4 + #include <linux/bio.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/mm.h> 7 + #include <linux/init.h> 8 + #include <linux/slab.h> 9 + #include <linux/workqueue.h> 10 + #include <linux/smp.h> 11 + 12 + #include <linux/blk-mq.h> 13 + #include "blk-mq.h" 14 + #include "blk-mq-tag.h" 15 + 16 + static void blk_mq_sysfs_release(struct kobject *kobj) 17 + { 18 + } 19 + 20 + struct blk_mq_ctx_sysfs_entry { 21 + struct attribute attr; 22 + ssize_t (*show)(struct blk_mq_ctx *, char *); 23 + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); 24 + }; 25 + 26 + struct blk_mq_hw_ctx_sysfs_entry { 27 + struct attribute attr; 28 + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); 29 + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); 30 + }; 31 + 32 + static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, 33 + char *page) 34 + { 35 + struct blk_mq_ctx_sysfs_entry *entry; 36 + struct blk_mq_ctx *ctx; 37 + struct request_queue *q; 38 + ssize_t res; 39 + 40 + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); 41 + ctx = container_of(kobj, struct blk_mq_ctx, kobj); 42 + q = ctx->queue; 43 + 44 + if (!entry->show) 45 + return -EIO; 46 + 47 + res = -ENOENT; 48 + mutex_lock(&q->sysfs_lock); 49 + if (!blk_queue_dying(q)) 50 + res = entry->show(ctx, page); 51 + mutex_unlock(&q->sysfs_lock); 52 + return res; 53 + } 54 + 55 + static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, 56 + const char *page, size_t length) 57 + { 58 + struct blk_mq_ctx_sysfs_entry *entry; 59 + struct blk_mq_ctx *ctx; 60 + struct request_queue *q; 61 + ssize_t res; 62 + 63 + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); 64 + ctx = container_of(kobj, struct blk_mq_ctx, kobj); 65 + q = ctx->queue; 66 + 67 + if (!entry->store) 68 + return -EIO; 69 + 70 + res = -ENOENT; 71 + mutex_lock(&q->sysfs_lock); 72 + if (!blk_queue_dying(q)) 73 + res = entry->store(ctx, page, length); 74 + mutex_unlock(&q->sysfs_lock); 75 + return res; 76 + } 77 + 78 + static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, 79 + struct attribute *attr, char *page) 80 + { 81 + struct blk_mq_hw_ctx_sysfs_entry *entry; 82 + struct blk_mq_hw_ctx *hctx; 83 + struct request_queue *q; 84 + ssize_t res; 85 + 86 + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); 87 + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); 88 + q = hctx->queue; 89 + 90 + if (!entry->show) 91 + return -EIO; 92 + 93 + res = -ENOENT; 94 + mutex_lock(&q->sysfs_lock); 95 + if (!blk_queue_dying(q)) 96 + res = entry->show(hctx, page); 97 + mutex_unlock(&q->sysfs_lock); 98 + return res; 99 + } 100 + 101 + static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, 102 + struct attribute *attr, const char *page, 103 + size_t length) 104 + { 105 + struct blk_mq_hw_ctx_sysfs_entry *entry; 106 + struct blk_mq_hw_ctx *hctx; 107 + struct request_queue *q; 108 + ssize_t res; 109 + 110 + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); 111 + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); 112 + q = hctx->queue; 113 + 114 + if (!entry->store) 115 + return -EIO; 116 + 117 + res = -ENOENT; 118 + mutex_lock(&q->sysfs_lock); 119 + if (!blk_queue_dying(q)) 120 + res = entry->store(hctx, page, length); 121 + mutex_unlock(&q->sysfs_lock); 122 + return res; 123 + } 124 + 125 + static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) 126 + { 127 
+ return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], 128 + ctx->rq_dispatched[0]); 129 + } 130 + 131 + static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) 132 + { 133 + return sprintf(page, "%lu\n", ctx->rq_merged); 134 + } 135 + 136 + static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) 137 + { 138 + return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], 139 + ctx->rq_completed[0]); 140 + } 141 + 142 + static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) 143 + { 144 + char *start_page = page; 145 + struct request *rq; 146 + 147 + page += sprintf(page, "%s:\n", msg); 148 + 149 + list_for_each_entry(rq, list, queuelist) 150 + page += sprintf(page, "\t%p\n", rq); 151 + 152 + return page - start_page; 153 + } 154 + 155 + static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) 156 + { 157 + ssize_t ret; 158 + 159 + spin_lock(&ctx->lock); 160 + ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); 161 + spin_unlock(&ctx->lock); 162 + 163 + return ret; 164 + } 165 + 166 + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, 167 + char *page) 168 + { 169 + return sprintf(page, "%lu\n", hctx->queued); 170 + } 171 + 172 + static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) 173 + { 174 + return sprintf(page, "%lu\n", hctx->run); 175 + } 176 + 177 + static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, 178 + char *page) 179 + { 180 + char *start_page = page; 181 + int i; 182 + 183 + page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); 184 + 185 + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { 186 + unsigned long d = 1U << (i - 1); 187 + 188 + page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); 189 + } 190 + 191 + return page - start_page; 192 + } 193 + 194 + static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, 195 + char *page) 196 + { 197 + ssize_t ret; 198 + 199 + spin_lock(&hctx->lock); 200 + ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); 201 + spin_unlock(&hctx->lock); 202 + 203 + return ret; 204 + } 205 + 206 + static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 207 + { 208 + ssize_t ret; 209 + 210 + spin_lock(&hctx->lock); 211 + ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); 212 + spin_unlock(&hctx->lock); 213 + 214 + return ret; 215 + } 216 + 217 + static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, 218 + const char *page, size_t len) 219 + { 220 + struct blk_mq_ctx *ctx; 221 + unsigned long ret; 222 + unsigned int i; 223 + 224 + if (kstrtoul(page, 10, &ret)) { 225 + pr_err("blk-mq-sysfs: invalid input '%s'\n", page); 226 + return -EINVAL; 227 + } 228 + 229 + spin_lock(&hctx->lock); 230 + if (ret) 231 + hctx->flags |= BLK_MQ_F_SHOULD_IPI; 232 + else 233 + hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; 234 + spin_unlock(&hctx->lock); 235 + 236 + hctx_for_each_ctx(hctx, ctx, i) 237 + ctx->ipi_redirect = !!ret; 238 + 239 + return len; 240 + } 241 + 242 + static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 243 + { 244 + return blk_mq_tag_sysfs_show(hctx->tags, page); 245 + } 246 + 247 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { 248 + .attr = {.name = "dispatched", .mode = S_IRUGO }, 249 + .show = blk_mq_sysfs_dispatched_show, 250 + }; 251 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { 252 + .attr = {.name = "merged", .mode = S_IRUGO }, 253 + .show = 
blk_mq_sysfs_merged_show, 254 + }; 255 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { 256 + .attr = {.name = "completed", .mode = S_IRUGO }, 257 + .show = blk_mq_sysfs_completed_show, 258 + }; 259 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { 260 + .attr = {.name = "rq_list", .mode = S_IRUGO }, 261 + .show = blk_mq_sysfs_rq_list_show, 262 + }; 263 + 264 + static struct attribute *default_ctx_attrs[] = { 265 + &blk_mq_sysfs_dispatched.attr, 266 + &blk_mq_sysfs_merged.attr, 267 + &blk_mq_sysfs_completed.attr, 268 + &blk_mq_sysfs_rq_list.attr, 269 + NULL, 270 + }; 271 + 272 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { 273 + .attr = {.name = "queued", .mode = S_IRUGO }, 274 + .show = blk_mq_hw_sysfs_queued_show, 275 + }; 276 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { 277 + .attr = {.name = "run", .mode = S_IRUGO }, 278 + .show = blk_mq_hw_sysfs_run_show, 279 + }; 280 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { 281 + .attr = {.name = "dispatched", .mode = S_IRUGO }, 282 + .show = blk_mq_hw_sysfs_dispatched_show, 283 + }; 284 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 285 + .attr = {.name = "pending", .mode = S_IRUGO }, 286 + .show = blk_mq_hw_sysfs_rq_list_show, 287 + }; 288 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { 289 + .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, 290 + .show = blk_mq_hw_sysfs_ipi_show, 291 + .store = blk_mq_hw_sysfs_ipi_store, 292 + }; 293 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 294 + .attr = {.name = "tags", .mode = S_IRUGO }, 295 + .show = blk_mq_hw_sysfs_tags_show, 296 + }; 297 + 298 + static struct attribute *default_hw_ctx_attrs[] = { 299 + &blk_mq_hw_sysfs_queued.attr, 300 + &blk_mq_hw_sysfs_run.attr, 301 + &blk_mq_hw_sysfs_dispatched.attr, 302 + &blk_mq_hw_sysfs_pending.attr, 303 + &blk_mq_hw_sysfs_ipi.attr, 304 + &blk_mq_hw_sysfs_tags.attr, 305 + NULL, 306 + }; 307 + 308 + static const struct sysfs_ops blk_mq_sysfs_ops = { 309 + .show = blk_mq_sysfs_show, 310 + .store = blk_mq_sysfs_store, 311 + }; 312 + 313 + static const struct sysfs_ops blk_mq_hw_sysfs_ops = { 314 + .show = blk_mq_hw_sysfs_show, 315 + .store = blk_mq_hw_sysfs_store, 316 + }; 317 + 318 + static struct kobj_type blk_mq_ktype = { 319 + .sysfs_ops = &blk_mq_sysfs_ops, 320 + .release = blk_mq_sysfs_release, 321 + }; 322 + 323 + static struct kobj_type blk_mq_ctx_ktype = { 324 + .sysfs_ops = &blk_mq_sysfs_ops, 325 + .default_attrs = default_ctx_attrs, 326 + .release = blk_mq_sysfs_release, 327 + }; 328 + 329 + static struct kobj_type blk_mq_hw_ktype = { 330 + .sysfs_ops = &blk_mq_hw_sysfs_ops, 331 + .default_attrs = default_hw_ctx_attrs, 332 + .release = blk_mq_sysfs_release, 333 + }; 334 + 335 + void blk_mq_unregister_disk(struct gendisk *disk) 336 + { 337 + struct request_queue *q = disk->queue; 338 + 339 + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 340 + kobject_del(&q->mq_kobj); 341 + 342 + kobject_put(&disk_to_dev(disk)->kobj); 343 + } 344 + 345 + int blk_mq_register_disk(struct gendisk *disk) 346 + { 347 + struct device *dev = disk_to_dev(disk); 348 + struct request_queue *q = disk->queue; 349 + struct blk_mq_hw_ctx *hctx; 350 + struct blk_mq_ctx *ctx; 351 + int ret, i, j; 352 + 353 + kobject_init(&q->mq_kobj, &blk_mq_ktype); 354 + 355 + ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 356 + if (ret < 0) 357 + return ret; 358 + 359 + kobject_uevent(&q->mq_kobj, KOBJ_ADD); 360 
+ 361 + queue_for_each_hw_ctx(q, hctx, i) { 362 + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 363 + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); 364 + if (ret) 365 + break; 366 + 367 + if (!hctx->nr_ctx) 368 + continue; 369 + 370 + hctx_for_each_ctx(hctx, ctx, j) { 371 + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); 372 + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); 373 + if (ret) 374 + break; 375 + } 376 + } 377 + 378 + if (ret) { 379 + blk_mq_unregister_disk(disk); 380 + return ret; 381 + } 382 + 383 + return 0; 384 + }
+204
block/blk-mq-tag.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/percpu_ida.h> 4 + 5 + #include <linux/blk-mq.h> 6 + #include "blk.h" 7 + #include "blk-mq.h" 8 + #include "blk-mq-tag.h" 9 + 10 + /* 11 + * Per tagged queue (tag address space) map 12 + */ 13 + struct blk_mq_tags { 14 + unsigned int nr_tags; 15 + unsigned int nr_reserved_tags; 16 + unsigned int nr_batch_move; 17 + unsigned int nr_max_cache; 18 + 19 + struct percpu_ida free_tags; 20 + struct percpu_ida reserved_tags; 21 + }; 22 + 23 + void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 24 + { 25 + int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 26 + blk_mq_put_tag(tags, tag); 27 + } 28 + 29 + bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 30 + { 31 + return !tags || 32 + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 33 + } 34 + 35 + static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 36 + { 37 + int tag; 38 + 39 + tag = percpu_ida_alloc(&tags->free_tags, gfp); 40 + if (tag < 0) 41 + return BLK_MQ_TAG_FAIL; 42 + return tag + tags->nr_reserved_tags; 43 + } 44 + 45 + static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 46 + gfp_t gfp) 47 + { 48 + int tag; 49 + 50 + if (unlikely(!tags->nr_reserved_tags)) { 51 + WARN_ON_ONCE(1); 52 + return BLK_MQ_TAG_FAIL; 53 + } 54 + 55 + tag = percpu_ida_alloc(&tags->reserved_tags, gfp); 56 + if (tag < 0) 57 + return BLK_MQ_TAG_FAIL; 58 + return tag; 59 + } 60 + 61 + unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 62 + { 63 + if (!reserved) 64 + return __blk_mq_get_tag(tags, gfp); 65 + 66 + return __blk_mq_get_reserved_tag(tags, gfp); 67 + } 68 + 69 + static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 70 + { 71 + BUG_ON(tag >= tags->nr_tags); 72 + 73 + percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 74 + } 75 + 76 + static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 77 + unsigned int tag) 78 + { 79 + BUG_ON(tag >= tags->nr_reserved_tags); 80 + 81 + percpu_ida_free(&tags->reserved_tags, tag); 82 + } 83 + 84 + void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 85 + { 86 + if (tag >= tags->nr_reserved_tags) 87 + __blk_mq_put_tag(tags, tag); 88 + else 89 + __blk_mq_put_reserved_tag(tags, tag); 90 + } 91 + 92 + static int __blk_mq_tag_iter(unsigned id, void *data) 93 + { 94 + unsigned long *tag_map = data; 95 + __set_bit(id, tag_map); 96 + return 0; 97 + } 98 + 99 + void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 100 + void (*fn)(void *, unsigned long *), void *data) 101 + { 102 + unsigned long *tag_map; 103 + size_t map_size; 104 + 105 + map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; 106 + tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); 107 + if (!tag_map) 108 + return; 109 + 110 + percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 111 + if (tags->nr_reserved_tags) 112 + percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 113 + tag_map); 114 + 115 + fn(data, tag_map); 116 + kfree(tag_map); 117 + } 118 + 119 + struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 120 + unsigned int reserved_tags, int node) 121 + { 122 + unsigned int nr_tags, nr_cache; 123 + struct blk_mq_tags *tags; 124 + int ret; 125 + 126 + if (total_tags > BLK_MQ_TAG_MAX) { 127 + pr_err("blk-mq: tag depth too large\n"); 128 + return NULL; 129 + } 130 + 131 + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); 132 + if (!tags) 133 + return NULL; 134 + 135 + nr_tags = total_tags 
- reserved_tags; 136 + nr_cache = nr_tags / num_possible_cpus(); 137 + 138 + if (nr_cache < BLK_MQ_TAG_CACHE_MIN) 139 + nr_cache = BLK_MQ_TAG_CACHE_MIN; 140 + else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) 141 + nr_cache = BLK_MQ_TAG_CACHE_MAX; 142 + 143 + tags->nr_tags = total_tags; 144 + tags->nr_reserved_tags = reserved_tags; 145 + tags->nr_max_cache = nr_cache; 146 + tags->nr_batch_move = max(1u, nr_cache / 2); 147 + 148 + ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - 149 + tags->nr_reserved_tags, 150 + tags->nr_max_cache, 151 + tags->nr_batch_move); 152 + if (ret) 153 + goto err_free_tags; 154 + 155 + if (reserved_tags) { 156 + /* 157 + * With max_cahe and batch set to 1, the allocator fallbacks to 158 + * no cached. It's fine reserved tags allocation is slow. 159 + */ 160 + ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 161 + 1, 1); 162 + if (ret) 163 + goto err_reserved_tags; 164 + } 165 + 166 + return tags; 167 + 168 + err_reserved_tags: 169 + percpu_ida_destroy(&tags->free_tags); 170 + err_free_tags: 171 + kfree(tags); 172 + return NULL; 173 + } 174 + 175 + void blk_mq_free_tags(struct blk_mq_tags *tags) 176 + { 177 + percpu_ida_destroy(&tags->free_tags); 178 + percpu_ida_destroy(&tags->reserved_tags); 179 + kfree(tags); 180 + } 181 + 182 + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 183 + { 184 + char *orig_page = page; 185 + int cpu; 186 + 187 + if (!tags) 188 + return 0; 189 + 190 + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 191 + " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 192 + tags->nr_batch_move, tags->nr_max_cache); 193 + 194 + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 195 + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 196 + percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); 197 + 198 + for_each_possible_cpu(cpu) { 199 + page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 200 + percpu_ida_free_tags(&tags->free_tags, cpu)); 201 + } 202 + 203 + return page - orig_page; 204 + }
+27
block/blk-mq-tag.h
··· 1 + #ifndef INT_BLK_MQ_TAG_H 2 + #define INT_BLK_MQ_TAG_H 3 + 4 + struct blk_mq_tags; 5 + 6 + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 7 + extern void blk_mq_free_tags(struct blk_mq_tags *tags); 8 + 9 + extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 10 + extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 11 + extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); 12 + extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); 13 + extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 14 + extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 15 + 16 + enum { 17 + BLK_MQ_TAG_CACHE_MIN = 1, 18 + BLK_MQ_TAG_CACHE_MAX = 64, 19 + }; 20 + 21 + enum { 22 + BLK_MQ_TAG_FAIL = -1U, 23 + BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, 24 + BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 25 + }; 26 + 27 + #endif
+1480
block/blk-mq.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/backing-dev.h> 4 + #include <linux/bio.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/mm.h> 7 + #include <linux/init.h> 8 + #include <linux/slab.h> 9 + #include <linux/workqueue.h> 10 + #include <linux/smp.h> 11 + #include <linux/llist.h> 12 + #include <linux/list_sort.h> 13 + #include <linux/cpu.h> 14 + #include <linux/cache.h> 15 + #include <linux/sched/sysctl.h> 16 + #include <linux/delay.h> 17 + 18 + #include <trace/events/block.h> 19 + 20 + #include <linux/blk-mq.h> 21 + #include "blk.h" 22 + #include "blk-mq.h" 23 + #include "blk-mq-tag.h" 24 + 25 + static DEFINE_MUTEX(all_q_mutex); 26 + static LIST_HEAD(all_q_list); 27 + 28 + static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 29 + 30 + DEFINE_PER_CPU(struct llist_head, ipi_lists); 31 + 32 + static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 33 + unsigned int cpu) 34 + { 35 + return per_cpu_ptr(q->queue_ctx, cpu); 36 + } 37 + 38 + /* 39 + * This assumes per-cpu software queueing queues. They could be per-node 40 + * as well, for instance. For now this is hardcoded as-is. Note that we don't 41 + * care about preemption, since we know the ctx's are persistent. This does 42 + * mean that we can't rely on ctx always matching the currently running CPU. 43 + */ 44 + static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) 45 + { 46 + return __blk_mq_get_ctx(q, get_cpu()); 47 + } 48 + 49 + static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) 50 + { 51 + put_cpu(); 52 + } 53 + 54 + /* 55 + * Check if any of the ctx's have pending work in this hardware queue 56 + */ 57 + static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 58 + { 59 + unsigned int i; 60 + 61 + for (i = 0; i < hctx->nr_ctx_map; i++) 62 + if (hctx->ctx_map[i]) 63 + return true; 64 + 65 + return false; 66 + } 67 + 68 + /* 69 + * Mark this ctx as having pending work in this hardware queue 70 + */ 71 + static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 72 + struct blk_mq_ctx *ctx) 73 + { 74 + if (!test_bit(ctx->index_hw, hctx->ctx_map)) 75 + set_bit(ctx->index_hw, hctx->ctx_map); 76 + } 77 + 78 + static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, 79 + bool reserved) 80 + { 81 + struct request *rq; 82 + unsigned int tag; 83 + 84 + tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 85 + if (tag != BLK_MQ_TAG_FAIL) { 86 + rq = hctx->rqs[tag]; 87 + rq->tag = tag; 88 + 89 + return rq; 90 + } 91 + 92 + return NULL; 93 + } 94 + 95 + static int blk_mq_queue_enter(struct request_queue *q) 96 + { 97 + int ret; 98 + 99 + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 100 + smp_wmb(); 101 + /* we have problems to freeze the queue if it's initializing */ 102 + if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) 103 + return 0; 104 + 105 + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 106 + 107 + spin_lock_irq(q->queue_lock); 108 + ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, 109 + !blk_queue_bypass(q), *q->queue_lock); 110 + /* inc usage with lock hold to avoid freeze_queue runs here */ 111 + if (!ret) 112 + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 113 + spin_unlock_irq(q->queue_lock); 114 + 115 + return ret; 116 + } 117 + 118 + static void blk_mq_queue_exit(struct request_queue *q) 119 + { 120 + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 121 + } 122 + 123 + /* 124 + * Guarantee no request is in use, so we can change any data structure of 125 + * the queue 
afterward. 126 + */ 127 + static void blk_mq_freeze_queue(struct request_queue *q) 128 + { 129 + bool drain; 130 + 131 + spin_lock_irq(q->queue_lock); 132 + drain = !q->bypass_depth++; 133 + queue_flag_set(QUEUE_FLAG_BYPASS, q); 134 + spin_unlock_irq(q->queue_lock); 135 + 136 + if (!drain) 137 + return; 138 + 139 + while (true) { 140 + s64 count; 141 + 142 + spin_lock_irq(q->queue_lock); 143 + count = percpu_counter_sum(&q->mq_usage_counter); 144 + spin_unlock_irq(q->queue_lock); 145 + 146 + if (count == 0) 147 + break; 148 + blk_mq_run_queues(q, false); 149 + msleep(10); 150 + } 151 + } 152 + 153 + static void blk_mq_unfreeze_queue(struct request_queue *q) 154 + { 155 + bool wake = false; 156 + 157 + spin_lock_irq(q->queue_lock); 158 + if (!--q->bypass_depth) { 159 + queue_flag_clear(QUEUE_FLAG_BYPASS, q); 160 + wake = true; 161 + } 162 + WARN_ON_ONCE(q->bypass_depth < 0); 163 + spin_unlock_irq(q->queue_lock); 164 + if (wake) 165 + wake_up_all(&q->mq_freeze_wq); 166 + } 167 + 168 + bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 169 + { 170 + return blk_mq_has_free_tags(hctx->tags); 171 + } 172 + EXPORT_SYMBOL(blk_mq_can_queue); 173 + 174 + static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq, 175 + unsigned int rw_flags) 176 + { 177 + rq->mq_ctx = ctx; 178 + rq->cmd_flags = rw_flags; 179 + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 180 + } 181 + 182 + static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 183 + gfp_t gfp, bool reserved) 184 + { 185 + return blk_mq_alloc_rq(hctx, gfp, reserved); 186 + } 187 + 188 + static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 189 + int rw, gfp_t gfp, 190 + bool reserved) 191 + { 192 + struct request *rq; 193 + 194 + do { 195 + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 196 + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 197 + 198 + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 199 + if (rq) { 200 + blk_mq_rq_ctx_init(ctx, rq, rw); 201 + break; 202 + } else if (!(gfp & __GFP_WAIT)) 203 + break; 204 + 205 + blk_mq_put_ctx(ctx); 206 + __blk_mq_run_hw_queue(hctx); 207 + blk_mq_wait_for_tags(hctx->tags); 208 + } while (1); 209 + 210 + return rq; 211 + } 212 + 213 + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) 214 + { 215 + struct request *rq; 216 + 217 + if (blk_mq_queue_enter(q)) 218 + return NULL; 219 + 220 + rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); 221 + blk_mq_put_ctx(rq->mq_ctx); 222 + return rq; 223 + } 224 + 225 + struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, 226 + gfp_t gfp) 227 + { 228 + struct request *rq; 229 + 230 + if (blk_mq_queue_enter(q)) 231 + return NULL; 232 + 233 + rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 234 + blk_mq_put_ctx(rq->mq_ctx); 235 + return rq; 236 + } 237 + EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 238 + 239 + /* 240 + * Re-init and set pdu, if we have it 241 + */ 242 + static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) 243 + { 244 + blk_rq_init(hctx->queue, rq); 245 + 246 + if (hctx->cmd_size) 247 + rq->special = blk_mq_rq_to_pdu(rq); 248 + } 249 + 250 + static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 251 + struct blk_mq_ctx *ctx, struct request *rq) 252 + { 253 + const int tag = rq->tag; 254 + struct request_queue *q = rq->q; 255 + 256 + blk_mq_rq_init(hctx, rq); 257 + blk_mq_put_tag(hctx->tags, tag); 258 + 259 + blk_mq_queue_exit(q); 260 + } 261 + 262 + void blk_mq_free_request(struct 
request *rq) 263 + { 264 + struct blk_mq_ctx *ctx = rq->mq_ctx; 265 + struct blk_mq_hw_ctx *hctx; 266 + struct request_queue *q = rq->q; 267 + 268 + ctx->rq_completed[rq_is_sync(rq)]++; 269 + 270 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 271 + __blk_mq_free_request(hctx, ctx, rq); 272 + } 273 + 274 + static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) 275 + { 276 + if (error) 277 + clear_bit(BIO_UPTODATE, &bio->bi_flags); 278 + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 279 + error = -EIO; 280 + 281 + if (unlikely(rq->cmd_flags & REQ_QUIET)) 282 + set_bit(BIO_QUIET, &bio->bi_flags); 283 + 284 + /* don't actually finish bio if it's part of flush sequence */ 285 + if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) 286 + bio_endio(bio, error); 287 + } 288 + 289 + void blk_mq_complete_request(struct request *rq, int error) 290 + { 291 + struct bio *bio = rq->bio; 292 + unsigned int bytes = 0; 293 + 294 + trace_block_rq_complete(rq->q, rq); 295 + 296 + while (bio) { 297 + struct bio *next = bio->bi_next; 298 + 299 + bio->bi_next = NULL; 300 + bytes += bio->bi_size; 301 + blk_mq_bio_endio(rq, bio, error); 302 + bio = next; 303 + } 304 + 305 + blk_account_io_completion(rq, bytes); 306 + 307 + if (rq->end_io) 308 + rq->end_io(rq, error); 309 + else 310 + blk_mq_free_request(rq); 311 + 312 + blk_account_io_done(rq); 313 + } 314 + 315 + void __blk_mq_end_io(struct request *rq, int error) 316 + { 317 + if (!blk_mark_rq_complete(rq)) 318 + blk_mq_complete_request(rq, error); 319 + } 320 + 321 + #if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) 322 + 323 + /* 324 + * Called with interrupts disabled. 325 + */ 326 + static void ipi_end_io(void *data) 327 + { 328 + struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id()); 329 + struct llist_node *entry, *next; 330 + struct request *rq; 331 + 332 + entry = llist_del_all(list); 333 + 334 + while (entry) { 335 + next = entry->next; 336 + rq = llist_entry(entry, struct request, ll_list); 337 + __blk_mq_end_io(rq, rq->errors); 338 + entry = next; 339 + } 340 + } 341 + 342 + static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, 343 + struct request *rq, const int error) 344 + { 345 + struct call_single_data *data = &rq->csd; 346 + 347 + rq->errors = error; 348 + rq->ll_list.next = NULL; 349 + 350 + /* 351 + * If the list is non-empty, an existing IPI must already 352 + * be "in flight". If that is the case, we need not schedule 353 + * a new one. 354 + */ 355 + if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) { 356 + data->func = ipi_end_io; 357 + data->flags = 0; 358 + __smp_call_function_single(ctx->cpu, data, 0); 359 + } 360 + 361 + return true; 362 + } 363 + #else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ 364 + static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, 365 + struct request *rq, const int error) 366 + { 367 + return false; 368 + } 369 + #endif 370 + 371 + /* 372 + * End IO on this request on a multiqueue enabled driver. We'll either do 373 + * it directly inline, or punt to a local IPI handler on the matching 374 + * remote CPU. 
375 + */ 376 + void blk_mq_end_io(struct request *rq, int error) 377 + { 378 + struct blk_mq_ctx *ctx = rq->mq_ctx; 379 + int cpu; 380 + 381 + if (!ctx->ipi_redirect) 382 + return __blk_mq_end_io(rq, error); 383 + 384 + cpu = get_cpu(); 385 + 386 + if (cpu == ctx->cpu || !cpu_online(ctx->cpu) || 387 + !ipi_remote_cpu(ctx, cpu, rq, error)) 388 + __blk_mq_end_io(rq, error); 389 + 390 + put_cpu(); 391 + } 392 + EXPORT_SYMBOL(blk_mq_end_io); 393 + 394 + static void blk_mq_start_request(struct request *rq) 395 + { 396 + struct request_queue *q = rq->q; 397 + 398 + trace_block_rq_issue(q, rq); 399 + 400 + /* 401 + * Just mark start time and set the started bit. Due to memory 402 + * ordering, we know we'll see the correct deadline as long as 403 + * REQ_ATOMIC_STARTED is seen. 404 + */ 405 + rq->deadline = jiffies + q->rq_timeout; 406 + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 407 + } 408 + 409 + static void blk_mq_requeue_request(struct request *rq) 410 + { 411 + struct request_queue *q = rq->q; 412 + 413 + trace_block_rq_requeue(q, rq); 414 + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 415 + } 416 + 417 + struct blk_mq_timeout_data { 418 + struct blk_mq_hw_ctx *hctx; 419 + unsigned long *next; 420 + unsigned int *next_set; 421 + }; 422 + 423 + static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) 424 + { 425 + struct blk_mq_timeout_data *data = __data; 426 + struct blk_mq_hw_ctx *hctx = data->hctx; 427 + unsigned int tag; 428 + 429 + /* It may not be in flight yet (this is where 430 + * the REQ_ATOMIC_STARTED flag comes in). The requests are 431 + * statically allocated, so we know it's always safe to access the 432 + * memory associated with a bit offset into ->rqs[]. 433 + */ 434 + tag = 0; 435 + do { 436 + struct request *rq; 437 + 438 + tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 439 + if (tag >= hctx->queue_depth) 440 + break; 441 + 442 + rq = hctx->rqs[tag++]; 443 + 444 + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 445 + continue; 446 + 447 + blk_rq_check_expired(rq, data->next, data->next_set); 448 + } while (1); 449 + } 450 + 451 + static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, 452 + unsigned long *next, 453 + unsigned int *next_set) 454 + { 455 + struct blk_mq_timeout_data data = { 456 + .hctx = hctx, 457 + .next = next, 458 + .next_set = next_set, 459 + }; 460 + 461 + /* 462 + * Ask the tagging code to iterate busy requests, so we can 463 + * check them for timeout. 464 + */ 465 + blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 466 + } 467 + 468 + static void blk_mq_rq_timer(unsigned long data) 469 + { 470 + struct request_queue *q = (struct request_queue *) data; 471 + struct blk_mq_hw_ctx *hctx; 472 + unsigned long next = 0; 473 + int i, next_set = 0; 474 + 475 + queue_for_each_hw_ctx(q, hctx, i) 476 + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 477 + 478 + if (next_set) 479 + mod_timer(&q->timeout, round_jiffies_up(next)); 480 + } 481 + 482 + /* 483 + * Reverse check our software queue for entries that we could potentially 484 + * merge with. Currently includes a hand-wavy stop count of 8, to not spend 485 + * too much time checking for merges. 
486 + */ 487 + static bool blk_mq_attempt_merge(struct request_queue *q, 488 + struct blk_mq_ctx *ctx, struct bio *bio) 489 + { 490 + struct request *rq; 491 + int checked = 8; 492 + 493 + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 494 + int el_ret; 495 + 496 + if (!checked--) 497 + break; 498 + 499 + if (!blk_rq_merge_ok(rq, bio)) 500 + continue; 501 + 502 + el_ret = blk_try_merge(rq, bio); 503 + if (el_ret == ELEVATOR_BACK_MERGE) { 504 + if (bio_attempt_back_merge(q, rq, bio)) { 505 + ctx->rq_merged++; 506 + return true; 507 + } 508 + break; 509 + } else if (el_ret == ELEVATOR_FRONT_MERGE) { 510 + if (bio_attempt_front_merge(q, rq, bio)) { 511 + ctx->rq_merged++; 512 + return true; 513 + } 514 + break; 515 + } 516 + } 517 + 518 + return false; 519 + } 520 + 521 + void blk_mq_add_timer(struct request *rq) 522 + { 523 + __blk_add_timer(rq, NULL); 524 + } 525 + 526 + /* 527 + * Run this hardware queue, pulling any software queues mapped to it in. 528 + * Note that this function currently has various problems around ordering 529 + * of IO. In particular, we'd like FIFO behaviour on handling existing 530 + * items on the hctx->dispatch list. Ignore that for now. 531 + */ 532 + static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 533 + { 534 + struct request_queue *q = hctx->queue; 535 + struct blk_mq_ctx *ctx; 536 + struct request *rq; 537 + LIST_HEAD(rq_list); 538 + int bit, queued; 539 + 540 + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 541 + return; 542 + 543 + hctx->run++; 544 + 545 + /* 546 + * Touch any software queue that has pending entries. 547 + */ 548 + for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 549 + clear_bit(bit, hctx->ctx_map); 550 + ctx = hctx->ctxs[bit]; 551 + BUG_ON(bit != ctx->index_hw); 552 + 553 + spin_lock(&ctx->lock); 554 + list_splice_tail_init(&ctx->rq_list, &rq_list); 555 + spin_unlock(&ctx->lock); 556 + } 557 + 558 + /* 559 + * If we have previous entries on our dispatch list, grab them 560 + * and stuff them at the front for more fair dispatch. 561 + */ 562 + if (!list_empty_careful(&hctx->dispatch)) { 563 + spin_lock(&hctx->lock); 564 + if (!list_empty(&hctx->dispatch)) 565 + list_splice_init(&hctx->dispatch, &rq_list); 566 + spin_unlock(&hctx->lock); 567 + } 568 + 569 + /* 570 + * Delete and return all entries from our dispatch list 571 + */ 572 + queued = 0; 573 + 574 + /* 575 + * Now process all the entries, sending them to the driver. 576 + */ 577 + while (!list_empty(&rq_list)) { 578 + int ret; 579 + 580 + rq = list_first_entry(&rq_list, struct request, queuelist); 581 + list_del_init(&rq->queuelist); 582 + blk_mq_start_request(rq); 583 + 584 + /* 585 + * Last request in the series. Flag it as such, this 586 + * enables drivers to know when IO should be kicked off, 587 + * if they don't do it on a per-request basis. 588 + * 589 + * Note: the flag isn't the only condition drivers 590 + * should do kick off. If drive is busy, the last 591 + * request might not have the bit set. 
592 + */ 593 + if (list_empty(&rq_list)) 594 + rq->cmd_flags |= REQ_END; 595 + 596 + ret = q->mq_ops->queue_rq(hctx, rq); 597 + switch (ret) { 598 + case BLK_MQ_RQ_QUEUE_OK: 599 + queued++; 600 + continue; 601 + case BLK_MQ_RQ_QUEUE_BUSY: 602 + /* 603 + * FIXME: we should have a mechanism to stop the queue 604 + * like blk_stop_queue, otherwise we will waste cpu 605 + * time 606 + */ 607 + list_add(&rq->queuelist, &rq_list); 608 + blk_mq_requeue_request(rq); 609 + break; 610 + default: 611 + pr_err("blk-mq: bad return on queue: %d\n", ret); 612 + rq->errors = -EIO; 613 + case BLK_MQ_RQ_QUEUE_ERROR: 614 + blk_mq_end_io(rq, rq->errors); 615 + break; 616 + } 617 + 618 + if (ret == BLK_MQ_RQ_QUEUE_BUSY) 619 + break; 620 + } 621 + 622 + if (!queued) 623 + hctx->dispatched[0]++; 624 + else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 625 + hctx->dispatched[ilog2(queued) + 1]++; 626 + 627 + /* 628 + * Any items that need requeuing? Stuff them into hctx->dispatch, 629 + * that is where we will continue on next queue run. 630 + */ 631 + if (!list_empty(&rq_list)) { 632 + spin_lock(&hctx->lock); 633 + list_splice(&rq_list, &hctx->dispatch); 634 + spin_unlock(&hctx->lock); 635 + } 636 + } 637 + 638 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 639 + { 640 + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 641 + return; 642 + 643 + if (!async) 644 + __blk_mq_run_hw_queue(hctx); 645 + else { 646 + struct request_queue *q = hctx->queue; 647 + 648 + kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 649 + } 650 + } 651 + 652 + void blk_mq_run_queues(struct request_queue *q, bool async) 653 + { 654 + struct blk_mq_hw_ctx *hctx; 655 + int i; 656 + 657 + queue_for_each_hw_ctx(q, hctx, i) { 658 + if ((!blk_mq_hctx_has_pending(hctx) && 659 + list_empty_careful(&hctx->dispatch)) || 660 + test_bit(BLK_MQ_S_STOPPED, &hctx->flags)) 661 + continue; 662 + 663 + blk_mq_run_hw_queue(hctx, async); 664 + } 665 + } 666 + EXPORT_SYMBOL(blk_mq_run_queues); 667 + 668 + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 669 + { 670 + cancel_delayed_work(&hctx->delayed_work); 671 + set_bit(BLK_MQ_S_STOPPED, &hctx->state); 672 + } 673 + EXPORT_SYMBOL(blk_mq_stop_hw_queue); 674 + 675 + void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 676 + { 677 + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 678 + __blk_mq_run_hw_queue(hctx); 679 + } 680 + EXPORT_SYMBOL(blk_mq_start_hw_queue); 681 + 682 + void blk_mq_start_stopped_hw_queues(struct request_queue *q) 683 + { 684 + struct blk_mq_hw_ctx *hctx; 685 + int i; 686 + 687 + queue_for_each_hw_ctx(q, hctx, i) { 688 + if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 689 + continue; 690 + 691 + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 692 + blk_mq_run_hw_queue(hctx, true); 693 + } 694 + } 695 + EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 696 + 697 + static void blk_mq_work_fn(struct work_struct *work) 698 + { 699 + struct blk_mq_hw_ctx *hctx; 700 + 701 + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 702 + __blk_mq_run_hw_queue(hctx); 703 + } 704 + 705 + static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 706 + struct request *rq) 707 + { 708 + struct blk_mq_ctx *ctx = rq->mq_ctx; 709 + 710 + list_add_tail(&rq->queuelist, &ctx->rq_list); 711 + blk_mq_hctx_mark_pending(hctx, ctx); 712 + 713 + /* 714 + * We do this early, to ensure we are on the right CPU. 
715 + */ 716 + blk_mq_add_timer(rq); 717 + } 718 + 719 + void blk_mq_insert_request(struct request_queue *q, struct request *rq, 720 + bool run_queue) 721 + { 722 + struct blk_mq_hw_ctx *hctx; 723 + struct blk_mq_ctx *ctx, *current_ctx; 724 + 725 + ctx = rq->mq_ctx; 726 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 727 + 728 + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { 729 + blk_insert_flush(rq); 730 + } else { 731 + current_ctx = blk_mq_get_ctx(q); 732 + 733 + if (!cpu_online(ctx->cpu)) { 734 + ctx = current_ctx; 735 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 736 + rq->mq_ctx = ctx; 737 + } 738 + spin_lock(&ctx->lock); 739 + __blk_mq_insert_request(hctx, rq); 740 + spin_unlock(&ctx->lock); 741 + 742 + blk_mq_put_ctx(current_ctx); 743 + } 744 + 745 + if (run_queue) 746 + __blk_mq_run_hw_queue(hctx); 747 + } 748 + EXPORT_SYMBOL(blk_mq_insert_request); 749 + 750 + /* 751 + * This is a special version of blk_mq_insert_request to bypass FLUSH request 752 + * check. Should only be used internally. 753 + */ 754 + void blk_mq_run_request(struct request *rq, bool run_queue, bool async) 755 + { 756 + struct request_queue *q = rq->q; 757 + struct blk_mq_hw_ctx *hctx; 758 + struct blk_mq_ctx *ctx, *current_ctx; 759 + 760 + current_ctx = blk_mq_get_ctx(q); 761 + 762 + ctx = rq->mq_ctx; 763 + if (!cpu_online(ctx->cpu)) { 764 + ctx = current_ctx; 765 + rq->mq_ctx = ctx; 766 + } 767 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 768 + 769 + /* ctx->cpu might be offline */ 770 + spin_lock(&ctx->lock); 771 + __blk_mq_insert_request(hctx, rq); 772 + spin_unlock(&ctx->lock); 773 + 774 + blk_mq_put_ctx(current_ctx); 775 + 776 + if (run_queue) 777 + blk_mq_run_hw_queue(hctx, async); 778 + } 779 + 780 + static void blk_mq_insert_requests(struct request_queue *q, 781 + struct blk_mq_ctx *ctx, 782 + struct list_head *list, 783 + int depth, 784 + bool from_schedule) 785 + 786 + { 787 + struct blk_mq_hw_ctx *hctx; 788 + struct blk_mq_ctx *current_ctx; 789 + 790 + trace_block_unplug(q, depth, !from_schedule); 791 + 792 + current_ctx = blk_mq_get_ctx(q); 793 + 794 + if (!cpu_online(ctx->cpu)) 795 + ctx = current_ctx; 796 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 797 + 798 + /* 799 + * preemption doesn't flush plug list, so it's possible ctx->cpu is 800 + * offline now 801 + */ 802 + spin_lock(&ctx->lock); 803 + while (!list_empty(list)) { 804 + struct request *rq; 805 + 806 + rq = list_first_entry(list, struct request, queuelist); 807 + list_del_init(&rq->queuelist); 808 + rq->mq_ctx = ctx; 809 + __blk_mq_insert_request(hctx, rq); 810 + } 811 + spin_unlock(&ctx->lock); 812 + 813 + blk_mq_put_ctx(current_ctx); 814 + 815 + blk_mq_run_hw_queue(hctx, from_schedule); 816 + } 817 + 818 + static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 819 + { 820 + struct request *rqa = container_of(a, struct request, queuelist); 821 + struct request *rqb = container_of(b, struct request, queuelist); 822 + 823 + return !(rqa->mq_ctx < rqb->mq_ctx || 824 + (rqa->mq_ctx == rqb->mq_ctx && 825 + blk_rq_pos(rqa) < blk_rq_pos(rqb))); 826 + } 827 + 828 + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 829 + { 830 + struct blk_mq_ctx *this_ctx; 831 + struct request_queue *this_q; 832 + struct request *rq; 833 + LIST_HEAD(list); 834 + LIST_HEAD(ctx_list); 835 + unsigned int depth; 836 + 837 + list_splice_init(&plug->mq_list, &list); 838 + 839 + list_sort(NULL, &list, plug_ctx_cmp); 840 + 841 + this_q = NULL; 842 + this_ctx = NULL; 843 + depth = 0; 844 + 845 + while (!list_empty(&list)) { 846 + rq 
= list_entry_rq(list.next); 847 + list_del_init(&rq->queuelist); 848 + BUG_ON(!rq->q); 849 + if (rq->mq_ctx != this_ctx) { 850 + if (this_ctx) { 851 + blk_mq_insert_requests(this_q, this_ctx, 852 + &ctx_list, depth, 853 + from_schedule); 854 + } 855 + 856 + this_ctx = rq->mq_ctx; 857 + this_q = rq->q; 858 + depth = 0; 859 + } 860 + 861 + depth++; 862 + list_add_tail(&rq->queuelist, &ctx_list); 863 + } 864 + 865 + /* 866 + * If 'this_ctx' is set, we know we have entries to complete 867 + * on 'ctx_list'. Do those. 868 + */ 869 + if (this_ctx) { 870 + blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 871 + from_schedule); 872 + } 873 + } 874 + 875 + static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 876 + { 877 + init_request_from_bio(rq, bio); 878 + blk_account_io_start(rq, 1); 879 + } 880 + 881 + static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 882 + { 883 + struct blk_mq_hw_ctx *hctx; 884 + struct blk_mq_ctx *ctx; 885 + const int is_sync = rw_is_sync(bio->bi_rw); 886 + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 887 + int rw = bio_data_dir(bio); 888 + struct request *rq; 889 + unsigned int use_plug, request_count = 0; 890 + 891 + /* 892 + * If we have multiple hardware queues, just go directly to 893 + * one of those for sync IO. 894 + */ 895 + use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 896 + 897 + blk_queue_bounce(q, &bio); 898 + 899 + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 900 + return; 901 + 902 + if (blk_mq_queue_enter(q)) { 903 + bio_endio(bio, -EIO); 904 + return; 905 + } 906 + 907 + ctx = blk_mq_get_ctx(q); 908 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 909 + 910 + trace_block_getrq(q, bio, rw); 911 + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); 912 + if (likely(rq)) 913 + blk_mq_rq_ctx_init(ctx, rq, rw); 914 + else { 915 + blk_mq_put_ctx(ctx); 916 + trace_block_sleeprq(q, bio, rw); 917 + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, 918 + false); 919 + ctx = rq->mq_ctx; 920 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 921 + } 922 + 923 + hctx->queued++; 924 + 925 + if (unlikely(is_flush_fua)) { 926 + blk_mq_bio_to_request(rq, bio); 927 + blk_mq_put_ctx(ctx); 928 + blk_insert_flush(rq); 929 + goto run_queue; 930 + } 931 + 932 + /* 933 + * A task plug currently exists. Since this is completely lockless, 934 + * utilize that to temporarily store requests until the task is 935 + * either done or scheduled away. 936 + */ 937 + if (use_plug) { 938 + struct blk_plug *plug = current->plug; 939 + 940 + if (plug) { 941 + blk_mq_bio_to_request(rq, bio); 942 + if (list_empty(&plug->list)) 943 + trace_block_plug(q); 944 + else if (request_count >= BLK_MAX_REQUEST_COUNT) { 945 + blk_flush_plug_list(plug, false); 946 + trace_block_plug(q); 947 + } 948 + list_add_tail(&rq->queuelist, &plug->mq_list); 949 + blk_mq_put_ctx(ctx); 950 + return; 951 + } 952 + } 953 + 954 + spin_lock(&ctx->lock); 955 + 956 + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 957 + blk_mq_attempt_merge(q, ctx, bio)) 958 + __blk_mq_free_request(hctx, ctx, rq); 959 + else { 960 + blk_mq_bio_to_request(rq, bio); 961 + __blk_mq_insert_request(hctx, rq); 962 + } 963 + 964 + spin_unlock(&ctx->lock); 965 + blk_mq_put_ctx(ctx); 966 + 967 + /* 968 + * For a SYNC request, send it to the hardware immediately. For an 969 + * ASYNC request, just ensure that we run it later on. The latter 970 + * allows for merging opportunities and more efficient dispatching. 
971 + */ 972 + run_queue: 973 + blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); 974 + } 975 + 976 + /* 977 + * Default mapping to a software queue, since we use one per CPU. 978 + */ 979 + struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 980 + { 981 + return q->queue_hw_ctx[q->mq_map[cpu]]; 982 + } 983 + EXPORT_SYMBOL(blk_mq_map_queue); 984 + 985 + struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 986 + unsigned int hctx_index) 987 + { 988 + return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 989 + GFP_KERNEL | __GFP_ZERO, reg->numa_node); 990 + } 991 + EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); 992 + 993 + void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 994 + unsigned int hctx_index) 995 + { 996 + kfree(hctx); 997 + } 998 + EXPORT_SYMBOL(blk_mq_free_single_hw_queue); 999 + 1000 + static void blk_mq_hctx_notify(void *data, unsigned long action, 1001 + unsigned int cpu) 1002 + { 1003 + struct blk_mq_hw_ctx *hctx = data; 1004 + struct blk_mq_ctx *ctx; 1005 + LIST_HEAD(tmp); 1006 + 1007 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1008 + return; 1009 + 1010 + /* 1011 + * Move ctx entries to new CPU, if this one is going away. 1012 + */ 1013 + ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1014 + 1015 + spin_lock(&ctx->lock); 1016 + if (!list_empty(&ctx->rq_list)) { 1017 + list_splice_init(&ctx->rq_list, &tmp); 1018 + clear_bit(ctx->index_hw, hctx->ctx_map); 1019 + } 1020 + spin_unlock(&ctx->lock); 1021 + 1022 + if (list_empty(&tmp)) 1023 + return; 1024 + 1025 + ctx = blk_mq_get_ctx(hctx->queue); 1026 + spin_lock(&ctx->lock); 1027 + 1028 + while (!list_empty(&tmp)) { 1029 + struct request *rq; 1030 + 1031 + rq = list_first_entry(&tmp, struct request, queuelist); 1032 + rq->mq_ctx = ctx; 1033 + list_move_tail(&rq->queuelist, &ctx->rq_list); 1034 + } 1035 + 1036 + blk_mq_hctx_mark_pending(hctx, ctx); 1037 + 1038 + spin_unlock(&ctx->lock); 1039 + blk_mq_put_ctx(ctx); 1040 + } 1041 + 1042 + static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1043 + void (*init)(void *, struct blk_mq_hw_ctx *, 1044 + struct request *, unsigned int), 1045 + void *data) 1046 + { 1047 + unsigned int i; 1048 + 1049 + for (i = 0; i < hctx->queue_depth; i++) { 1050 + struct request *rq = hctx->rqs[i]; 1051 + 1052 + init(data, hctx, rq, i); 1053 + } 1054 + } 1055 + 1056 + void blk_mq_init_commands(struct request_queue *q, 1057 + void (*init)(void *, struct blk_mq_hw_ctx *, 1058 + struct request *, unsigned int), 1059 + void *data) 1060 + { 1061 + struct blk_mq_hw_ctx *hctx; 1062 + unsigned int i; 1063 + 1064 + queue_for_each_hw_ctx(q, hctx, i) 1065 + blk_mq_init_hw_commands(hctx, init, data); 1066 + } 1067 + EXPORT_SYMBOL(blk_mq_init_commands); 1068 + 1069 + static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1070 + { 1071 + struct page *page; 1072 + 1073 + while (!list_empty(&hctx->page_list)) { 1074 + page = list_first_entry(&hctx->page_list, struct page, list); 1075 + list_del_init(&page->list); 1076 + __free_pages(page, page->private); 1077 + } 1078 + 1079 + kfree(hctx->rqs); 1080 + 1081 + if (hctx->tags) 1082 + blk_mq_free_tags(hctx->tags); 1083 + } 1084 + 1085 + static size_t order_to_size(unsigned int order) 1086 + { 1087 + size_t ret = PAGE_SIZE; 1088 + 1089 + while (order--) 1090 + ret *= 2; 1091 + 1092 + return ret; 1093 + } 1094 + 1095 + static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1096 + unsigned int reserved_tags, int node) 1097 + { 1098 + unsigned int i, j, entries_per_page, max_order = 4; 1099 + 
size_t rq_size, left; 1100 + 1101 + INIT_LIST_HEAD(&hctx->page_list); 1102 + 1103 + hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), 1104 + GFP_KERNEL, node); 1105 + if (!hctx->rqs) 1106 + return -ENOMEM; 1107 + 1108 + /* 1109 + * rq_size is the size of the request plus driver payload, rounded 1110 + * to the cacheline size 1111 + */ 1112 + rq_size = round_up(sizeof(struct request) + hctx->cmd_size, 1113 + cache_line_size()); 1114 + left = rq_size * hctx->queue_depth; 1115 + 1116 + for (i = 0; i < hctx->queue_depth;) { 1117 + int this_order = max_order; 1118 + struct page *page; 1119 + int to_do; 1120 + void *p; 1121 + 1122 + while (left < order_to_size(this_order - 1) && this_order) 1123 + this_order--; 1124 + 1125 + do { 1126 + page = alloc_pages_node(node, GFP_KERNEL, this_order); 1127 + if (page) 1128 + break; 1129 + if (!this_order--) 1130 + break; 1131 + if (order_to_size(this_order) < rq_size) 1132 + break; 1133 + } while (1); 1134 + 1135 + if (!page) 1136 + break; 1137 + 1138 + page->private = this_order; 1139 + list_add_tail(&page->list, &hctx->page_list); 1140 + 1141 + p = page_address(page); 1142 + entries_per_page = order_to_size(this_order) / rq_size; 1143 + to_do = min(entries_per_page, hctx->queue_depth - i); 1144 + left -= to_do * rq_size; 1145 + for (j = 0; j < to_do; j++) { 1146 + hctx->rqs[i] = p; 1147 + blk_mq_rq_init(hctx, hctx->rqs[i]); 1148 + p += rq_size; 1149 + i++; 1150 + } 1151 + } 1152 + 1153 + if (i < (reserved_tags + BLK_MQ_TAG_MIN)) 1154 + goto err_rq_map; 1155 + else if (i != hctx->queue_depth) { 1156 + hctx->queue_depth = i; 1157 + pr_warn("%s: queue depth set to %u because of low memory\n", 1158 + __func__, i); 1159 + } 1160 + 1161 + hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1162 + if (!hctx->tags) { 1163 + err_rq_map: 1164 + blk_mq_free_rq_map(hctx); 1165 + return -ENOMEM; 1166 + } 1167 + 1168 + return 0; 1169 + } 1170 + 1171 + static int blk_mq_init_hw_queues(struct request_queue *q, 1172 + struct blk_mq_reg *reg, void *driver_data) 1173 + { 1174 + struct blk_mq_hw_ctx *hctx; 1175 + unsigned int i, j; 1176 + 1177 + /* 1178 + * Initialize hardware queues 1179 + */ 1180 + queue_for_each_hw_ctx(q, hctx, i) { 1181 + unsigned int num_maps; 1182 + int node; 1183 + 1184 + node = hctx->numa_node; 1185 + if (node == NUMA_NO_NODE) 1186 + node = hctx->numa_node = reg->numa_node; 1187 + 1188 + INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1189 + spin_lock_init(&hctx->lock); 1190 + INIT_LIST_HEAD(&hctx->dispatch); 1191 + hctx->queue = q; 1192 + hctx->queue_num = i; 1193 + hctx->flags = reg->flags; 1194 + hctx->queue_depth = reg->queue_depth; 1195 + hctx->cmd_size = reg->cmd_size; 1196 + 1197 + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1198 + blk_mq_hctx_notify, hctx); 1199 + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1200 + 1201 + if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1202 + break; 1203 + 1204 + /* 1205 + * Allocate space for all possible cpus to avoid allocation in 1206 + * runtime 1207 + */ 1208 + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1209 + GFP_KERNEL, node); 1210 + if (!hctx->ctxs) 1211 + break; 1212 + 1213 + num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1214 + hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), 1215 + GFP_KERNEL, node); 1216 + if (!hctx->ctx_map) 1217 + break; 1218 + 1219 + hctx->nr_ctx_map = num_maps; 1220 + hctx->nr_ctx = 0; 1221 + 1222 + if (reg->ops->init_hctx && 1223 + reg->ops->init_hctx(hctx, driver_data, 
i)) 1224 + break; 1225 + } 1226 + 1227 + if (i == q->nr_hw_queues) 1228 + return 0; 1229 + 1230 + /* 1231 + * Init failed 1232 + */ 1233 + queue_for_each_hw_ctx(q, hctx, j) { 1234 + if (i == j) 1235 + break; 1236 + 1237 + if (reg->ops->exit_hctx) 1238 + reg->ops->exit_hctx(hctx, j); 1239 + 1240 + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1241 + blk_mq_free_rq_map(hctx); 1242 + kfree(hctx->ctxs); 1243 + } 1244 + 1245 + return 1; 1246 + } 1247 + 1248 + static void blk_mq_init_cpu_queues(struct request_queue *q, 1249 + unsigned int nr_hw_queues) 1250 + { 1251 + unsigned int i; 1252 + 1253 + for_each_possible_cpu(i) { 1254 + struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1255 + struct blk_mq_hw_ctx *hctx; 1256 + 1257 + memset(__ctx, 0, sizeof(*__ctx)); 1258 + __ctx->cpu = i; 1259 + spin_lock_init(&__ctx->lock); 1260 + INIT_LIST_HEAD(&__ctx->rq_list); 1261 + __ctx->queue = q; 1262 + 1263 + /* If the cpu isn't online, the cpu is mapped to first hctx */ 1264 + hctx = q->mq_ops->map_queue(q, i); 1265 + hctx->nr_ctx++; 1266 + 1267 + if (!cpu_online(i)) 1268 + continue; 1269 + 1270 + /* 1271 + * Set local node, IFF we have more than one hw queue. If 1272 + * not, we remain on the home node of the device 1273 + */ 1274 + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1275 + hctx->numa_node = cpu_to_node(i); 1276 + } 1277 + } 1278 + 1279 + static void blk_mq_map_swqueue(struct request_queue *q) 1280 + { 1281 + unsigned int i; 1282 + struct blk_mq_hw_ctx *hctx; 1283 + struct blk_mq_ctx *ctx; 1284 + 1285 + queue_for_each_hw_ctx(q, hctx, i) { 1286 + hctx->nr_ctx = 0; 1287 + } 1288 + 1289 + /* 1290 + * Map software to hardware queues 1291 + */ 1292 + queue_for_each_ctx(q, ctx, i) { 1293 + /* If the cpu isn't online, the cpu is mapped to first hctx */ 1294 + hctx = q->mq_ops->map_queue(q, i); 1295 + ctx->index_hw = hctx->nr_ctx; 1296 + hctx->ctxs[hctx->nr_ctx++] = ctx; 1297 + } 1298 + } 1299 + 1300 + struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1301 + void *driver_data) 1302 + { 1303 + struct blk_mq_hw_ctx **hctxs; 1304 + struct blk_mq_ctx *ctx; 1305 + struct request_queue *q; 1306 + int i; 1307 + 1308 + if (!reg->nr_hw_queues || 1309 + !reg->ops->queue_rq || !reg->ops->map_queue || 1310 + !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1311 + return ERR_PTR(-EINVAL); 1312 + 1313 + if (!reg->queue_depth) 1314 + reg->queue_depth = BLK_MQ_MAX_DEPTH; 1315 + else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1316 + pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1317 + reg->queue_depth = BLK_MQ_MAX_DEPTH; 1318 + } 1319 + 1320 + if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1321 + return ERR_PTR(-EINVAL); 1322 + 1323 + ctx = alloc_percpu(struct blk_mq_ctx); 1324 + if (!ctx) 1325 + return ERR_PTR(-ENOMEM); 1326 + 1327 + hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1328 + reg->numa_node); 1329 + 1330 + if (!hctxs) 1331 + goto err_percpu; 1332 + 1333 + for (i = 0; i < reg->nr_hw_queues; i++) { 1334 + hctxs[i] = reg->ops->alloc_hctx(reg, i); 1335 + if (!hctxs[i]) 1336 + goto err_hctxs; 1337 + 1338 + hctxs[i]->numa_node = NUMA_NO_NODE; 1339 + hctxs[i]->queue_num = i; 1340 + } 1341 + 1342 + q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1343 + if (!q) 1344 + goto err_hctxs; 1345 + 1346 + q->mq_map = blk_mq_make_queue_map(reg); 1347 + if (!q->mq_map) 1348 + goto err_map; 1349 + 1350 + setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1351 + blk_queue_rq_timeout(q, 30000); 1352 + 1353 + q->nr_queues 
= nr_cpu_ids; 1354 + q->nr_hw_queues = reg->nr_hw_queues; 1355 + 1356 + q->queue_ctx = ctx; 1357 + q->queue_hw_ctx = hctxs; 1358 + 1359 + q->mq_ops = reg->ops; 1360 + 1361 + blk_queue_make_request(q, blk_mq_make_request); 1362 + blk_queue_rq_timed_out(q, reg->ops->timeout); 1363 + if (reg->timeout) 1364 + blk_queue_rq_timeout(q, reg->timeout); 1365 + 1366 + blk_mq_init_flush(q); 1367 + blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1368 + 1369 + if (blk_mq_init_hw_queues(q, reg, driver_data)) 1370 + goto err_hw; 1371 + 1372 + blk_mq_map_swqueue(q); 1373 + 1374 + mutex_lock(&all_q_mutex); 1375 + list_add_tail(&q->all_q_node, &all_q_list); 1376 + mutex_unlock(&all_q_mutex); 1377 + 1378 + return q; 1379 + err_hw: 1380 + kfree(q->mq_map); 1381 + err_map: 1382 + blk_cleanup_queue(q); 1383 + err_hctxs: 1384 + for (i = 0; i < reg->nr_hw_queues; i++) { 1385 + if (!hctxs[i]) 1386 + break; 1387 + reg->ops->free_hctx(hctxs[i], i); 1388 + } 1389 + kfree(hctxs); 1390 + err_percpu: 1391 + free_percpu(ctx); 1392 + return ERR_PTR(-ENOMEM); 1393 + } 1394 + EXPORT_SYMBOL(blk_mq_init_queue); 1395 + 1396 + void blk_mq_free_queue(struct request_queue *q) 1397 + { 1398 + struct blk_mq_hw_ctx *hctx; 1399 + int i; 1400 + 1401 + queue_for_each_hw_ctx(q, hctx, i) { 1402 + cancel_delayed_work_sync(&hctx->delayed_work); 1403 + kfree(hctx->ctx_map); 1404 + kfree(hctx->ctxs); 1405 + blk_mq_free_rq_map(hctx); 1406 + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1407 + if (q->mq_ops->exit_hctx) 1408 + q->mq_ops->exit_hctx(hctx, i); 1409 + q->mq_ops->free_hctx(hctx, i); 1410 + } 1411 + 1412 + free_percpu(q->queue_ctx); 1413 + kfree(q->queue_hw_ctx); 1414 + kfree(q->mq_map); 1415 + 1416 + q->queue_ctx = NULL; 1417 + q->queue_hw_ctx = NULL; 1418 + q->mq_map = NULL; 1419 + 1420 + mutex_lock(&all_q_mutex); 1421 + list_del_init(&q->all_q_node); 1422 + mutex_unlock(&all_q_mutex); 1423 + } 1424 + EXPORT_SYMBOL(blk_mq_free_queue); 1425 + 1426 + /* Basically redo blk_mq_init_queue with queue frozen */ 1427 + static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) 1428 + { 1429 + blk_mq_freeze_queue(q); 1430 + 1431 + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1432 + 1433 + /* 1434 + * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 1435 + * we should change hctx numa_node according to new topology (this 1436 + * involves free and re-allocate memory, worthy doing?) 1437 + */ 1438 + 1439 + blk_mq_map_swqueue(q); 1440 + 1441 + blk_mq_unfreeze_queue(q); 1442 + } 1443 + 1444 + static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, 1445 + unsigned long action, void *hcpu) 1446 + { 1447 + struct request_queue *q; 1448 + 1449 + /* 1450 + * Before new mapping is established, hotadded cpu might already start 1451 + * handling requests. This doesn't break anything as we map offline 1452 + * CPUs to first hardware queue. We will re-init queue below to get 1453 + * optimal settings. 
1454 + */ 1455 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1456 + action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1457 + return NOTIFY_OK; 1458 + 1459 + mutex_lock(&all_q_mutex); 1460 + list_for_each_entry(q, &all_q_list, all_q_node) 1461 + blk_mq_queue_reinit(q); 1462 + mutex_unlock(&all_q_mutex); 1463 + return NOTIFY_OK; 1464 + } 1465 + 1466 + static int __init blk_mq_init(void) 1467 + { 1468 + unsigned int i; 1469 + 1470 + for_each_possible_cpu(i) 1471 + init_llist_head(&per_cpu(ipi_lists, i)); 1472 + 1473 + blk_mq_cpu_init(); 1474 + 1475 + /* Must be called after percpu_counter_hotcpu_callback() */ 1476 + hotcpu_notifier(blk_mq_queue_reinit_notify, -10); 1477 + 1478 + return 0; 1479 + } 1480 + subsys_initcall(blk_mq_init);
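Note on the request preallocation above: blk_mq_init_rq_map() packs the preallocated requests, each sized to sizeof(struct request) plus the driver's cmd_size and rounded up to a cache line, into the largest page allocations it can get (order 4 and falling back downwards). A standalone sketch of that arithmetic follows; the request size, cmd_size, page size and cache line below are example assumptions, not values from a real build.

#include <stdio.h>
#include <stddef.h>

/* Mirrors round_up() and order_to_size() as used by blk_mq_init_rq_map(). */
static size_t round_up_to(size_t v, size_t align)
{
	return (v + align - 1) & ~(align - 1);
}

static size_t order_to_size(unsigned int order, size_t page_size)
{
	return page_size << order;
}

int main(void)
{
	size_t page_size = 4096, cache_line = 64;
	size_t sizeof_request = 376;	/* assumed; depends on kernel config */
	size_t cmd_size = 64;		/* driver payload from blk_mq_reg */
	unsigned int order = 4;		/* max_order used above */

	size_t rq_size = round_up_to(sizeof_request + cmd_size, cache_line);
	size_t chunk = order_to_size(order, page_size);

	printf("rq_size=%zu, %zu requests per order-%u allocation\n",
	       rq_size, chunk / rq_size, order);
	return 0;
}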
+52
block/blk-mq.h
··· 1 + #ifndef INT_BLK_MQ_H 2 + #define INT_BLK_MQ_H 3 + 4 + struct blk_mq_ctx { 5 + struct { 6 + spinlock_t lock; 7 + struct list_head rq_list; 8 + } ____cacheline_aligned_in_smp; 9 + 10 + unsigned int cpu; 11 + unsigned int index_hw; 12 + unsigned int ipi_redirect; 13 + 14 + /* incremented at dispatch time */ 15 + unsigned long rq_dispatched[2]; 16 + unsigned long rq_merged; 17 + 18 + /* incremented at completion time */ 19 + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; 20 + 21 + struct request_queue *queue; 22 + struct kobject kobj; 23 + }; 24 + 25 + void __blk_mq_end_io(struct request *rq, int error); 26 + void blk_mq_complete_request(struct request *rq, int error); 27 + void blk_mq_run_request(struct request *rq, bool run_queue, bool async); 28 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29 + void blk_mq_init_flush(struct request_queue *q); 30 + 31 + /* 32 + * CPU hotplug helpers 33 + */ 34 + struct blk_mq_cpu_notifier; 35 + void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 36 + void (*fn)(void *, unsigned long, unsigned int), 37 + void *data); 38 + void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 39 + void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 40 + void blk_mq_cpu_init(void); 41 + DECLARE_PER_CPU(struct llist_head, ipi_lists); 42 + 43 + /* 44 + * CPU -> queue mappings 45 + */ 46 + struct blk_mq_reg; 47 + extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); 48 + extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 49 + 50 + void blk_mq_add_timer(struct request *rq); 51 + 52 + #endif
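struct blk_mq_ctx keeps the submission-side fields (lock, rq_list, dispatch counters) and the completion-side counter on separate cache lines, and ipi_redirect decides whether completions are bounced back to the submitting CPU. The lookup helpers that hand out the current CPU's software queue live earlier in blk-mq.c and are not part of this excerpt; based on how blk_mq_init_cpu_queues() uses per_cpu_ptr() on q->queue_ctx, they presumably look roughly like the hedged sketch below.

/* Hedged sketch only; the real helpers are defined in blk-mq.c. */
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
						  unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());	/* paired with blk_mq_put_ctx() */
}

static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}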
+13
block/blk-sysfs.c
··· 7 7 #include <linux/bio.h> 8 8 #include <linux/blkdev.h> 9 9 #include <linux/blktrace_api.h> 10 + #include <linux/blk-mq.h> 10 11 11 12 #include "blk.h" 12 13 #include "blk-cgroup.h" ··· 543 542 if (q->queue_tags) 544 543 __blk_queue_free_tags(q); 545 544 545 + percpu_counter_destroy(&q->mq_usage_counter); 546 + 547 + if (q->mq_ops) 548 + blk_mq_free_queue(q); 549 + 546 550 blk_trace_shutdown(q); 547 551 548 552 bdi_destroy(&q->backing_dev_info); ··· 581 575 * bypass from queue allocation. 582 576 */ 583 577 blk_queue_bypass_end(q); 578 + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 584 579 585 580 ret = blk_trace_init_sysfs(dev); 586 581 if (ret) ··· 594 587 } 595 588 596 589 kobject_uevent(&q->kobj, KOBJ_ADD); 590 + 591 + if (q->mq_ops) 592 + blk_mq_register_disk(disk); 597 593 598 594 if (!q->request_fn) 599 595 return 0; ··· 619 609 620 610 if (WARN_ON(!q)) 621 611 return; 612 + 613 + if (q->mq_ops) 614 + blk_mq_unregister_disk(disk); 622 615 623 616 if (q->request_fn) 624 617 elv_unregister_queue(q);
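With the hooks above, an mq driver keeps the standard teardown pattern: del_gendisk() reaches blk_unregister_queue() and thus blk_mq_unregister_disk(), and once the last queue reference is dropped blk_release_queue() destroys mq_usage_counter and calls blk_mq_free_queue(). A minimal sketch, with a hypothetical mydrv structure:

#include <linux/genhd.h>
#include <linux/blkdev.h>

struct mydrv {				/* hypothetical driver state */
	struct gendisk *disk;
	struct request_queue *queue;
};

static void mydrv_remove(struct mydrv *dev)
{
	del_gendisk(dev->disk);		/* unregisters mq sysfs via blk_mq_unregister_disk() */
	blk_cleanup_queue(dev->queue);	/* last ref -> blk_release_queue() -> blk_mq_free_queue() */
	put_disk(dev->disk);
}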
+46 -27
block/blk-timeout.c
··· 7 7 #include <linux/fault-inject.h> 8 8 9 9 #include "blk.h" 10 + #include "blk-mq.h" 10 11 11 12 #ifdef CONFIG_FAIL_IO_TIMEOUT 12 13 ··· 89 88 ret = q->rq_timed_out_fn(req); 90 89 switch (ret) { 91 90 case BLK_EH_HANDLED: 92 - __blk_complete_request(req); 91 + /* Can we use req->errors here? */ 92 + if (q->mq_ops) 93 + blk_mq_complete_request(req, req->errors); 94 + else 95 + __blk_complete_request(req); 93 96 break; 94 97 case BLK_EH_RESET_TIMER: 95 98 blk_clear_rq_complete(req); 96 - blk_add_timer(req); 99 + if (q->mq_ops) 100 + blk_mq_add_timer(req); 101 + else 102 + blk_add_timer(req); 97 103 break; 98 104 case BLK_EH_NOT_HANDLED: 99 105 /* ··· 116 108 } 117 109 } 118 110 111 + void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 112 + unsigned int *next_set) 113 + { 114 + if (time_after_eq(jiffies, rq->deadline)) { 115 + list_del_init(&rq->timeout_list); 116 + 117 + /* 118 + * Check if we raced with end io completion 119 + */ 120 + if (!blk_mark_rq_complete(rq)) 121 + blk_rq_timed_out(rq); 122 + } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { 123 + *next_timeout = rq->deadline; 124 + *next_set = 1; 125 + } 126 + } 127 + 119 128 void blk_rq_timed_out_timer(unsigned long data) 120 129 { 121 130 struct request_queue *q = (struct request_queue *) data; ··· 142 117 143 118 spin_lock_irqsave(q->queue_lock, flags); 144 119 145 - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { 146 - if (time_after_eq(jiffies, rq->deadline)) { 147 - list_del_init(&rq->timeout_list); 148 - 149 - /* 150 - * Check if we raced with end io completion 151 - */ 152 - if (blk_mark_rq_complete(rq)) 153 - continue; 154 - blk_rq_timed_out(rq); 155 - } else if (!next_set || time_after(next, rq->deadline)) { 156 - next = rq->deadline; 157 - next_set = 1; 158 - } 159 - } 120 + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) 121 + blk_rq_check_expired(rq, &next, &next_set); 160 122 161 123 if (next_set) 162 124 mod_timer(&q->timeout, round_jiffies_up(next)); ··· 169 157 } 170 158 EXPORT_SYMBOL_GPL(blk_abort_request); 171 159 172 - /** 173 - * blk_add_timer - Start timeout timer for a single request 174 - * @req: request that is about to start running. 175 - * 176 - * Notes: 177 - * Each request has its own timer, and as it is added to the queue, we 178 - * set up the timer. When the request completes, we cancel the timer. 179 - */ 180 - void blk_add_timer(struct request *req) 160 + void __blk_add_timer(struct request *req, struct list_head *timeout_list) 181 161 { 182 162 struct request_queue *q = req->q; 183 163 unsigned long expiry; ··· 188 184 req->timeout = q->rq_timeout; 189 185 190 186 req->deadline = jiffies + req->timeout; 191 - list_add_tail(&req->timeout_list, &q->timeout_list); 187 + if (timeout_list) 188 + list_add_tail(&req->timeout_list, timeout_list); 192 189 193 190 /* 194 191 * If the timer isn't already pending or this timeout is earlier ··· 201 196 if (!timer_pending(&q->timeout) || 202 197 time_before(expiry, q->timeout.expires)) 203 198 mod_timer(&q->timeout, expiry); 199 + 200 + } 201 + 202 + /** 203 + * blk_add_timer - Start timeout timer for a single request 204 + * @req: request that is about to start running. 205 + * 206 + * Notes: 207 + * Each request has its own timer, and as it is added to the queue, we 208 + * set up the timer. When the request completes, we cancel the timer. 209 + */ 210 + void blk_add_timer(struct request *req) 211 + { 212 + __blk_add_timer(req, &req->q->timeout_list); 204 213 } 205 214
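The timeout handler a driver supplies through blk_mq_reg->ops->timeout (installed by blk_queue_rq_timed_out() in blk_mq_init_queue()) plugs into the switch above: BLK_EH_HANDLED lets the core complete via blk_mq_complete_request(), BLK_EH_RESET_TIMER re-arms via blk_mq_add_timer(). A hedged sketch, where the abort primitive is hypothetical:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* mydrv_try_abort() is a hypothetical per-device abort primitive. */
extern int mydrv_try_abort(struct request *rq);

static enum blk_eh_timer_return mydrv_timeout(struct request *rq)
{
	if (mydrv_try_abort(rq) == 0) {
		rq->errors = -ETIMEDOUT;
		return BLK_EH_HANDLED;	/* core completes via blk_mq_complete_request() */
	}

	return BLK_EH_RESET_TIMER;	/* core re-arms via blk_mq_add_timer() */
}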
+17
block/blk.h
··· 10 10 #define BLK_BATCH_REQ 32 11 11 12 12 extern struct kmem_cache *blk_requestq_cachep; 13 + extern struct kmem_cache *request_cachep; 13 14 extern struct kobj_type blk_queue_ktype; 14 15 extern struct ida blk_queue_ida; 15 16 ··· 35 34 unsigned int nr_bytes, unsigned int bidi_bytes); 36 35 37 36 void blk_rq_timed_out_timer(unsigned long data); 37 + void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 38 + unsigned int *next_set); 39 + void __blk_add_timer(struct request *req, struct list_head *timeout_list); 38 40 void blk_delete_timer(struct request *); 39 41 void blk_add_timer(struct request *); 42 + 43 + 44 + bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 45 + struct bio *bio); 46 + bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 47 + struct bio *bio); 48 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 49 + unsigned int *request_count); 50 + 51 + void blk_account_io_start(struct request *req, bool new_io); 52 + void blk_account_io_completion(struct request *req, unsigned int bytes); 53 + void blk_account_io_done(struct request *req); 40 54 41 55 /* 42 56 * Internal atomic flags for request handling 43 57 */ 44 58 enum rq_atomic_flags { 45 59 REQ_ATOM_COMPLETE = 0, 60 + REQ_ATOM_STARTED, 46 61 }; 47 62 48 63 /*
+2
include/linux/bio.h
··· 419 419 bl->head = bl->tail = NULL; 420 420 } 421 421 422 + #define BIO_EMPTY_LIST { NULL, NULL } 423 + 422 424 #define bio_list_for_each(bio, bl) \ 423 425 for (bio = (bl)->head; bio; bio = bio->bi_next) 424 426
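BIO_EMPTY_LIST matches what bio_list_init() sets up, so a bio_list can now be initialized at definition time instead of at runtime; for example (the variable name is illustrative):

#include <linux/bio.h>

static struct bio_list mydrv_deferred = BIO_EMPTY_LIST;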
+182
include/linux/blk-mq.h
··· 1 + #ifndef BLK_MQ_H 2 + #define BLK_MQ_H 3 + 4 + #include <linux/blkdev.h> 5 + 6 + struct blk_mq_tags; 7 + 8 + struct blk_mq_cpu_notifier { 9 + struct list_head list; 10 + void *data; 11 + void (*notify)(void *data, unsigned long action, unsigned int cpu); 12 + }; 13 + 14 + struct blk_mq_hw_ctx { 15 + struct { 16 + spinlock_t lock; 17 + struct list_head dispatch; 18 + } ____cacheline_aligned_in_smp; 19 + 20 + unsigned long state; /* BLK_MQ_S_* flags */ 21 + struct delayed_work delayed_work; 22 + 23 + unsigned long flags; /* BLK_MQ_F_* flags */ 24 + 25 + struct request_queue *queue; 26 + unsigned int queue_num; 27 + 28 + void *driver_data; 29 + 30 + unsigned int nr_ctx; 31 + struct blk_mq_ctx **ctxs; 32 + unsigned int nr_ctx_map; 33 + unsigned long *ctx_map; 34 + 35 + struct request **rqs; 36 + struct list_head page_list; 37 + struct blk_mq_tags *tags; 38 + 39 + unsigned long queued; 40 + unsigned long run; 41 + #define BLK_MQ_MAX_DISPATCH_ORDER 10 42 + unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 43 + 44 + unsigned int queue_depth; 45 + unsigned int numa_node; 46 + unsigned int cmd_size; /* per-request extra data */ 47 + 48 + struct blk_mq_cpu_notifier cpu_notifier; 49 + struct kobject kobj; 50 + }; 51 + 52 + struct blk_mq_reg { 53 + struct blk_mq_ops *ops; 54 + unsigned int nr_hw_queues; 55 + unsigned int queue_depth; 56 + unsigned int reserved_tags; 57 + unsigned int cmd_size; /* per-request extra data */ 58 + int numa_node; 59 + unsigned int timeout; 60 + unsigned int flags; /* BLK_MQ_F_* */ 61 + }; 62 + 63 + typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); 64 + typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); 65 + typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int); 66 + typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 67 + typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 68 + typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 69 + 70 + struct blk_mq_ops { 71 + /* 72 + * Queue request 73 + */ 74 + queue_rq_fn *queue_rq; 75 + 76 + /* 77 + * Map to specific hardware queue 78 + */ 79 + map_queue_fn *map_queue; 80 + 81 + /* 82 + * Called on request timeout 83 + */ 84 + rq_timed_out_fn *timeout; 85 + 86 + /* 87 + * Override for hctx allocations (should probably go) 88 + */ 89 + alloc_hctx_fn *alloc_hctx; 90 + free_hctx_fn *free_hctx; 91 + 92 + /* 93 + * Called when the block layer side of a hardware queue has been 94 + * set up, allowing the driver to allocate/init matching structures. 95 + * Ditto for exit/teardown. 
96 + */ 97 + init_hctx_fn *init_hctx; 98 + exit_hctx_fn *exit_hctx; 99 + }; 100 + 101 + enum { 102 + BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ 103 + BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ 104 + BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ 105 + 106 + BLK_MQ_F_SHOULD_MERGE = 1 << 0, 107 + BLK_MQ_F_SHOULD_SORT = 1 << 1, 108 + BLK_MQ_F_SHOULD_IPI = 1 << 2, 109 + 110 + BLK_MQ_S_STOPPED = 1 << 0, 111 + 112 + BLK_MQ_MAX_DEPTH = 2048, 113 + }; 114 + 115 + struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); 116 + void blk_mq_free_queue(struct request_queue *); 117 + int blk_mq_register_disk(struct gendisk *); 118 + void blk_mq_unregister_disk(struct gendisk *); 119 + void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 120 + 121 + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 122 + 123 + void blk_mq_insert_request(struct request_queue *, struct request *, bool); 124 + void blk_mq_run_queues(struct request_queue *q, bool async); 125 + void blk_mq_free_request(struct request *rq); 126 + bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 127 + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); 128 + struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); 129 + struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); 130 + 131 + struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 132 + struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); 133 + void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); 134 + 135 + void blk_mq_end_io(struct request *rq, int error); 136 + 137 + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 138 + void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 139 + void blk_mq_start_stopped_hw_queues(struct request_queue *q); 140 + 141 + /* 142 + * Driver command data is immediately after the request. So subtract request 143 + * size to get back to the original request. 144 + */ 145 + static inline struct request *blk_mq_rq_from_pdu(void *pdu) 146 + { 147 + return pdu - sizeof(struct request); 148 + } 149 + static inline void *blk_mq_rq_to_pdu(struct request *rq) 150 + { 151 + return (void *) rq + sizeof(*rq); 152 + } 153 + 154 + static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, 155 + unsigned int tag) 156 + { 157 + return hctx->rqs[tag]; 158 + } 159 + 160 + #define queue_for_each_hw_ctx(q, hctx, i) \ 161 + for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \ 162 + (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i]) 163 + 164 + #define queue_for_each_ctx(q, ctx, i) \ 165 + for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \ 166 + (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i))) 167 + 168 + #define hctx_for_each_ctx(hctx, ctx, i) \ 169 + for ((i) = 0, ctx = (hctx)->ctxs[0]; \ 170 + (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)]) 171 + 172 + #define blk_ctx_sum(q, sum) \ 173 + ({ \ 174 + struct blk_mq_ctx *__x; \ 175 + unsigned int __ret = 0, __i; \ 176 + \ 177 + queue_for_each_ctx((q), __x, __i) \ 178 + __ret += sum; \ 179 + __ret; \ 180 + }) 181 + 182 + #endif
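Putting the exported pieces together, a minimal consumer fills in blk_mq_ops and blk_mq_reg, calls blk_mq_init_queue(), and gets its per-request payload via blk_mq_rq_to_pdu(). The sketch below uses the default blk_mq_map_queue()/blk_mq_alloc_single_hw_queue()/blk_mq_free_single_hw_queue() helpers; everything prefixed mydrv_ is hypothetical driver code, not part of this patch.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

struct mydrv {				/* hypothetical driver state */
	struct request_queue *queue;
};

struct mydrv_cmd {			/* per-request payload, see cmd_size */
	u32 hw_tag;
};

extern bool mydrv_submit(struct mydrv *dev, struct request *rq,
			 struct mydrv_cmd *cmd);

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	if (!mydrv_submit(hctx->driver_data, rq, cmd))
		return BLK_MQ_RQ_QUEUE_BUSY;	/* device full, retried later */

	return BLK_MQ_RQ_QUEUE_OK;
}

static int mydrv_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			   unsigned int index)
{
	hctx->driver_data = data;	/* pointer handed to blk_mq_init_queue() */
	return 0;
}

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,		/* default per-cpu mapping */
	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
	.free_hctx	= blk_mq_free_single_hw_queue,
	.init_hctx	= mydrv_init_hctx,
};

static struct blk_mq_reg mydrv_mq_reg = {
	.ops		= &mydrv_mq_ops,
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.cmd_size	= sizeof(struct mydrv_cmd),
	.numa_node	= NUMA_NO_NODE,
	.flags		= BLK_MQ_F_SHOULD_MERGE,
};

static int mydrv_init_queue(struct mydrv *dev)
{
	dev->queue = blk_mq_init_queue(&mydrv_mq_reg, dev);
	return IS_ERR(dev->queue) ? PTR_ERR(dev->queue) : 0;
}

/* From the device's completion path, e.g. an IRQ handler: */
static void mydrv_complete(struct request *rq, int error)
{
	blk_mq_end_io(rq, error);	/* inline or IPI-redirected completion */
}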
+2
include/linux/blk_types.h
··· 178 178 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 179 179 __REQ_KERNEL, /* direct IO to kernel pages */ 180 180 __REQ_PM, /* runtime pm request */ 181 + __REQ_END, /* last of chain of requests */ 181 182 __REQ_NR_BITS, /* stops here */ 182 183 }; 183 184 ··· 230 229 #define REQ_SECURE (1ULL << __REQ_SECURE) 231 230 #define REQ_KERNEL (1ULL << __REQ_KERNEL) 232 231 #define REQ_PM (1ULL << __REQ_PM) 232 + #define REQ_END (1ULL << __REQ_END) 233 233 234 234 #endif /* __LINUX_BLK_TYPES_H */
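REQ_END is set by the dispatch loop in __blk_mq_run_hw_queue() on the last request of a run, so a driver can batch its hardware notification; as the comment there notes, a busy device may leave the flag unseen, so it is a hint rather than the only kick condition. A hedged sketch with hypothetical mydrv_ hooks:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* mydrv_enqueue()/mydrv_ring_doorbell() are hypothetical device hooks. */
extern void mydrv_enqueue(struct blk_mq_hw_ctx *hctx, struct request *rq);
extern void mydrv_ring_doorbell(struct blk_mq_hw_ctx *hctx);

static int mydrv_queue_rq_batched(struct blk_mq_hw_ctx *hctx,
				  struct request *rq)
{
	mydrv_enqueue(hctx, rq);

	/* Last request of this dispatch run: flush the batch to hardware. */
	if (rq->cmd_flags & REQ_END)
		mydrv_ring_doorbell(hctx);

	return BLK_MQ_RQ_QUEUE_OK;
}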
+47 -7
include/linux/blkdev.h
··· 8 8 #include <linux/major.h> 9 9 #include <linux/genhd.h> 10 10 #include <linux/list.h> 11 + #include <linux/llist.h> 11 12 #include <linux/timer.h> 12 13 #include <linux/workqueue.h> 13 14 #include <linux/pagemap.h> ··· 95 94 * as well! 96 95 */ 97 96 struct request { 98 - struct list_head queuelist; 99 - struct call_single_data csd; 97 + union { 98 + struct list_head queuelist; 99 + struct llist_node ll_list; 100 + }; 101 + union { 102 + struct call_single_data csd; 103 + struct work_struct mq_flush_data; 104 + }; 100 105 101 106 struct request_queue *q; 107 + struct blk_mq_ctx *mq_ctx; 102 108 103 109 u64 cmd_flags; 104 110 enum rq_cmd_type_bits cmd_type; ··· 221 213 222 214 #include <linux/elevator.h> 223 215 216 + struct blk_queue_ctx; 217 + 224 218 typedef void (request_fn_proc) (struct request_queue *q); 225 219 typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); 226 220 typedef int (prep_rq_fn) (struct request_queue *, struct request *); ··· 321 311 dma_drain_needed_fn *dma_drain_needed; 322 312 lld_busy_fn *lld_busy_fn; 323 313 314 + struct blk_mq_ops *mq_ops; 315 + 316 + unsigned int *mq_map; 317 + 318 + /* sw queues */ 319 + struct blk_mq_ctx *queue_ctx; 320 + unsigned int nr_queues; 321 + 322 + /* hw dispatch queues */ 323 + struct blk_mq_hw_ctx **queue_hw_ctx; 324 + unsigned int nr_hw_queues; 325 + 324 326 /* 325 327 * Dispatch queue sorting 326 328 */ ··· 380 358 * queue kobject 381 359 */ 382 360 struct kobject kobj; 361 + 362 + /* 363 + * mq queue kobject 364 + */ 365 + struct kobject mq_kobj; 383 366 384 367 #ifdef CONFIG_PM_RUNTIME 385 368 struct device *dev; ··· 450 423 unsigned long flush_pending_since; 451 424 struct list_head flush_queue[2]; 452 425 struct list_head flush_data_in_flight; 453 - struct request flush_rq; 426 + union { 427 + struct request flush_rq; 428 + struct { 429 + spinlock_t mq_flush_lock; 430 + struct work_struct mq_flush_work; 431 + }; 432 + }; 454 433 455 434 struct mutex sysfs_lock; 456 435 ··· 468 435 struct bsg_class_device bsg_dev; 469 436 #endif 470 437 471 - #ifdef CONFIG_BLK_CGROUP 472 - struct list_head all_q_node; 473 - #endif 474 438 #ifdef CONFIG_BLK_DEV_THROTTLING 475 439 /* Throttle data */ 476 440 struct throtl_data *td; 477 441 #endif 478 442 struct rcu_head rcu_head; 443 + wait_queue_head_t mq_freeze_wq; 444 + struct percpu_counter mq_usage_counter; 445 + struct list_head all_q_node; 479 446 }; 480 447 481 448 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ ··· 498 465 #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 499 466 #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 500 467 #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 468 + #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ 501 469 502 470 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 503 471 (1 << QUEUE_FLAG_STACKABLE) | \ ··· 571 537 #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) 572 538 #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 573 539 #define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) 540 + #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) 574 541 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 575 542 #define blk_queue_noxmerges(q) \ 576 543 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) ··· 1046 1011 struct blk_plug { 1047 1012 unsigned long magic; /* detect uninitialized use-cases */ 1048 1013 struct list_head list; /* requests 
*/ 1014 + struct list_head mq_list; /* blk-mq requests */ 1049 1015 struct list_head cb_list; /* md requires an unplug callback */ 1050 1016 }; 1051 1017 #define BLK_MAX_REQUEST_COUNT 16 ··· 1084 1048 { 1085 1049 struct blk_plug *plug = tsk->plug; 1086 1050 1087 - return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list)); 1051 + return plug && 1052 + (!list_empty(&plug->list) || 1053 + !list_empty(&plug->mq_list) || 1054 + !list_empty(&plug->cb_list)); 1088 1055 } 1089 1056 1090 1057 /* ··· 1362 1323 1363 1324 struct work_struct; 1364 1325 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1326 + int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); 1365 1327 1366 1328 #ifdef CONFIG_BLK_CGROUP 1367 1329 /*
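The new queue_ctx/nr_queues and queue_hw_ctx/nr_hw_queues fields in struct request_queue are what the iteration macros in blk-mq.h walk, and plugged mq requests now sit on plug->mq_list until blk_mq_flush_plug_list() drains them. As an example of consuming the per-cpu software queues, block-layer code that also includes the internal block/blk-mq.h (where struct blk_mq_ctx is defined) can total a counter with blk_ctx_sum(); the helper name below is hypothetical, and __x is the per-ctx iterator the macro declares.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"		/* struct blk_mq_ctx, block-layer internal */

static unsigned int blk_mq_total_merged(struct request_queue *q)
{
	return blk_ctx_sum(q, __x->rq_merged);
}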