Merge branch 'blk-mq/core' into for-3.13/core

+3 -2

block/Makefile

··· 5 5 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 6 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 7 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 8 - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ 9 - partition-generic.o partitions/ 8 + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ 9 + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ 10 + genhd.o scsi_ioctl.o partition-generic.o partitions/ 10 11 11 12 obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 12 13 obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o

+93 -64

block/blk-core.c

··· 16 16 #include <linux/backing-dev.h> 17 17 #include <linux/bio.h> 18 18 #include <linux/blkdev.h> 19 + #include <linux/blk-mq.h> 19 20 #include <linux/highmem.h> 20 21 #include <linux/mm.h> 21 22 #include <linux/kernel_stat.h> ··· 49 48 /* 50 49 * For the allocated request tables 51 50 */ 52 - static struct kmem_cache *request_cachep; 51 + struct kmem_cache *request_cachep = NULL; 53 52 54 53 /* 55 54 * For queue allocation ··· 60 59 * Controlling structure to kblockd 61 60 */ 62 61 static struct workqueue_struct *kblockd_workqueue; 63 - 64 - static void drive_stat_acct(struct request *rq, int new_io) 65 - { 66 - struct hd_struct *part; 67 - int rw = rq_data_dir(rq); 68 - int cpu; 69 - 70 - if (!blk_do_io_stat(rq)) 71 - return; 72 - 73 - cpu = part_stat_lock(); 74 - 75 - if (!new_io) { 76 - part = rq->part; 77 - part_stat_inc(cpu, part, merges[rw]); 78 - } else { 79 - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 80 - if (!hd_struct_try_get(part)) { 81 - /* 82 - * The partition is already being removed, 83 - * the request will be accounted on the disk only 84 - * 85 - * We take a reference on disk->part0 although that 86 - * partition will never be deleted, so we can treat 87 - * it as any other partition. 88 - */ 89 - part = &rq->rq_disk->part0; 90 - hd_struct_get(part); 91 - } 92 - part_round_stats(cpu, part); 93 - part_inc_in_flight(part, rw); 94 - rq->part = part; 95 - } 96 - 97 - part_stat_unlock(); 98 - } 99 62 100 63 void blk_queue_congestion_threshold(struct request_queue *q) 101 64 { ··· 110 145 rq->cmd = rq->__cmd; 111 146 rq->cmd_len = BLK_MAX_CDB; 112 147 rq->tag = -1; 113 - rq->ref_count = 1; 114 148 rq->start_time = jiffies; 115 149 set_start_time_ns(rq); 116 150 rq->part = NULL; ··· 138 174 { 139 175 int bit; 140 176 141 - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 177 + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, 142 178 rq->rq_disk ? 
rq->rq_disk->disk_name : "?", rq->cmd_type, 143 - rq->cmd_flags); 179 + (unsigned long long) rq->cmd_flags); 144 180 145 181 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 146 182 (unsigned long long)blk_rq_pos(rq), ··· 559 595 if (!q) 560 596 return NULL; 561 597 598 + if (percpu_counter_init(&q->mq_usage_counter, 0)) 599 + goto fail_q; 600 + 562 601 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 563 602 if (q->id < 0) 564 - goto fail_q; 603 + goto fail_c; 565 604 566 605 q->backing_dev_info.ra_pages = 567 606 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; ··· 611 644 q->bypass_depth = 1; 612 645 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); 613 646 647 + init_waitqueue_head(&q->mq_freeze_wq); 648 + 614 649 if (blkcg_init_queue(q)) 615 650 goto fail_bdi; 616 651 ··· 622 653 bdi_destroy(&q->backing_dev_info); 623 654 fail_id: 624 655 ida_simple_remove(&blk_queue_ida, q->id); 656 + fail_c: 657 + percpu_counter_destroy(&q->mq_usage_counter); 625 658 fail_q: 626 659 kmem_cache_free(blk_requestq_cachep, q); 627 660 return NULL; ··· 1090 1119 goto retry; 1091 1120 } 1092 1121 1093 - struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1122 + static struct request *blk_old_get_request(struct request_queue *q, int rw, 1123 + gfp_t gfp_mask) 1094 1124 { 1095 1125 struct request *rq; 1096 1126 ··· 1107 1135 /* q->queue_lock is unlocked at this point */ 1108 1136 1109 1137 return rq; 1138 + } 1139 + 1140 + struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1141 + { 1142 + if (q->mq_ops) 1143 + return blk_mq_alloc_request(q, rw, gfp_mask, false); 1144 + else 1145 + return blk_old_get_request(q, rw, gfp_mask); 1110 1146 } 1111 1147 EXPORT_SYMBOL(blk_get_request); 1112 1148 ··· 1201 1221 static void add_acct_request(struct request_queue *q, struct request *rq, 1202 1222 int where) 1203 1223 { 1204 - drive_stat_acct(rq, 1); 1224 + blk_account_io_start(rq, true); 1205 1225 __elv_add_request(q, rq, where); 1206 
1226 } 1207 1227 ··· 1262 1282 { 1263 1283 if (unlikely(!q)) 1264 1284 return; 1265 - if (unlikely(--req->ref_count)) 1266 - return; 1267 1285 1268 1286 blk_pm_put_request(req); 1269 1287 ··· 1290 1312 1291 1313 void blk_put_request(struct request *req) 1292 1314 { 1293 - unsigned long flags; 1294 1315 struct request_queue *q = req->q; 1295 1316 1296 - spin_lock_irqsave(q->queue_lock, flags); 1297 - __blk_put_request(q, req); 1298 - spin_unlock_irqrestore(q->queue_lock, flags); 1317 + if (q->mq_ops) 1318 + blk_mq_free_request(req); 1319 + else { 1320 + unsigned long flags; 1321 + 1322 + spin_lock_irqsave(q->queue_lock, flags); 1323 + __blk_put_request(q, req); 1324 + spin_unlock_irqrestore(q->queue_lock, flags); 1325 + } 1299 1326 } 1300 1327 EXPORT_SYMBOL(blk_put_request); 1301 1328 ··· 1336 1353 } 1337 1354 EXPORT_SYMBOL_GPL(blk_add_request_payload); 1338 1355 1339 - static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1340 - struct bio *bio) 1356 + bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1357 + struct bio *bio) 1341 1358 { 1342 1359 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1343 1360 ··· 1354 1371 req->__data_len += bio->bi_size; 1355 1372 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1356 1373 1357 - drive_stat_acct(req, 0); 1374 + blk_account_io_start(req, false); 1358 1375 return true; 1359 1376 } 1360 1377 1361 - static bool bio_attempt_front_merge(struct request_queue *q, 1362 - struct request *req, struct bio *bio) 1378 + bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 1379 + struct bio *bio) 1363 1380 { 1364 1381 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1365 1382 ··· 1384 1401 req->__data_len += bio->bi_size; 1385 1402 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1386 1403 1387 - drive_stat_acct(req, 0); 1404 + blk_account_io_start(req, false); 1388 1405 return true; 1389 1406 } 1390 1407 1391 1408 /** 1392 - * attempt_plug_merge - try to 
merge with %current's plugged list 1409 + * blk_attempt_plug_merge - try to merge with %current's plugged list 1393 1410 * @q: request_queue new bio is being queued at 1394 1411 * @bio: new bio being queued 1395 1412 * @request_count: out parameter for number of traversed plugged requests ··· 1405 1422 * reliable access to the elevator outside queue lock. Only check basic 1406 1423 * merging parameters without querying the elevator. 1407 1424 */ 1408 - static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1409 - unsigned int *request_count) 1425 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1426 + unsigned int *request_count) 1410 1427 { 1411 1428 struct blk_plug *plug; 1412 1429 struct request *rq; 1413 1430 bool ret = false; 1431 + struct list_head *plug_list; 1414 1432 1415 1433 if (blk_queue_nomerges(q)) 1416 1434 goto out; ··· 1421 1437 goto out; 1422 1438 *request_count = 0; 1423 1439 1424 - list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1440 + if (q->mq_ops) 1441 + plug_list = &plug->mq_list; 1442 + else 1443 + plug_list = &plug->list; 1444 + 1445 + list_for_each_entry_reverse(rq, plug_list, queuelist) { 1425 1446 int el_ret; 1426 1447 1427 1448 if (rq->q == q) ··· 1494 1505 * Check if we can merge with the plugged list before grabbing 1495 1506 * any locks. 
1496 1507 */ 1497 - if (attempt_plug_merge(q, bio, &request_count)) 1508 + if (blk_attempt_plug_merge(q, bio, &request_count)) 1498 1509 return; 1499 1510 1500 1511 spin_lock_irq(q->queue_lock); ··· 1562 1573 } 1563 1574 } 1564 1575 list_add_tail(&req->queuelist, &plug->list); 1565 - drive_stat_acct(req, 1); 1576 + blk_account_io_start(req, true); 1566 1577 } else { 1567 1578 spin_lock_irq(q->queue_lock); 1568 1579 add_acct_request(q, req, where); ··· 2016 2027 } 2017 2028 EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 2018 2029 2019 - static void blk_account_io_completion(struct request *req, unsigned int bytes) 2030 + void blk_account_io_completion(struct request *req, unsigned int bytes) 2020 2031 { 2021 2032 if (blk_do_io_stat(req)) { 2022 2033 const int rw = rq_data_dir(req); ··· 2030 2041 } 2031 2042 } 2032 2043 2033 - static void blk_account_io_done(struct request *req) 2044 + void blk_account_io_done(struct request *req) 2034 2045 { 2035 2046 /* 2036 2047 * Account IO completion. flush_rq isn't accounted as a ··· 2077 2088 return rq; 2078 2089 } 2079 2090 #endif 2091 + 2092 + void blk_account_io_start(struct request *rq, bool new_io) 2093 + { 2094 + struct hd_struct *part; 2095 + int rw = rq_data_dir(rq); 2096 + int cpu; 2097 + 2098 + if (!blk_do_io_stat(rq)) 2099 + return; 2100 + 2101 + cpu = part_stat_lock(); 2102 + 2103 + if (!new_io) { 2104 + part = rq->part; 2105 + part_stat_inc(cpu, part, merges[rw]); 2106 + } else { 2107 + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 2108 + if (!hd_struct_try_get(part)) { 2109 + /* 2110 + * The partition is already being removed, 2111 + * the request will be accounted on the disk only 2112 + * 2113 + * We take a reference on disk->part0 although that 2114 + * partition will never be deleted, so we can treat 2115 + * it as any other partition. 
2116 + */ 2117 + part = &rq->rq_disk->part0; 2118 + hd_struct_get(part); 2119 + } 2120 + part_round_stats(cpu, part); 2121 + part_inc_in_flight(part, rw); 2122 + rq->part = part; 2123 + } 2124 + 2125 + part_stat_unlock(); 2126 + } 2080 2127 2081 2128 /** 2082 2129 * blk_peek_request - peek at the top of a request queue ··· 2489 2464 2490 2465 if (req->cmd_flags & REQ_DONTPREP) 2491 2466 blk_unprep_request(req); 2492 - 2493 2467 2494 2468 blk_account_io_done(req); 2495 2469 ··· 2911 2887 2912 2888 plug->magic = PLUG_MAGIC; 2913 2889 INIT_LIST_HEAD(&plug->list); 2890 + INIT_LIST_HEAD(&plug->mq_list); 2914 2891 INIT_LIST_HEAD(&plug->cb_list); 2915 2892 2916 2893 /* ··· 3009 2984 BUG_ON(plug->magic != PLUG_MAGIC); 3010 2985 3011 2986 flush_plug_callbacks(plug, from_schedule); 2987 + 2988 + if (!list_empty(&plug->mq_list)) 2989 + blk_mq_flush_plug_list(plug, from_schedule); 2990 + 3012 2991 if (list_empty(&plug->list)) 3013 2992 return; 3014 2993

+7 -7

block/blk-exec.c

··· 5 5 #include <linux/module.h> 6 6 #include <linux/bio.h> 7 7 #include <linux/blkdev.h> 8 + #include <linux/blk-mq.h> 8 9 #include <linux/sched/sysctl.h> 9 10 10 11 #include "blk.h" ··· 25 24 struct completion *waiting = rq->end_io_data; 26 25 27 26 rq->end_io_data = NULL; 28 - __blk_put_request(rq->q, rq); 29 27 30 28 /* 31 29 * complete last, if this is a stack request the process (and thus ··· 59 59 60 60 rq->rq_disk = bd_disk; 61 61 rq->end_io = done; 62 + 63 + if (q->mq_ops) { 64 + blk_mq_insert_request(q, rq, true); 65 + return; 66 + } 67 + 62 68 /* 63 69 * need to check this before __blk_run_queue(), because rq can 64 70 * be freed before that returns. ··· 108 102 char sense[SCSI_SENSE_BUFFERSIZE]; 109 103 int err = 0; 110 104 unsigned long hang_check; 111 - 112 - /* 113 - * we need an extra reference to the request, so we can look at 114 - * it after io completion 115 - */ 116 - rq->ref_count++; 117 105 118 106 if (!rq->sense) { 119 107 memset(sense, 0, sizeof(sense));

+139 -15

block/blk-flush.c

··· 69 69 #include <linux/bio.h> 70 70 #include <linux/blkdev.h> 71 71 #include <linux/gfp.h> 72 + #include <linux/blk-mq.h> 72 73 73 74 #include "blk.h" 75 + #include "blk-mq.h" 74 76 75 77 /* FLUSH/FUA sequences */ 76 78 enum { ··· 126 124 /* make @rq a normal request */ 127 125 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 128 126 rq->end_io = rq->flush.saved_end_io; 127 + 128 + blk_clear_rq_complete(rq); 129 + } 130 + 131 + static void mq_flush_data_run(struct work_struct *work) 132 + { 133 + struct request *rq; 134 + 135 + rq = container_of(work, struct request, mq_flush_data); 136 + 137 + memset(&rq->csd, 0, sizeof(rq->csd)); 138 + blk_mq_run_request(rq, true, false); 139 + } 140 + 141 + static void blk_mq_flush_data_insert(struct request *rq) 142 + { 143 + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); 144 + kblockd_schedule_work(rq->q, &rq->mq_flush_data); 129 145 } 130 146 131 147 /** ··· 156 136 * completion and trigger the next step. 157 137 * 158 138 * CONTEXT: 159 - * spin_lock_irq(q->queue_lock) 139 + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) 160 140 * 161 141 * RETURNS: 162 142 * %true if requests were added to the dispatch queue, %false otherwise. 
··· 166 146 { 167 147 struct request_queue *q = rq->q; 168 148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; 169 - bool queued = false; 149 + bool queued = false, kicked; 170 150 171 151 BUG_ON(rq->flush.seq & seq); 172 152 rq->flush.seq |= seq; ··· 187 167 188 168 case REQ_FSEQ_DATA: 189 169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 190 - list_add(&rq->queuelist, &q->queue_head); 191 - queued = true; 170 + if (q->mq_ops) 171 + blk_mq_flush_data_insert(rq); 172 + else { 173 + list_add(&rq->queuelist, &q->queue_head); 174 + queued = true; 175 + } 192 176 break; 193 177 194 178 case REQ_FSEQ_DONE: ··· 205 181 BUG_ON(!list_empty(&rq->queuelist)); 206 182 list_del_init(&rq->flush.list); 207 183 blk_flush_restore_request(rq); 208 - __blk_end_request_all(rq, error); 184 + if (q->mq_ops) 185 + blk_mq_end_io(rq, error); 186 + else 187 + __blk_end_request_all(rq, error); 209 188 break; 210 189 211 190 default: 212 191 BUG(); 213 192 } 214 193 215 - return blk_kick_flush(q) | queued; 194 + kicked = blk_kick_flush(q); 195 + /* blk_mq_run_flush will run queue */ 196 + if (q->mq_ops) 197 + return queued; 198 + return kicked | queued; 216 199 } 217 200 218 201 static void flush_end_io(struct request *flush_rq, int error) 219 202 { 220 203 struct request_queue *q = flush_rq->q; 221 - struct list_head *running = &q->flush_queue[q->flush_running_idx]; 204 + struct list_head *running; 222 205 bool queued = false; 223 206 struct request *rq, *n; 207 + unsigned long flags = 0; 224 208 209 + if (q->mq_ops) { 210 + blk_mq_free_request(flush_rq); 211 + spin_lock_irqsave(&q->mq_flush_lock, flags); 212 + } 213 + running = &q->flush_queue[q->flush_running_idx]; 225 214 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 226 215 227 216 /* account completion of the flush request */ 228 217 q->flush_running_idx ^= 1; 229 - elv_completed_request(q, flush_rq); 218 + 219 + if (!q->mq_ops) 220 + elv_completed_request(q, flush_rq); 230 221 231 222 /* and 
push the waiting requests to the next stage */ 232 223 list_for_each_entry_safe(rq, n, running, flush.list) { ··· 262 223 * directly into request_fn may confuse the driver. Always use 263 224 * kblockd. 264 225 */ 265 - if (queued || q->flush_queue_delayed) 266 - blk_run_queue_async(q); 226 + if (queued || q->flush_queue_delayed) { 227 + if (!q->mq_ops) 228 + blk_run_queue_async(q); 229 + else 230 + /* 231 + * This can be optimized to only run queues with requests 232 + * queued if necessary. 233 + */ 234 + blk_mq_run_queues(q, true); 235 + } 267 236 q->flush_queue_delayed = 0; 237 + if (q->mq_ops) 238 + spin_unlock_irqrestore(&q->mq_flush_lock, flags); 239 + } 240 + 241 + static void mq_flush_work(struct work_struct *work) 242 + { 243 + struct request_queue *q; 244 + struct request *rq; 245 + 246 + q = container_of(work, struct request_queue, mq_flush_work); 247 + 248 + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ 249 + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, 250 + __GFP_WAIT|GFP_ATOMIC, true); 251 + rq->cmd_type = REQ_TYPE_FS; 252 + rq->end_io = flush_end_io; 253 + 254 + blk_mq_run_request(rq, true, false); 255 + } 256 + 257 + /* 258 + * We can't directly use q->flush_rq, because it doesn't have tag and is not in 259 + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, 260 + * so offload the work to workqueue. 261 + * 262 + * Note: we assume a flush request finished in any hardware queue will flush 263 + * the whole disk cache. 264 + */ 265 + static void mq_run_flush(struct request_queue *q) 266 + { 267 + kblockd_schedule_work(q, &q->mq_flush_work); 268 268 } 269 269 270 270 /** ··· 314 236 * Please read the comment at the top of this file for more info. 315 237 * 316 238 * CONTEXT: 317 - * spin_lock_irq(q->queue_lock) 239 + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) 318 240 * 319 241 * RETURNS: 320 242 * %true if flush was issued, %false otherwise. ··· 339 261 * Issue flush and toggle pending_idx. 
This makes pending_idx 340 262 * different from running_idx, which means flush is in flight. 341 263 */ 264 + q->flush_pending_idx ^= 1; 265 + if (q->mq_ops) { 266 + mq_run_flush(q); 267 + return true; 268 + } 269 + 342 270 blk_rq_init(q, &q->flush_rq); 343 271 q->flush_rq.cmd_type = REQ_TYPE_FS; 344 272 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 345 273 q->flush_rq.rq_disk = first_rq->rq_disk; 346 274 q->flush_rq.end_io = flush_end_io; 347 275 348 - q->flush_pending_idx ^= 1; 349 276 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 350 277 return true; 351 278 } ··· 367 284 blk_run_queue_async(q); 368 285 } 369 286 287 + static void mq_flush_data_end_io(struct request *rq, int error) 288 + { 289 + struct request_queue *q = rq->q; 290 + struct blk_mq_hw_ctx *hctx; 291 + struct blk_mq_ctx *ctx; 292 + unsigned long flags; 293 + 294 + ctx = rq->mq_ctx; 295 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 296 + 297 + /* 298 + * After populating an empty queue, kick it to avoid stall. Read 299 + * the comment in flush_end_io(). 300 + */ 301 + spin_lock_irqsave(&q->mq_flush_lock, flags); 302 + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) 303 + blk_mq_run_hw_queue(hctx, true); 304 + spin_unlock_irqrestore(&q->mq_flush_lock, flags); 305 + } 306 + 370 307 /** 371 308 * blk_insert_flush - insert a new FLUSH/FUA request 372 309 * @rq: request to insert 373 310 * 374 311 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. 312 + * or __blk_mq_run_hw_queue() to dispatch request. 375 313 * @rq is being submitted. Analyze what needs to be done and put it on the 376 314 * right queue. 377 315 * 378 316 * CONTEXT: 379 - * spin_lock_irq(q->queue_lock) 317 + * spin_lock_irq(q->queue_lock) in !mq case 380 318 */ 381 319 void blk_insert_flush(struct request *rq) 382 320 { ··· 420 316 * complete the request. 
421 317 */ 422 318 if (!policy) { 423 - __blk_end_bidi_request(rq, 0, 0, 0); 319 + if (q->mq_ops) 320 + blk_mq_end_io(rq, 0); 321 + else 322 + __blk_end_bidi_request(rq, 0, 0, 0); 424 323 return; 425 324 } 426 325 ··· 436 329 */ 437 330 if ((policy & REQ_FSEQ_DATA) && 438 331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 439 - list_add_tail(&rq->queuelist, &q->queue_head); 332 + if (q->mq_ops) { 333 + blk_mq_run_request(rq, false, true); 334 + } else 335 + list_add_tail(&rq->queuelist, &q->queue_head); 440 336 return; 441 337 } 442 338 ··· 451 341 INIT_LIST_HEAD(&rq->flush.list); 452 342 rq->cmd_flags |= REQ_FLUSH_SEQ; 453 343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 344 + if (q->mq_ops) { 345 + rq->end_io = mq_flush_data_end_io; 346 + 347 + spin_lock_irq(&q->mq_flush_lock); 348 + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 349 + spin_unlock_irq(&q->mq_flush_lock); 350 + return; 351 + } 454 352 rq->end_io = flush_data_end_io; 455 353 456 354 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); ··· 571 453 return ret; 572 454 } 573 455 EXPORT_SYMBOL(blkdev_issue_flush); 456 + 457 + void blk_mq_init_flush(struct request_queue *q) 458 + { 459 + spin_lock_init(&q->mq_flush_lock); 460 + INIT_WORK(&q->mq_flush_work, mq_flush_work); 461 + }

+14 -3

block/blk-merge.c

··· 308 308 return ll_new_hw_segment(q, req, bio); 309 309 } 310 310 311 + /* 312 + * blk-mq uses req->special to carry normal driver per-request payload, it 313 + * does not indicate a prepared command that we cannot merge with. 314 + */ 315 + static bool req_no_special_merge(struct request *req) 316 + { 317 + struct request_queue *q = req->q; 318 + 319 + return !q->mq_ops && req->special; 320 + } 321 + 311 322 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 312 323 struct request *next) 313 324 { ··· 330 319 * First check if the either of the requests are re-queued 331 320 * requests. Can't merge them if they are. 332 321 */ 333 - if (req->special || next->special) 322 + if (req_no_special_merge(req) || req_no_special_merge(next)) 334 323 return 0; 335 324 336 325 /* ··· 427 416 428 417 if (rq_data_dir(req) != rq_data_dir(next) 429 418 || req->rq_disk != next->rq_disk 430 - || next->special) 419 + || req_no_special_merge(next)) 431 420 return 0; 432 421 433 422 if (req->cmd_flags & REQ_WRITE_SAME && ··· 526 515 return false; 527 516 528 517 /* must be same device and not a special request */ 529 - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) 518 + if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) 530 519 return false; 531 520 532 521 /* only merge integrity protected bio into ditto rq */

+93

block/blk-mq-cpu.c

··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/init.h> 4 + #include <linux/blkdev.h> 5 + #include <linux/list.h> 6 + #include <linux/llist.h> 7 + #include <linux/smp.h> 8 + #include <linux/cpu.h> 9 + 10 + #include <linux/blk-mq.h> 11 + #include "blk-mq.h" 12 + 13 + static LIST_HEAD(blk_mq_cpu_notify_list); 14 + static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); 15 + 16 + static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, 17 + unsigned long action, void *hcpu) 18 + { 19 + unsigned int cpu = (unsigned long) hcpu; 20 + struct blk_mq_cpu_notifier *notify; 21 + 22 + spin_lock(&blk_mq_cpu_notify_lock); 23 + 24 + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 25 + notify->notify(notify->data, action, cpu); 26 + 27 + spin_unlock(&blk_mq_cpu_notify_lock); 28 + return NOTIFY_OK; 29 + } 30 + 31 + static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, 32 + unsigned int cpu) 33 + { 34 + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 35 + /* 36 + * If the CPU goes away, ensure that we run any pending 37 + * completions. 
38 + */ 39 + struct llist_node *node; 40 + struct request *rq; 41 + 42 + local_irq_disable(); 43 + 44 + node = llist_del_all(&per_cpu(ipi_lists, cpu)); 45 + while (node) { 46 + struct llist_node *next = node->next; 47 + 48 + rq = llist_entry(node, struct request, ll_list); 49 + __blk_mq_end_io(rq, rq->errors); 50 + node = next; 51 + } 52 + 53 + local_irq_enable(); 54 + } 55 + } 56 + 57 + static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { 58 + .notifier_call = blk_mq_main_cpu_notify, 59 + }; 60 + 61 + void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 62 + { 63 + BUG_ON(!notifier->notify); 64 + 65 + spin_lock(&blk_mq_cpu_notify_lock); 66 + list_add_tail(&notifier->list, &blk_mq_cpu_notify_list); 67 + spin_unlock(&blk_mq_cpu_notify_lock); 68 + } 69 + 70 + void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 71 + { 72 + spin_lock(&blk_mq_cpu_notify_lock); 73 + list_del(&notifier->list); 74 + spin_unlock(&blk_mq_cpu_notify_lock); 75 + } 76 + 77 + void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 78 + void (*fn)(void *, unsigned long, unsigned int), 79 + void *data) 80 + { 81 + notifier->notify = fn; 82 + notifier->data = data; 83 + } 84 + 85 + static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { 86 + .notify = blk_mq_cpu_notify, 87 + }; 88 + 89 + void __init blk_mq_cpu_init(void) 90 + { 91 + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); 92 + blk_mq_register_cpu_notifier(&cpu_notifier); 93 + }

+108

block/blk-mq-cpumap.c

··· 1 + #include <linux/kernel.h> 2 + #include <linux/threads.h> 3 + #include <linux/module.h> 4 + #include <linux/mm.h> 5 + #include <linux/smp.h> 6 + #include <linux/cpu.h> 7 + 8 + #include <linux/blk-mq.h> 9 + #include "blk.h" 10 + #include "blk-mq.h" 11 + 12 + static void show_map(unsigned int *map, unsigned int nr) 13 + { 14 + int i; 15 + 16 + pr_info("blk-mq: CPU -> queue map\n"); 17 + for_each_online_cpu(i) 18 + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); 19 + } 20 + 21 + static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, 22 + const int cpu) 23 + { 24 + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); 25 + } 26 + 27 + static int get_first_sibling(unsigned int cpu) 28 + { 29 + unsigned int ret; 30 + 31 + ret = cpumask_first(topology_thread_cpumask(cpu)); 32 + if (ret < nr_cpu_ids) 33 + return ret; 34 + 35 + return cpu; 36 + } 37 + 38 + int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) 39 + { 40 + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; 41 + cpumask_var_t cpus; 42 + 43 + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) 44 + return 1; 45 + 46 + cpumask_clear(cpus); 47 + nr_cpus = nr_uniq_cpus = 0; 48 + for_each_online_cpu(i) { 49 + nr_cpus++; 50 + first_sibling = get_first_sibling(i); 51 + if (!cpumask_test_cpu(first_sibling, cpus)) 52 + nr_uniq_cpus++; 53 + cpumask_set_cpu(i, cpus); 54 + } 55 + 56 + queue = 0; 57 + for_each_possible_cpu(i) { 58 + if (!cpu_online(i)) { 59 + map[i] = 0; 60 + continue; 61 + } 62 + 63 + /* 64 + * Easy case - we have equal or more hardware queues. Or 65 + * there are no thread siblings to take into account. Do 66 + * 1:1 if enough, or sequential mapping if less. 67 + */ 68 + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { 69 + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); 70 + queue++; 71 + continue; 72 + } 73 + 74 + /* 75 + * Less then nr_cpus queues, and we have some number of 76 + * threads per cores. Map sibling threads to the same 77 + * queue. 
78 + */ 79 + first_sibling = get_first_sibling(i); 80 + if (first_sibling == i) { 81 + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, 82 + queue); 83 + queue++; 84 + } else 85 + map[i] = map[first_sibling]; 86 + } 87 + 88 + show_map(map, nr_cpus); 89 + free_cpumask_var(cpus); 90 + return 0; 91 + } 92 + 93 + unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 94 + { 95 + unsigned int *map; 96 + 97 + /* If cpus are offline, map them to first hctx */ 98 + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 99 + reg->numa_node); 100 + if (!map) 101 + return NULL; 102 + 103 + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 104 + return map; 105 + 106 + kfree(map); 107 + return NULL; 108 + }

+384

block/blk-mq-sysfs.c

··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/backing-dev.h> 4 + #include <linux/bio.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/mm.h> 7 + #include <linux/init.h> 8 + #include <linux/slab.h> 9 + #include <linux/workqueue.h> 10 + #include <linux/smp.h> 11 + 12 + #include <linux/blk-mq.h> 13 + #include "blk-mq.h" 14 + #include "blk-mq-tag.h" 15 + 16 + static void blk_mq_sysfs_release(struct kobject *kobj) 17 + { 18 + } 19 + 20 + struct blk_mq_ctx_sysfs_entry { 21 + struct attribute attr; 22 + ssize_t (*show)(struct blk_mq_ctx *, char *); 23 + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); 24 + }; 25 + 26 + struct blk_mq_hw_ctx_sysfs_entry { 27 + struct attribute attr; 28 + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); 29 + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); 30 + }; 31 + 32 + static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, 33 + char *page) 34 + { 35 + struct blk_mq_ctx_sysfs_entry *entry; 36 + struct blk_mq_ctx *ctx; 37 + struct request_queue *q; 38 + ssize_t res; 39 + 40 + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); 41 + ctx = container_of(kobj, struct blk_mq_ctx, kobj); 42 + q = ctx->queue; 43 + 44 + if (!entry->show) 45 + return -EIO; 46 + 47 + res = -ENOENT; 48 + mutex_lock(&q->sysfs_lock); 49 + if (!blk_queue_dying(q)) 50 + res = entry->show(ctx, page); 51 + mutex_unlock(&q->sysfs_lock); 52 + return res; 53 + } 54 + 55 + static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, 56 + const char *page, size_t length) 57 + { 58 + struct blk_mq_ctx_sysfs_entry *entry; 59 + struct blk_mq_ctx *ctx; 60 + struct request_queue *q; 61 + ssize_t res; 62 + 63 + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); 64 + ctx = container_of(kobj, struct blk_mq_ctx, kobj); 65 + q = ctx->queue; 66 + 67 + if (!entry->store) 68 + return -EIO; 69 + 70 + res = -ENOENT; 71 + mutex_lock(&q->sysfs_lock); 72 + if 
(!blk_queue_dying(q)) 73 + res = entry->store(ctx, page, length); 74 + mutex_unlock(&q->sysfs_lock); 75 + return res; 76 + } 77 + 78 + static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, 79 + struct attribute *attr, char *page) 80 + { 81 + struct blk_mq_hw_ctx_sysfs_entry *entry; 82 + struct blk_mq_hw_ctx *hctx; 83 + struct request_queue *q; 84 + ssize_t res; 85 + 86 + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); 87 + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); 88 + q = hctx->queue; 89 + 90 + if (!entry->show) 91 + return -EIO; 92 + 93 + res = -ENOENT; 94 + mutex_lock(&q->sysfs_lock); 95 + if (!blk_queue_dying(q)) 96 + res = entry->show(hctx, page); 97 + mutex_unlock(&q->sysfs_lock); 98 + return res; 99 + } 100 + 101 + static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, 102 + struct attribute *attr, const char *page, 103 + size_t length) 104 + { 105 + struct blk_mq_hw_ctx_sysfs_entry *entry; 106 + struct blk_mq_hw_ctx *hctx; 107 + struct request_queue *q; 108 + ssize_t res; 109 + 110 + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); 111 + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); 112 + q = hctx->queue; 113 + 114 + if (!entry->store) 115 + return -EIO; 116 + 117 + res = -ENOENT; 118 + mutex_lock(&q->sysfs_lock); 119 + if (!blk_queue_dying(q)) 120 + res = entry->store(hctx, page, length); 121 + mutex_unlock(&q->sysfs_lock); 122 + return res; 123 + } 124 + 125 + static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) 126 + { 127 + return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], 128 + ctx->rq_dispatched[0]); 129 + } 130 + 131 + static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) 132 + { 133 + return sprintf(page, "%lu\n", ctx->rq_merged); 134 + } 135 + 136 + static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) 137 + { 138 + return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], 139 + ctx->rq_completed[0]); 
140 + } 141 + 142 + static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) 143 + { 144 + char *start_page = page; 145 + struct request *rq; 146 + 147 + page += sprintf(page, "%s:\n", msg); 148 + 149 + list_for_each_entry(rq, list, queuelist) 150 + page += sprintf(page, "\t%p\n", rq); 151 + 152 + return page - start_page; 153 + } 154 + 155 + static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) 156 + { 157 + ssize_t ret; 158 + 159 + spin_lock(&ctx->lock); 160 + ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); 161 + spin_unlock(&ctx->lock); 162 + 163 + return ret; 164 + } 165 + 166 + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, 167 + char *page) 168 + { 169 + return sprintf(page, "%lu\n", hctx->queued); 170 + } 171 + 172 + static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) 173 + { 174 + return sprintf(page, "%lu\n", hctx->run); 175 + } 176 + 177 + static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, 178 + char *page) 179 + { 180 + char *start_page = page; 181 + int i; 182 + 183 + page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); 184 + 185 + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { 186 + unsigned long d = 1U << (i - 1); 187 + 188 + page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); 189 + } 190 + 191 + return page - start_page; 192 + } 193 + 194 + static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, 195 + char *page) 196 + { 197 + ssize_t ret; 198 + 199 + spin_lock(&hctx->lock); 200 + ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); 201 + spin_unlock(&hctx->lock); 202 + 203 + return ret; 204 + } 205 + 206 + static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 207 + { 208 + ssize_t ret; 209 + 210 + spin_lock(&hctx->lock); 211 + ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); 212 + spin_unlock(&hctx->lock); 213 + 214 + return ret; 
215 + } 216 + 217 + static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, 218 + const char *page, size_t len) 219 + { 220 + struct blk_mq_ctx *ctx; 221 + unsigned long ret; 222 + unsigned int i; 223 + 224 + if (kstrtoul(page, 10, &ret)) { 225 + pr_err("blk-mq-sysfs: invalid input '%s'\n", page); 226 + return -EINVAL; 227 + } 228 + 229 + spin_lock(&hctx->lock); 230 + if (ret) 231 + hctx->flags |= BLK_MQ_F_SHOULD_IPI; 232 + else 233 + hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; 234 + spin_unlock(&hctx->lock); 235 + 236 + hctx_for_each_ctx(hctx, ctx, i) 237 + ctx->ipi_redirect = !!ret; 238 + 239 + return len; 240 + } 241 + 242 + static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 243 + { 244 + return blk_mq_tag_sysfs_show(hctx->tags, page); 245 + } 246 + 247 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { 248 + .attr = {.name = "dispatched", .mode = S_IRUGO }, 249 + .show = blk_mq_sysfs_dispatched_show, 250 + }; 251 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { 252 + .attr = {.name = "merged", .mode = S_IRUGO }, 253 + .show = blk_mq_sysfs_merged_show, 254 + }; 255 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { 256 + .attr = {.name = "completed", .mode = S_IRUGO }, 257 + .show = blk_mq_sysfs_completed_show, 258 + }; 259 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { 260 + .attr = {.name = "rq_list", .mode = S_IRUGO }, 261 + .show = blk_mq_sysfs_rq_list_show, 262 + }; 263 + 264 + static struct attribute *default_ctx_attrs[] = { 265 + &blk_mq_sysfs_dispatched.attr, 266 + &blk_mq_sysfs_merged.attr, 267 + &blk_mq_sysfs_completed.attr, 268 + &blk_mq_sysfs_rq_list.attr, 269 + NULL, 270 + }; 271 + 272 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { 273 + .attr = {.name = "queued", .mode = S_IRUGO }, 274 + .show = blk_mq_hw_sysfs_queued_show, 275 + }; 276 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { 277 + .attr = {.name = "run", 
.mode = S_IRUGO }, 278 + .show = blk_mq_hw_sysfs_run_show, 279 + }; 280 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { 281 + .attr = {.name = "dispatched", .mode = S_IRUGO }, 282 + .show = blk_mq_hw_sysfs_dispatched_show, 283 + }; 284 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 285 + .attr = {.name = "pending", .mode = S_IRUGO }, 286 + .show = blk_mq_hw_sysfs_rq_list_show, 287 + }; 288 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { 289 + .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, 290 + .show = blk_mq_hw_sysfs_ipi_show, 291 + .store = blk_mq_hw_sysfs_ipi_store, 292 + }; 293 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 294 + .attr = {.name = "tags", .mode = S_IRUGO }, 295 + .show = blk_mq_hw_sysfs_tags_show, 296 + }; 297 + 298 + static struct attribute *default_hw_ctx_attrs[] = { 299 + &blk_mq_hw_sysfs_queued.attr, 300 + &blk_mq_hw_sysfs_run.attr, 301 + &blk_mq_hw_sysfs_dispatched.attr, 302 + &blk_mq_hw_sysfs_pending.attr, 303 + &blk_mq_hw_sysfs_ipi.attr, 304 + &blk_mq_hw_sysfs_tags.attr, 305 + NULL, 306 + }; 307 + 308 + static const struct sysfs_ops blk_mq_sysfs_ops = { 309 + .show = blk_mq_sysfs_show, 310 + .store = blk_mq_sysfs_store, 311 + }; 312 + 313 + static const struct sysfs_ops blk_mq_hw_sysfs_ops = { 314 + .show = blk_mq_hw_sysfs_show, 315 + .store = blk_mq_hw_sysfs_store, 316 + }; 317 + 318 + static struct kobj_type blk_mq_ktype = { 319 + .sysfs_ops = &blk_mq_sysfs_ops, 320 + .release = blk_mq_sysfs_release, 321 + }; 322 + 323 + static struct kobj_type blk_mq_ctx_ktype = { 324 + .sysfs_ops = &blk_mq_sysfs_ops, 325 + .default_attrs = default_ctx_attrs, 326 + .release = blk_mq_sysfs_release, 327 + }; 328 + 329 + static struct kobj_type blk_mq_hw_ktype = { 330 + .sysfs_ops = &blk_mq_hw_sysfs_ops, 331 + .default_attrs = default_hw_ctx_attrs, 332 + .release = blk_mq_sysfs_release, 333 + }; 334 + 335 + void blk_mq_unregister_disk(struct gendisk *disk) 
336 + { 337 + struct request_queue *q = disk->queue; 338 + 339 + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 340 + kobject_del(&q->mq_kobj); 341 + 342 + kobject_put(&disk_to_dev(disk)->kobj); 343 + } 344 + 345 + int blk_mq_register_disk(struct gendisk *disk) 346 + { 347 + struct device *dev = disk_to_dev(disk); 348 + struct request_queue *q = disk->queue; 349 + struct blk_mq_hw_ctx *hctx; 350 + struct blk_mq_ctx *ctx; 351 + int ret, i, j; 352 + 353 + kobject_init(&q->mq_kobj, &blk_mq_ktype); 354 + 355 + ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 356 + if (ret < 0) 357 + return ret; 358 + 359 + kobject_uevent(&q->mq_kobj, KOBJ_ADD); 360 + 361 + queue_for_each_hw_ctx(q, hctx, i) { 362 + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 363 + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); 364 + if (ret) 365 + break; 366 + 367 + if (!hctx->nr_ctx) 368 + continue; 369 + 370 + hctx_for_each_ctx(hctx, ctx, j) { 371 + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); 372 + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); 373 + if (ret) 374 + break; 375 + } 376 + } 377 + 378 + if (ret) { 379 + blk_mq_unregister_disk(disk); 380 + return ret; 381 + } 382 + 383 + return 0; 384 + }

+204

block/blk-mq-tag.c

··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/percpu_ida.h> 4 + 5 + #include <linux/blk-mq.h> 6 + #include "blk.h" 7 + #include "blk-mq.h" 8 + #include "blk-mq-tag.h" 9 + 10 + /* 11 + * Per tagged queue (tag address space) map 12 + */ 13 + struct blk_mq_tags { 14 + unsigned int nr_tags; 15 + unsigned int nr_reserved_tags; 16 + unsigned int nr_batch_move; 17 + unsigned int nr_max_cache; 18 + 19 + struct percpu_ida free_tags; 20 + struct percpu_ida reserved_tags; 21 + }; 22 + 23 + void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 24 + { 25 + int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 26 + blk_mq_put_tag(tags, tag); 27 + } 28 + 29 + bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 30 + { 31 + return !tags || 32 + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 33 + } 34 + 35 + static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 36 + { 37 + int tag; 38 + 39 + tag = percpu_ida_alloc(&tags->free_tags, gfp); 40 + if (tag < 0) 41 + return BLK_MQ_TAG_FAIL; 42 + return tag + tags->nr_reserved_tags; 43 + } 44 + 45 + static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 46 + gfp_t gfp) 47 + { 48 + int tag; 49 + 50 + if (unlikely(!tags->nr_reserved_tags)) { 51 + WARN_ON_ONCE(1); 52 + return BLK_MQ_TAG_FAIL; 53 + } 54 + 55 + tag = percpu_ida_alloc(&tags->reserved_tags, gfp); 56 + if (tag < 0) 57 + return BLK_MQ_TAG_FAIL; 58 + return tag; 59 + } 60 + 61 + unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 62 + { 63 + if (!reserved) 64 + return __blk_mq_get_tag(tags, gfp); 65 + 66 + return __blk_mq_get_reserved_tag(tags, gfp); 67 + } 68 + 69 + static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 70 + { 71 + BUG_ON(tag >= tags->nr_tags); 72 + 73 + percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 74 + } 75 + 76 + static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 77 + unsigned int tag) 78 + { 79 + BUG_ON(tag 
>= tags->nr_reserved_tags); 80 + 81 + percpu_ida_free(&tags->reserved_tags, tag); 82 + } 83 + 84 + void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 85 + { 86 + if (tag >= tags->nr_reserved_tags) 87 + __blk_mq_put_tag(tags, tag); 88 + else 89 + __blk_mq_put_reserved_tag(tags, tag); 90 + } 91 + 92 + static int __blk_mq_tag_iter(unsigned id, void *data) 93 + { 94 + unsigned long *tag_map = data; 95 + __set_bit(id, tag_map); 96 + return 0; 97 + } 98 + 99 + void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 100 + void (*fn)(void *, unsigned long *), void *data) 101 + { 102 + unsigned long *tag_map; 103 + size_t map_size; 104 + 105 + map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; 106 + tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); 107 + if (!tag_map) 108 + return; 109 + 110 + percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 111 + if (tags->nr_reserved_tags) 112 + percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 113 + tag_map); 114 + 115 + fn(data, tag_map); 116 + kfree(tag_map); 117 + } 118 + 119 + struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 120 + unsigned int reserved_tags, int node) 121 + { 122 + unsigned int nr_tags, nr_cache; 123 + struct blk_mq_tags *tags; 124 + int ret; 125 + 126 + if (total_tags > BLK_MQ_TAG_MAX) { 127 + pr_err("blk-mq: tag depth too large\n"); 128 + return NULL; 129 + } 130 + 131 + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); 132 + if (!tags) 133 + return NULL; 134 + 135 + nr_tags = total_tags - reserved_tags; 136 + nr_cache = nr_tags / num_possible_cpus(); 137 + 138 + if (nr_cache < BLK_MQ_TAG_CACHE_MIN) 139 + nr_cache = BLK_MQ_TAG_CACHE_MIN; 140 + else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) 141 + nr_cache = BLK_MQ_TAG_CACHE_MAX; 142 + 143 + tags->nr_tags = total_tags; 144 + tags->nr_reserved_tags = reserved_tags; 145 + tags->nr_max_cache = nr_cache; 146 + tags->nr_batch_move = max(1u, nr_cache / 2); 147 + 148 + ret = 
__percpu_ida_init(&tags->free_tags, tags->nr_tags - 149 + tags->nr_reserved_tags, 150 + tags->nr_max_cache, 151 + tags->nr_batch_move); 152 + if (ret) 153 + goto err_free_tags; 154 + 155 + if (reserved_tags) { 156 + /* 157 + * With max_cahe and batch set to 1, the allocator fallbacks to 158 + * no cached. It's fine reserved tags allocation is slow. 159 + */ 160 + ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 161 + 1, 1); 162 + if (ret) 163 + goto err_reserved_tags; 164 + } 165 + 166 + return tags; 167 + 168 + err_reserved_tags: 169 + percpu_ida_destroy(&tags->free_tags); 170 + err_free_tags: 171 + kfree(tags); 172 + return NULL; 173 + } 174 + 175 + void blk_mq_free_tags(struct blk_mq_tags *tags) 176 + { 177 + percpu_ida_destroy(&tags->free_tags); 178 + percpu_ida_destroy(&tags->reserved_tags); 179 + kfree(tags); 180 + } 181 + 182 + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 183 + { 184 + char *orig_page = page; 185 + int cpu; 186 + 187 + if (!tags) 188 + return 0; 189 + 190 + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 191 + " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 192 + tags->nr_batch_move, tags->nr_max_cache); 193 + 194 + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 195 + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 196 + percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); 197 + 198 + for_each_possible_cpu(cpu) { 199 + page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 200 + percpu_ida_free_tags(&tags->free_tags, cpu)); 201 + } 202 + 203 + return page - orig_page; 204 + }

+27

block/blk-mq-tag.h

··· 1 + #ifndef INT_BLK_MQ_TAG_H 2 + #define INT_BLK_MQ_TAG_H 3 + 4 + struct blk_mq_tags; 5 + 6 + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 7 + extern void blk_mq_free_tags(struct blk_mq_tags *tags); 8 + 9 + extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 10 + extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 11 + extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); 12 + extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); 13 + extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 14 + extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 15 + 16 + enum { 17 + BLK_MQ_TAG_CACHE_MIN = 1, 18 + BLK_MQ_TAG_CACHE_MAX = 64, 19 + }; 20 + 21 + enum { 22 + BLK_MQ_TAG_FAIL = -1U, 23 + BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, 24 + BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 25 + }; 26 + 27 + #endif

+1500

block/blk-mq.c

··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/backing-dev.h> 4 + #include <linux/bio.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/mm.h> 7 + #include <linux/init.h> 8 + #include <linux/slab.h> 9 + #include <linux/workqueue.h> 10 + #include <linux/smp.h> 11 + #include <linux/llist.h> 12 + #include <linux/list_sort.h> 13 + #include <linux/cpu.h> 14 + #include <linux/cache.h> 15 + #include <linux/sched/sysctl.h> 16 + #include <linux/delay.h> 17 + 18 + #include <trace/events/block.h> 19 + 20 + #include <linux/blk-mq.h> 21 + #include "blk.h" 22 + #include "blk-mq.h" 23 + #include "blk-mq-tag.h" 24 + 25 + static DEFINE_MUTEX(all_q_mutex); 26 + static LIST_HEAD(all_q_list); 27 + 28 + static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 29 + 30 + DEFINE_PER_CPU(struct llist_head, ipi_lists); 31 + 32 + static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 33 + unsigned int cpu) 34 + { 35 + return per_cpu_ptr(q->queue_ctx, cpu); 36 + } 37 + 38 + /* 39 + * This assumes per-cpu software queueing queues. They could be per-node 40 + * as well, for instance. For now this is hardcoded as-is. Note that we don't 41 + * care about preemption, since we know the ctx's are persistent. This does 42 + * mean that we can't rely on ctx always matching the currently running CPU. 
43 + */ 44 + static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) 45 + { 46 + return __blk_mq_get_ctx(q, get_cpu()); 47 + } 48 + 49 + static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) 50 + { 51 + put_cpu(); 52 + } 53 + 54 + /* 55 + * Check if any of the ctx's have pending work in this hardware queue 56 + */ 57 + static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 58 + { 59 + unsigned int i; 60 + 61 + for (i = 0; i < hctx->nr_ctx_map; i++) 62 + if (hctx->ctx_map[i]) 63 + return true; 64 + 65 + return false; 66 + } 67 + 68 + /* 69 + * Mark this ctx as having pending work in this hardware queue 70 + */ 71 + static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 72 + struct blk_mq_ctx *ctx) 73 + { 74 + if (!test_bit(ctx->index_hw, hctx->ctx_map)) 75 + set_bit(ctx->index_hw, hctx->ctx_map); 76 + } 77 + 78 + static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, 79 + bool reserved) 80 + { 81 + struct request *rq; 82 + unsigned int tag; 83 + 84 + tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 85 + if (tag != BLK_MQ_TAG_FAIL) { 86 + rq = hctx->rqs[tag]; 87 + rq->tag = tag; 88 + 89 + return rq; 90 + } 91 + 92 + return NULL; 93 + } 94 + 95 + static int blk_mq_queue_enter(struct request_queue *q) 96 + { 97 + int ret; 98 + 99 + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 100 + smp_wmb(); 101 + /* we have problems to freeze the queue if it's initializing */ 102 + if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) 103 + return 0; 104 + 105 + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 106 + 107 + spin_lock_irq(q->queue_lock); 108 + ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, 109 + !blk_queue_bypass(q), *q->queue_lock); 110 + /* inc usage with lock hold to avoid freeze_queue runs here */ 111 + if (!ret) 112 + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 113 + spin_unlock_irq(q->queue_lock); 114 + 115 + return ret; 116 + } 117 + 118 + static void blk_mq_queue_exit(struct 
request_queue *q) 119 + { 120 + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 121 + } 122 + 123 + /* 124 + * Guarantee no request is in use, so we can change any data structure of 125 + * the queue afterward. 126 + */ 127 + static void blk_mq_freeze_queue(struct request_queue *q) 128 + { 129 + bool drain; 130 + 131 + spin_lock_irq(q->queue_lock); 132 + drain = !q->bypass_depth++; 133 + queue_flag_set(QUEUE_FLAG_BYPASS, q); 134 + spin_unlock_irq(q->queue_lock); 135 + 136 + if (!drain) 137 + return; 138 + 139 + while (true) { 140 + s64 count; 141 + 142 + spin_lock_irq(q->queue_lock); 143 + count = percpu_counter_sum(&q->mq_usage_counter); 144 + spin_unlock_irq(q->queue_lock); 145 + 146 + if (count == 0) 147 + break; 148 + blk_mq_run_queues(q, false); 149 + msleep(10); 150 + } 151 + } 152 + 153 + static void blk_mq_unfreeze_queue(struct request_queue *q) 154 + { 155 + bool wake = false; 156 + 157 + spin_lock_irq(q->queue_lock); 158 + if (!--q->bypass_depth) { 159 + queue_flag_clear(QUEUE_FLAG_BYPASS, q); 160 + wake = true; 161 + } 162 + WARN_ON_ONCE(q->bypass_depth < 0); 163 + spin_unlock_irq(q->queue_lock); 164 + if (wake) 165 + wake_up_all(&q->mq_freeze_wq); 166 + } 167 + 168 + bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 169 + { 170 + return blk_mq_has_free_tags(hctx->tags); 171 + } 172 + EXPORT_SYMBOL(blk_mq_can_queue); 173 + 174 + static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq, 175 + unsigned int rw_flags) 176 + { 177 + rq->mq_ctx = ctx; 178 + rq->cmd_flags = rw_flags; 179 + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 180 + } 181 + 182 + static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 183 + gfp_t gfp, bool reserved) 184 + { 185 + return blk_mq_alloc_rq(hctx, gfp, reserved); 186 + } 187 + 188 + static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 189 + int rw, gfp_t gfp, 190 + bool reserved) 191 + { 192 + struct request *rq; 193 + 194 + do { 195 + struct blk_mq_ctx *ctx = 
blk_mq_get_ctx(q); 196 + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 197 + 198 + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 199 + if (rq) { 200 + blk_mq_rq_ctx_init(ctx, rq, rw); 201 + break; 202 + } else if (!(gfp & __GFP_WAIT)) 203 + break; 204 + 205 + blk_mq_put_ctx(ctx); 206 + __blk_mq_run_hw_queue(hctx); 207 + blk_mq_wait_for_tags(hctx->tags); 208 + } while (1); 209 + 210 + return rq; 211 + } 212 + 213 + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 214 + gfp_t gfp, bool reserved) 215 + { 216 + struct request *rq; 217 + 218 + if (blk_mq_queue_enter(q)) 219 + return NULL; 220 + 221 + rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); 222 + blk_mq_put_ctx(rq->mq_ctx); 223 + return rq; 224 + } 225 + 226 + struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, 227 + gfp_t gfp) 228 + { 229 + struct request *rq; 230 + 231 + if (blk_mq_queue_enter(q)) 232 + return NULL; 233 + 234 + rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 235 + blk_mq_put_ctx(rq->mq_ctx); 236 + return rq; 237 + } 238 + EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 239 + 240 + /* 241 + * Re-init and set pdu, if we have it 242 + */ 243 + static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) 244 + { 245 + blk_rq_init(hctx->queue, rq); 246 + 247 + if (hctx->cmd_size) 248 + rq->special = blk_mq_rq_to_pdu(rq); 249 + } 250 + 251 + static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 252 + struct blk_mq_ctx *ctx, struct request *rq) 253 + { 254 + const int tag = rq->tag; 255 + struct request_queue *q = rq->q; 256 + 257 + blk_mq_rq_init(hctx, rq); 258 + blk_mq_put_tag(hctx->tags, tag); 259 + 260 + blk_mq_queue_exit(q); 261 + } 262 + 263 + void blk_mq_free_request(struct request *rq) 264 + { 265 + struct blk_mq_ctx *ctx = rq->mq_ctx; 266 + struct blk_mq_hw_ctx *hctx; 267 + struct request_queue *q = rq->q; 268 + 269 + ctx->rq_completed[rq_is_sync(rq)]++; 270 + 271 + hctx = 
q->mq_ops->map_queue(q, ctx->cpu); 272 + __blk_mq_free_request(hctx, ctx, rq); 273 + } 274 + 275 + static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) 276 + { 277 + if (error) 278 + clear_bit(BIO_UPTODATE, &bio->bi_flags); 279 + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 280 + error = -EIO; 281 + 282 + if (unlikely(rq->cmd_flags & REQ_QUIET)) 283 + set_bit(BIO_QUIET, &bio->bi_flags); 284 + 285 + /* don't actually finish bio if it's part of flush sequence */ 286 + if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) 287 + bio_endio(bio, error); 288 + } 289 + 290 + void blk_mq_complete_request(struct request *rq, int error) 291 + { 292 + struct bio *bio = rq->bio; 293 + unsigned int bytes = 0; 294 + 295 + trace_block_rq_complete(rq->q, rq); 296 + 297 + while (bio) { 298 + struct bio *next = bio->bi_next; 299 + 300 + bio->bi_next = NULL; 301 + bytes += bio->bi_size; 302 + blk_mq_bio_endio(rq, bio, error); 303 + bio = next; 304 + } 305 + 306 + blk_account_io_completion(rq, bytes); 307 + 308 + if (rq->end_io) 309 + rq->end_io(rq, error); 310 + else 311 + blk_mq_free_request(rq); 312 + 313 + blk_account_io_done(rq); 314 + } 315 + 316 + void __blk_mq_end_io(struct request *rq, int error) 317 + { 318 + if (!blk_mark_rq_complete(rq)) 319 + blk_mq_complete_request(rq, error); 320 + } 321 + 322 + #if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) 323 + 324 + /* 325 + * Called with interrupts disabled. 
326 + */ 327 + static void ipi_end_io(void *data) 328 + { 329 + struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id()); 330 + struct llist_node *entry, *next; 331 + struct request *rq; 332 + 333 + entry = llist_del_all(list); 334 + 335 + while (entry) { 336 + next = entry->next; 337 + rq = llist_entry(entry, struct request, ll_list); 338 + __blk_mq_end_io(rq, rq->errors); 339 + entry = next; 340 + } 341 + } 342 + 343 + static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, 344 + struct request *rq, const int error) 345 + { 346 + struct call_single_data *data = &rq->csd; 347 + 348 + rq->errors = error; 349 + rq->ll_list.next = NULL; 350 + 351 + /* 352 + * If the list is non-empty, an existing IPI must already 353 + * be "in flight". If that is the case, we need not schedule 354 + * a new one. 355 + */ 356 + if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) { 357 + data->func = ipi_end_io; 358 + data->flags = 0; 359 + __smp_call_function_single(ctx->cpu, data, 0); 360 + } 361 + 362 + return true; 363 + } 364 + #else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ 365 + static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, 366 + struct request *rq, const int error) 367 + { 368 + return false; 369 + } 370 + #endif 371 + 372 + /* 373 + * End IO on this request on a multiqueue enabled driver. We'll either do 374 + * it directly inline, or punt to a local IPI handler on the matching 375 + * remote CPU. 
376 + */ 377 + void blk_mq_end_io(struct request *rq, int error) 378 + { 379 + struct blk_mq_ctx *ctx = rq->mq_ctx; 380 + int cpu; 381 + 382 + if (!ctx->ipi_redirect) 383 + return __blk_mq_end_io(rq, error); 384 + 385 + cpu = get_cpu(); 386 + 387 + if (cpu == ctx->cpu || !cpu_online(ctx->cpu) || 388 + !ipi_remote_cpu(ctx, cpu, rq, error)) 389 + __blk_mq_end_io(rq, error); 390 + 391 + put_cpu(); 392 + } 393 + EXPORT_SYMBOL(blk_mq_end_io); 394 + 395 + static void blk_mq_start_request(struct request *rq) 396 + { 397 + struct request_queue *q = rq->q; 398 + 399 + trace_block_rq_issue(q, rq); 400 + 401 + /* 402 + * Just mark start time and set the started bit. Due to memory 403 + * ordering, we know we'll see the correct deadline as long as 404 + * REQ_ATOMIC_STARTED is seen. 405 + */ 406 + rq->deadline = jiffies + q->rq_timeout; 407 + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 408 + } 409 + 410 + static void blk_mq_requeue_request(struct request *rq) 411 + { 412 + struct request_queue *q = rq->q; 413 + 414 + trace_block_rq_requeue(q, rq); 415 + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 416 + } 417 + 418 + struct blk_mq_timeout_data { 419 + struct blk_mq_hw_ctx *hctx; 420 + unsigned long *next; 421 + unsigned int *next_set; 422 + }; 423 + 424 + static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) 425 + { 426 + struct blk_mq_timeout_data *data = __data; 427 + struct blk_mq_hw_ctx *hctx = data->hctx; 428 + unsigned int tag; 429 + 430 + /* It may not be in flight yet (this is where 431 + * the REQ_ATOMIC_STARTED flag comes in). The requests are 432 + * statically allocated, so we know it's always safe to access the 433 + * memory associated with a bit offset into ->rqs[]. 
434 + */ 435 + tag = 0; 436 + do { 437 + struct request *rq; 438 + 439 + tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 440 + if (tag >= hctx->queue_depth) 441 + break; 442 + 443 + rq = hctx->rqs[tag++]; 444 + 445 + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 446 + continue; 447 + 448 + blk_rq_check_expired(rq, data->next, data->next_set); 449 + } while (1); 450 + } 451 + 452 + static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, 453 + unsigned long *next, 454 + unsigned int *next_set) 455 + { 456 + struct blk_mq_timeout_data data = { 457 + .hctx = hctx, 458 + .next = next, 459 + .next_set = next_set, 460 + }; 461 + 462 + /* 463 + * Ask the tagging code to iterate busy requests, so we can 464 + * check them for timeout. 465 + */ 466 + blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 467 + } 468 + 469 + static void blk_mq_rq_timer(unsigned long data) 470 + { 471 + struct request_queue *q = (struct request_queue *) data; 472 + struct blk_mq_hw_ctx *hctx; 473 + unsigned long next = 0; 474 + int i, next_set = 0; 475 + 476 + queue_for_each_hw_ctx(q, hctx, i) 477 + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 478 + 479 + if (next_set) 480 + mod_timer(&q->timeout, round_jiffies_up(next)); 481 + } 482 + 483 + /* 484 + * Reverse check our software queue for entries that we could potentially 485 + * merge with. Currently includes a hand-wavy stop count of 8, to not spend 486 + * too much time checking for merges. 
487 + */ 488 + static bool blk_mq_attempt_merge(struct request_queue *q, 489 + struct blk_mq_ctx *ctx, struct bio *bio) 490 + { 491 + struct request *rq; 492 + int checked = 8; 493 + 494 + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 495 + int el_ret; 496 + 497 + if (!checked--) 498 + break; 499 + 500 + if (!blk_rq_merge_ok(rq, bio)) 501 + continue; 502 + 503 + el_ret = blk_try_merge(rq, bio); 504 + if (el_ret == ELEVATOR_BACK_MERGE) { 505 + if (bio_attempt_back_merge(q, rq, bio)) { 506 + ctx->rq_merged++; 507 + return true; 508 + } 509 + break; 510 + } else if (el_ret == ELEVATOR_FRONT_MERGE) { 511 + if (bio_attempt_front_merge(q, rq, bio)) { 512 + ctx->rq_merged++; 513 + return true; 514 + } 515 + break; 516 + } 517 + } 518 + 519 + return false; 520 + } 521 + 522 + void blk_mq_add_timer(struct request *rq) 523 + { 524 + __blk_add_timer(rq, NULL); 525 + } 526 + 527 + /* 528 + * Run this hardware queue, pulling any software queues mapped to it in. 529 + * Note that this function currently has various problems around ordering 530 + * of IO. In particular, we'd like FIFO behaviour on handling existing 531 + * items on the hctx->dispatch list. Ignore that for now. 532 + */ 533 + static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 534 + { 535 + struct request_queue *q = hctx->queue; 536 + struct blk_mq_ctx *ctx; 537 + struct request *rq; 538 + LIST_HEAD(rq_list); 539 + int bit, queued; 540 + 541 + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 542 + return; 543 + 544 + hctx->run++; 545 + 546 + /* 547 + * Touch any software queue that has pending entries. 
548 + */ 549 + for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 550 + clear_bit(bit, hctx->ctx_map); 551 + ctx = hctx->ctxs[bit]; 552 + BUG_ON(bit != ctx->index_hw); 553 + 554 + spin_lock(&ctx->lock); 555 + list_splice_tail_init(&ctx->rq_list, &rq_list); 556 + spin_unlock(&ctx->lock); 557 + } 558 + 559 + /* 560 + * If we have previous entries on our dispatch list, grab them 561 + * and stuff them at the front for more fair dispatch. 562 + */ 563 + if (!list_empty_careful(&hctx->dispatch)) { 564 + spin_lock(&hctx->lock); 565 + if (!list_empty(&hctx->dispatch)) 566 + list_splice_init(&hctx->dispatch, &rq_list); 567 + spin_unlock(&hctx->lock); 568 + } 569 + 570 + /* 571 + * Delete and return all entries from our dispatch list 572 + */ 573 + queued = 0; 574 + 575 + /* 576 + * Now process all the entries, sending them to the driver. 577 + */ 578 + while (!list_empty(&rq_list)) { 579 + int ret; 580 + 581 + rq = list_first_entry(&rq_list, struct request, queuelist); 582 + list_del_init(&rq->queuelist); 583 + blk_mq_start_request(rq); 584 + 585 + /* 586 + * Last request in the series. Flag it as such, this 587 + * enables drivers to know when IO should be kicked off, 588 + * if they don't do it on a per-request basis. 589 + * 590 + * Note: the flag isn't the only condition drivers 591 + * should do kick off. If drive is busy, the last 592 + * request might not have the bit set. 
593 + */ 594 + if (list_empty(&rq_list)) 595 + rq->cmd_flags |= REQ_END; 596 + 597 + ret = q->mq_ops->queue_rq(hctx, rq); 598 + switch (ret) { 599 + case BLK_MQ_RQ_QUEUE_OK: 600 + queued++; 601 + continue; 602 + case BLK_MQ_RQ_QUEUE_BUSY: 603 + /* 604 + * FIXME: we should have a mechanism to stop the queue 605 + * like blk_stop_queue, otherwise we will waste cpu 606 + * time 607 + */ 608 + list_add(&rq->queuelist, &rq_list); 609 + blk_mq_requeue_request(rq); 610 + break; 611 + default: 612 + pr_err("blk-mq: bad return on queue: %d\n", ret); 613 + rq->errors = -EIO; 614 + case BLK_MQ_RQ_QUEUE_ERROR: 615 + blk_mq_end_io(rq, rq->errors); 616 + break; 617 + } 618 + 619 + if (ret == BLK_MQ_RQ_QUEUE_BUSY) 620 + break; 621 + } 622 + 623 + if (!queued) 624 + hctx->dispatched[0]++; 625 + else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 626 + hctx->dispatched[ilog2(queued) + 1]++; 627 + 628 + /* 629 + * Any items that need requeuing? Stuff them into hctx->dispatch, 630 + * that is where we will continue on next queue run. 
631 + */ 632 + if (!list_empty(&rq_list)) { 633 + spin_lock(&hctx->lock); 634 + list_splice(&rq_list, &hctx->dispatch); 635 + spin_unlock(&hctx->lock); 636 + } 637 + } 638 + 639 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 640 + { 641 + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 642 + return; 643 + 644 + if (!async) 645 + __blk_mq_run_hw_queue(hctx); 646 + else { 647 + struct request_queue *q = hctx->queue; 648 + 649 + kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 650 + } 651 + } 652 + 653 + void blk_mq_run_queues(struct request_queue *q, bool async) 654 + { 655 + struct blk_mq_hw_ctx *hctx; 656 + int i; 657 + 658 + queue_for_each_hw_ctx(q, hctx, i) { 659 + if ((!blk_mq_hctx_has_pending(hctx) && 660 + list_empty_careful(&hctx->dispatch)) || 661 + test_bit(BLK_MQ_S_STOPPED, &hctx->flags)) 662 + continue; 663 + 664 + blk_mq_run_hw_queue(hctx, async); 665 + } 666 + } 667 + EXPORT_SYMBOL(blk_mq_run_queues); 668 + 669 + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 670 + { 671 + cancel_delayed_work(&hctx->delayed_work); 672 + set_bit(BLK_MQ_S_STOPPED, &hctx->state); 673 + } 674 + EXPORT_SYMBOL(blk_mq_stop_hw_queue); 675 + 676 + void blk_mq_stop_hw_queues(struct request_queue *q) 677 + { 678 + struct blk_mq_hw_ctx *hctx; 679 + int i; 680 + 681 + queue_for_each_hw_ctx(q, hctx, i) 682 + blk_mq_stop_hw_queue(hctx); 683 + } 684 + EXPORT_SYMBOL(blk_mq_stop_hw_queues); 685 + 686 + void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 687 + { 688 + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 689 + __blk_mq_run_hw_queue(hctx); 690 + } 691 + EXPORT_SYMBOL(blk_mq_start_hw_queue); 692 + 693 + void blk_mq_start_stopped_hw_queues(struct request_queue *q) 694 + { 695 + struct blk_mq_hw_ctx *hctx; 696 + int i; 697 + 698 + queue_for_each_hw_ctx(q, hctx, i) { 699 + if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 700 + continue; 701 + 702 + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 703 + blk_mq_run_hw_queue(hctx, true); 704 + } 705 + 
} 706 + EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 707 + 708 + static void blk_mq_work_fn(struct work_struct *work) 709 + { 710 + struct blk_mq_hw_ctx *hctx; 711 + 712 + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 713 + __blk_mq_run_hw_queue(hctx); 714 + } 715 + 716 + static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 717 + struct request *rq) 718 + { 719 + struct blk_mq_ctx *ctx = rq->mq_ctx; 720 + 721 + list_add_tail(&rq->queuelist, &ctx->rq_list); 722 + blk_mq_hctx_mark_pending(hctx, ctx); 723 + 724 + /* 725 + * We do this early, to ensure we are on the right CPU. 726 + */ 727 + blk_mq_add_timer(rq); 728 + } 729 + 730 + void blk_mq_insert_request(struct request_queue *q, struct request *rq, 731 + bool run_queue) 732 + { 733 + struct blk_mq_hw_ctx *hctx; 734 + struct blk_mq_ctx *ctx, *current_ctx; 735 + 736 + ctx = rq->mq_ctx; 737 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 738 + 739 + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { 740 + blk_insert_flush(rq); 741 + } else { 742 + current_ctx = blk_mq_get_ctx(q); 743 + 744 + if (!cpu_online(ctx->cpu)) { 745 + ctx = current_ctx; 746 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 747 + rq->mq_ctx = ctx; 748 + } 749 + spin_lock(&ctx->lock); 750 + __blk_mq_insert_request(hctx, rq); 751 + spin_unlock(&ctx->lock); 752 + 753 + blk_mq_put_ctx(current_ctx); 754 + } 755 + 756 + if (run_queue) 757 + __blk_mq_run_hw_queue(hctx); 758 + } 759 + EXPORT_SYMBOL(blk_mq_insert_request); 760 + 761 + /* 762 + * This is a special version of blk_mq_insert_request to bypass FLUSH request 763 + * check. Should only be used internally. 
764 + */ 765 + void blk_mq_run_request(struct request *rq, bool run_queue, bool async) 766 + { 767 + struct request_queue *q = rq->q; 768 + struct blk_mq_hw_ctx *hctx; 769 + struct blk_mq_ctx *ctx, *current_ctx; 770 + 771 + current_ctx = blk_mq_get_ctx(q); 772 + 773 + ctx = rq->mq_ctx; 774 + if (!cpu_online(ctx->cpu)) { 775 + ctx = current_ctx; 776 + rq->mq_ctx = ctx; 777 + } 778 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 779 + 780 + /* ctx->cpu might be offline */ 781 + spin_lock(&ctx->lock); 782 + __blk_mq_insert_request(hctx, rq); 783 + spin_unlock(&ctx->lock); 784 + 785 + blk_mq_put_ctx(current_ctx); 786 + 787 + if (run_queue) 788 + blk_mq_run_hw_queue(hctx, async); 789 + } 790 + 791 + static void blk_mq_insert_requests(struct request_queue *q, 792 + struct blk_mq_ctx *ctx, 793 + struct list_head *list, 794 + int depth, 795 + bool from_schedule) 796 + 797 + { 798 + struct blk_mq_hw_ctx *hctx; 799 + struct blk_mq_ctx *current_ctx; 800 + 801 + trace_block_unplug(q, depth, !from_schedule); 802 + 803 + current_ctx = blk_mq_get_ctx(q); 804 + 805 + if (!cpu_online(ctx->cpu)) 806 + ctx = current_ctx; 807 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 808 + 809 + /* 810 + * preemption doesn't flush plug list, so it's possible ctx->cpu is 811 + * offline now 812 + */ 813 + spin_lock(&ctx->lock); 814 + while (!list_empty(list)) { 815 + struct request *rq; 816 + 817 + rq = list_first_entry(list, struct request, queuelist); 818 + list_del_init(&rq->queuelist); 819 + rq->mq_ctx = ctx; 820 + __blk_mq_insert_request(hctx, rq); 821 + } 822 + spin_unlock(&ctx->lock); 823 + 824 + blk_mq_put_ctx(current_ctx); 825 + 826 + blk_mq_run_hw_queue(hctx, from_schedule); 827 + } 828 + 829 + static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 830 + { 831 + struct request *rqa = container_of(a, struct request, queuelist); 832 + struct request *rqb = container_of(b, struct request, queuelist); 833 + 834 + return !(rqa->mq_ctx < rqb->mq_ctx || 835 + (rqa->mq_ctx == 
rqb->mq_ctx && 836 + blk_rq_pos(rqa) < blk_rq_pos(rqb))); 837 + } 838 + 839 + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 840 + { 841 + struct blk_mq_ctx *this_ctx; 842 + struct request_queue *this_q; 843 + struct request *rq; 844 + LIST_HEAD(list); 845 + LIST_HEAD(ctx_list); 846 + unsigned int depth; 847 + 848 + list_splice_init(&plug->mq_list, &list); 849 + 850 + list_sort(NULL, &list, plug_ctx_cmp); 851 + 852 + this_q = NULL; 853 + this_ctx = NULL; 854 + depth = 0; 855 + 856 + while (!list_empty(&list)) { 857 + rq = list_entry_rq(list.next); 858 + list_del_init(&rq->queuelist); 859 + BUG_ON(!rq->q); 860 + if (rq->mq_ctx != this_ctx) { 861 + if (this_ctx) { 862 + blk_mq_insert_requests(this_q, this_ctx, 863 + &ctx_list, depth, 864 + from_schedule); 865 + } 866 + 867 + this_ctx = rq->mq_ctx; 868 + this_q = rq->q; 869 + depth = 0; 870 + } 871 + 872 + depth++; 873 + list_add_tail(&rq->queuelist, &ctx_list); 874 + } 875 + 876 + /* 877 + * If 'this_ctx' is set, we know we have entries to complete 878 + * on 'ctx_list'. Do those. 879 + */ 880 + if (this_ctx) { 881 + blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 882 + from_schedule); 883 + } 884 + } 885 + 886 + static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 887 + { 888 + init_request_from_bio(rq, bio); 889 + blk_account_io_start(rq, 1); 890 + } 891 + 892 + static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 893 + { 894 + struct blk_mq_hw_ctx *hctx; 895 + struct blk_mq_ctx *ctx; 896 + const int is_sync = rw_is_sync(bio->bi_rw); 897 + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 898 + int rw = bio_data_dir(bio); 899 + struct request *rq; 900 + unsigned int use_plug, request_count = 0; 901 + 902 + /* 903 + * If we have multiple hardware queues, just go directly to 904 + * one of those for sync IO. 
905 + */ 906 + use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 907 + 908 + blk_queue_bounce(q, &bio); 909 + 910 + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 911 + return; 912 + 913 + if (blk_mq_queue_enter(q)) { 914 + bio_endio(bio, -EIO); 915 + return; 916 + } 917 + 918 + ctx = blk_mq_get_ctx(q); 919 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 920 + 921 + trace_block_getrq(q, bio, rw); 922 + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); 923 + if (likely(rq)) 924 + blk_mq_rq_ctx_init(ctx, rq, rw); 925 + else { 926 + blk_mq_put_ctx(ctx); 927 + trace_block_sleeprq(q, bio, rw); 928 + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, 929 + false); 930 + ctx = rq->mq_ctx; 931 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 932 + } 933 + 934 + hctx->queued++; 935 + 936 + if (unlikely(is_flush_fua)) { 937 + blk_mq_bio_to_request(rq, bio); 938 + blk_mq_put_ctx(ctx); 939 + blk_insert_flush(rq); 940 + goto run_queue; 941 + } 942 + 943 + /* 944 + * A task plug currently exists. Since this is completely lockless, 945 + * utilize that to temporarily store requests until the task is 946 + * either done or scheduled away. 
947 + */ 948 + if (use_plug) { 949 + struct blk_plug *plug = current->plug; 950 + 951 + if (plug) { 952 + blk_mq_bio_to_request(rq, bio); 953 + if (list_empty(&plug->mq_list)) 954 + trace_block_plug(q); 955 + else if (request_count >= BLK_MAX_REQUEST_COUNT) { 956 + blk_flush_plug_list(plug, false); 957 + trace_block_plug(q); 958 + } 959 + list_add_tail(&rq->queuelist, &plug->mq_list); 960 + blk_mq_put_ctx(ctx); 961 + return; 962 + } 963 + } 964 + 965 + spin_lock(&ctx->lock); 966 + 967 + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 968 + blk_mq_attempt_merge(q, ctx, bio)) 969 + __blk_mq_free_request(hctx, ctx, rq); 970 + else { 971 + blk_mq_bio_to_request(rq, bio); 972 + __blk_mq_insert_request(hctx, rq); 973 + } 974 + 975 + spin_unlock(&ctx->lock); 976 + blk_mq_put_ctx(ctx); 977 + 978 + /* 979 + * For a SYNC request, send it to the hardware immediately. For an 980 + * ASYNC request, just ensure that we run it later on. The latter 981 + * allows for merging opportunities and more efficient dispatching. 982 + */ 983 + run_queue: 984 + blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); 985 + } 986 + 987 + /* 988 + * Default mapping to a software queue, since we use one per CPU. 
989 + */ 990 + struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 991 + { 992 + return q->queue_hw_ctx[q->mq_map[cpu]]; 993 + } 994 + EXPORT_SYMBOL(blk_mq_map_queue); 995 + 996 + struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 997 + unsigned int hctx_index) 998 + { 999 + return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 1000 + GFP_KERNEL | __GFP_ZERO, reg->numa_node); 1001 + } 1002 + EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); 1003 + 1004 + void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 1005 + unsigned int hctx_index) 1006 + { 1007 + kfree(hctx); 1008 + } 1009 + EXPORT_SYMBOL(blk_mq_free_single_hw_queue); 1010 + 1011 + static void blk_mq_hctx_notify(void *data, unsigned long action, 1012 + unsigned int cpu) 1013 + { 1014 + struct blk_mq_hw_ctx *hctx = data; 1015 + struct blk_mq_ctx *ctx; 1016 + LIST_HEAD(tmp); 1017 + 1018 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1019 + return; 1020 + 1021 + /* 1022 + * Move ctx entries to new CPU, if this one is going away. 
1023 + */ 1024 + ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1025 + 1026 + spin_lock(&ctx->lock); 1027 + if (!list_empty(&ctx->rq_list)) { 1028 + list_splice_init(&ctx->rq_list, &tmp); 1029 + clear_bit(ctx->index_hw, hctx->ctx_map); 1030 + } 1031 + spin_unlock(&ctx->lock); 1032 + 1033 + if (list_empty(&tmp)) 1034 + return; 1035 + 1036 + ctx = blk_mq_get_ctx(hctx->queue); 1037 + spin_lock(&ctx->lock); 1038 + 1039 + while (!list_empty(&tmp)) { 1040 + struct request *rq; 1041 + 1042 + rq = list_first_entry(&tmp, struct request, queuelist); 1043 + rq->mq_ctx = ctx; 1044 + list_move_tail(&rq->queuelist, &ctx->rq_list); 1045 + } 1046 + 1047 + blk_mq_hctx_mark_pending(hctx, ctx); 1048 + 1049 + spin_unlock(&ctx->lock); 1050 + blk_mq_put_ctx(ctx); 1051 + } 1052 + 1053 + static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1054 + void (*init)(void *, struct blk_mq_hw_ctx *, 1055 + struct request *, unsigned int), 1056 + void *data) 1057 + { 1058 + unsigned int i; 1059 + 1060 + for (i = 0; i < hctx->queue_depth; i++) { 1061 + struct request *rq = hctx->rqs[i]; 1062 + 1063 + init(data, hctx, rq, i); 1064 + } 1065 + } 1066 + 1067 + void blk_mq_init_commands(struct request_queue *q, 1068 + void (*init)(void *, struct blk_mq_hw_ctx *, 1069 + struct request *, unsigned int), 1070 + void *data) 1071 + { 1072 + struct blk_mq_hw_ctx *hctx; 1073 + unsigned int i; 1074 + 1075 + queue_for_each_hw_ctx(q, hctx, i) 1076 + blk_mq_init_hw_commands(hctx, init, data); 1077 + } 1078 + EXPORT_SYMBOL(blk_mq_init_commands); 1079 + 1080 + static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1081 + { 1082 + struct page *page; 1083 + 1084 + while (!list_empty(&hctx->page_list)) { 1085 + page = list_first_entry(&hctx->page_list, struct page, list); 1086 + list_del_init(&page->list); 1087 + __free_pages(page, page->private); 1088 + } 1089 + 1090 + kfree(hctx->rqs); 1091 + 1092 + if (hctx->tags) 1093 + blk_mq_free_tags(hctx->tags); 1094 + } 1095 + 1096 + static size_t 
order_to_size(unsigned int order) 1097 + { 1098 + size_t ret = PAGE_SIZE; 1099 + 1100 + while (order--) 1101 + ret *= 2; 1102 + 1103 + return ret; 1104 + } 1105 + 1106 + static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1107 + unsigned int reserved_tags, int node) 1108 + { 1109 + unsigned int i, j, entries_per_page, max_order = 4; 1110 + size_t rq_size, left; 1111 + 1112 + INIT_LIST_HEAD(&hctx->page_list); 1113 + 1114 + hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), 1115 + GFP_KERNEL, node); 1116 + if (!hctx->rqs) 1117 + return -ENOMEM; 1118 + 1119 + /* 1120 + * rq_size is the size of the request plus driver payload, rounded 1121 + * to the cacheline size 1122 + */ 1123 + rq_size = round_up(sizeof(struct request) + hctx->cmd_size, 1124 + cache_line_size()); 1125 + left = rq_size * hctx->queue_depth; 1126 + 1127 + for (i = 0; i < hctx->queue_depth;) { 1128 + int this_order = max_order; 1129 + struct page *page; 1130 + int to_do; 1131 + void *p; 1132 + 1133 + while (left < order_to_size(this_order - 1) && this_order) 1134 + this_order--; 1135 + 1136 + do { 1137 + page = alloc_pages_node(node, GFP_KERNEL, this_order); 1138 + if (page) 1139 + break; 1140 + if (!this_order--) 1141 + break; 1142 + if (order_to_size(this_order) < rq_size) 1143 + break; 1144 + } while (1); 1145 + 1146 + if (!page) 1147 + break; 1148 + 1149 + page->private = this_order; 1150 + list_add_tail(&page->list, &hctx->page_list); 1151 + 1152 + p = page_address(page); 1153 + entries_per_page = order_to_size(this_order) / rq_size; 1154 + to_do = min(entries_per_page, hctx->queue_depth - i); 1155 + left -= to_do * rq_size; 1156 + for (j = 0; j < to_do; j++) { 1157 + hctx->rqs[i] = p; 1158 + blk_mq_rq_init(hctx, hctx->rqs[i]); 1159 + p += rq_size; 1160 + i++; 1161 + } 1162 + } 1163 + 1164 + if (i < (reserved_tags + BLK_MQ_TAG_MIN)) 1165 + goto err_rq_map; 1166 + else if (i != hctx->queue_depth) { 1167 + hctx->queue_depth = i; 1168 + pr_warn("%s: queue depth set to %u 
because of low memory\n", 1169 + __func__, i); 1170 + } 1171 + 1172 + hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1173 + if (!hctx->tags) { 1174 + err_rq_map: 1175 + blk_mq_free_rq_map(hctx); 1176 + return -ENOMEM; 1177 + } 1178 + 1179 + return 0; 1180 + } 1181 + 1182 + static int blk_mq_init_hw_queues(struct request_queue *q, 1183 + struct blk_mq_reg *reg, void *driver_data) 1184 + { 1185 + struct blk_mq_hw_ctx *hctx; 1186 + unsigned int i, j; 1187 + 1188 + /* 1189 + * Initialize hardware queues 1190 + */ 1191 + queue_for_each_hw_ctx(q, hctx, i) { 1192 + unsigned int num_maps; 1193 + int node; 1194 + 1195 + node = hctx->numa_node; 1196 + if (node == NUMA_NO_NODE) 1197 + node = hctx->numa_node = reg->numa_node; 1198 + 1199 + INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1200 + spin_lock_init(&hctx->lock); 1201 + INIT_LIST_HEAD(&hctx->dispatch); 1202 + hctx->queue = q; 1203 + hctx->queue_num = i; 1204 + hctx->flags = reg->flags; 1205 + hctx->queue_depth = reg->queue_depth; 1206 + hctx->cmd_size = reg->cmd_size; 1207 + 1208 + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1209 + blk_mq_hctx_notify, hctx); 1210 + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1211 + 1212 + if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1213 + break; 1214 + 1215 + /* 1216 + * Allocate space for all possible cpus to avoid allocation in 1217 + * runtime 1218 + */ 1219 + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1220 + GFP_KERNEL, node); 1221 + if (!hctx->ctxs) 1222 + break; 1223 + 1224 + num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1225 + hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), 1226 + GFP_KERNEL, node); 1227 + if (!hctx->ctx_map) 1228 + break; 1229 + 1230 + hctx->nr_ctx_map = num_maps; 1231 + hctx->nr_ctx = 0; 1232 + 1233 + if (reg->ops->init_hctx && 1234 + reg->ops->init_hctx(hctx, driver_data, i)) 1235 + break; 1236 + } 1237 + 1238 + if (i == q->nr_hw_queues) 1239 + return 0; 1240 + 
1241 + /* 1242 + * Init failed 1243 + */ 1244 + queue_for_each_hw_ctx(q, hctx, j) { 1245 + if (i == j) 1246 + break; 1247 + 1248 + if (reg->ops->exit_hctx) 1249 + reg->ops->exit_hctx(hctx, j); 1250 + 1251 + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1252 + blk_mq_free_rq_map(hctx); 1253 + kfree(hctx->ctxs); 1254 + } 1255 + 1256 + return 1; 1257 + } 1258 + 1259 + static void blk_mq_init_cpu_queues(struct request_queue *q, 1260 + unsigned int nr_hw_queues) 1261 + { 1262 + unsigned int i; 1263 + 1264 + for_each_possible_cpu(i) { 1265 + struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1266 + struct blk_mq_hw_ctx *hctx; 1267 + 1268 + memset(__ctx, 0, sizeof(*__ctx)); 1269 + __ctx->cpu = i; 1270 + spin_lock_init(&__ctx->lock); 1271 + INIT_LIST_HEAD(&__ctx->rq_list); 1272 + __ctx->queue = q; 1273 + 1274 + /* If the cpu isn't online, the cpu is mapped to first hctx */ 1275 + hctx = q->mq_ops->map_queue(q, i); 1276 + hctx->nr_ctx++; 1277 + 1278 + if (!cpu_online(i)) 1279 + continue; 1280 + 1281 + /* 1282 + * Set local node, IFF we have more than one hw queue. 
If 1283 + * not, we remain on the home node of the device 1284 + */ 1285 + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1286 + hctx->numa_node = cpu_to_node(i); 1287 + } 1288 + } 1289 + 1290 + static void blk_mq_map_swqueue(struct request_queue *q) 1291 + { 1292 + unsigned int i; 1293 + struct blk_mq_hw_ctx *hctx; 1294 + struct blk_mq_ctx *ctx; 1295 + 1296 + queue_for_each_hw_ctx(q, hctx, i) { 1297 + hctx->nr_ctx = 0; 1298 + } 1299 + 1300 + /* 1301 + * Map software to hardware queues 1302 + */ 1303 + queue_for_each_ctx(q, ctx, i) { 1304 + /* If the cpu isn't online, the cpu is mapped to first hctx */ 1305 + hctx = q->mq_ops->map_queue(q, i); 1306 + ctx->index_hw = hctx->nr_ctx; 1307 + hctx->ctxs[hctx->nr_ctx++] = ctx; 1308 + } 1309 + } 1310 + 1311 + struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1312 + void *driver_data) 1313 + { 1314 + struct blk_mq_hw_ctx **hctxs; 1315 + struct blk_mq_ctx *ctx; 1316 + struct request_queue *q; 1317 + int i; 1318 + 1319 + if (!reg->nr_hw_queues || 1320 + !reg->ops->queue_rq || !reg->ops->map_queue || 1321 + !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1322 + return ERR_PTR(-EINVAL); 1323 + 1324 + if (!reg->queue_depth) 1325 + reg->queue_depth = BLK_MQ_MAX_DEPTH; 1326 + else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1327 + pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1328 + reg->queue_depth = BLK_MQ_MAX_DEPTH; 1329 + } 1330 + 1331 + /* 1332 + * Set aside a tag for flush requests. It will only be used while 1333 + * another flush request is in progress but outside the driver. 
1334 + * 1335 + * TODO: only allocate if flushes are supported 1336 + */ 1337 + reg->queue_depth++; 1338 + reg->reserved_tags++; 1339 + 1340 + if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1341 + return ERR_PTR(-EINVAL); 1342 + 1343 + ctx = alloc_percpu(struct blk_mq_ctx); 1344 + if (!ctx) 1345 + return ERR_PTR(-ENOMEM); 1346 + 1347 + hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1348 + reg->numa_node); 1349 + 1350 + if (!hctxs) 1351 + goto err_percpu; 1352 + 1353 + for (i = 0; i < reg->nr_hw_queues; i++) { 1354 + hctxs[i] = reg->ops->alloc_hctx(reg, i); 1355 + if (!hctxs[i]) 1356 + goto err_hctxs; 1357 + 1358 + hctxs[i]->numa_node = NUMA_NO_NODE; 1359 + hctxs[i]->queue_num = i; 1360 + } 1361 + 1362 + q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1363 + if (!q) 1364 + goto err_hctxs; 1365 + 1366 + q->mq_map = blk_mq_make_queue_map(reg); 1367 + if (!q->mq_map) 1368 + goto err_map; 1369 + 1370 + setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1371 + blk_queue_rq_timeout(q, 30000); 1372 + 1373 + q->nr_queues = nr_cpu_ids; 1374 + q->nr_hw_queues = reg->nr_hw_queues; 1375 + 1376 + q->queue_ctx = ctx; 1377 + q->queue_hw_ctx = hctxs; 1378 + 1379 + q->mq_ops = reg->ops; 1380 + 1381 + blk_queue_make_request(q, blk_mq_make_request); 1382 + blk_queue_rq_timed_out(q, reg->ops->timeout); 1383 + if (reg->timeout) 1384 + blk_queue_rq_timeout(q, reg->timeout); 1385 + 1386 + blk_mq_init_flush(q); 1387 + blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1388 + 1389 + if (blk_mq_init_hw_queues(q, reg, driver_data)) 1390 + goto err_hw; 1391 + 1392 + blk_mq_map_swqueue(q); 1393 + 1394 + mutex_lock(&all_q_mutex); 1395 + list_add_tail(&q->all_q_node, &all_q_list); 1396 + mutex_unlock(&all_q_mutex); 1397 + 1398 + return q; 1399 + err_hw: 1400 + kfree(q->mq_map); 1401 + err_map: 1402 + blk_cleanup_queue(q); 1403 + err_hctxs: 1404 + for (i = 0; i < reg->nr_hw_queues; i++) { 1405 + if (!hctxs[i]) 1406 + break; 1407 + 
reg->ops->free_hctx(hctxs[i], i); 1408 + } 1409 + kfree(hctxs); 1410 + err_percpu: 1411 + free_percpu(ctx); 1412 + return ERR_PTR(-ENOMEM); 1413 + } 1414 + EXPORT_SYMBOL(blk_mq_init_queue); 1415 + 1416 + void blk_mq_free_queue(struct request_queue *q) 1417 + { 1418 + struct blk_mq_hw_ctx *hctx; 1419 + int i; 1420 + 1421 + queue_for_each_hw_ctx(q, hctx, i) { 1422 + cancel_delayed_work_sync(&hctx->delayed_work); 1423 + kfree(hctx->ctx_map); 1424 + kfree(hctx->ctxs); 1425 + blk_mq_free_rq_map(hctx); 1426 + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1427 + if (q->mq_ops->exit_hctx) 1428 + q->mq_ops->exit_hctx(hctx, i); 1429 + q->mq_ops->free_hctx(hctx, i); 1430 + } 1431 + 1432 + free_percpu(q->queue_ctx); 1433 + kfree(q->queue_hw_ctx); 1434 + kfree(q->mq_map); 1435 + 1436 + q->queue_ctx = NULL; 1437 + q->queue_hw_ctx = NULL; 1438 + q->mq_map = NULL; 1439 + 1440 + mutex_lock(&all_q_mutex); 1441 + list_del_init(&q->all_q_node); 1442 + mutex_unlock(&all_q_mutex); 1443 + } 1444 + EXPORT_SYMBOL(blk_mq_free_queue); 1445 + 1446 + /* Basically redo blk_mq_init_queue with queue frozen */ 1447 + static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) 1448 + { 1449 + blk_mq_freeze_queue(q); 1450 + 1451 + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1452 + 1453 + /* 1454 + * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 1455 + * we should change hctx numa_node according to new topology (this 1456 + * involves free and re-allocate memory, worthy doing?) 1457 + */ 1458 + 1459 + blk_mq_map_swqueue(q); 1460 + 1461 + blk_mq_unfreeze_queue(q); 1462 + } 1463 + 1464 + static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, 1465 + unsigned long action, void *hcpu) 1466 + { 1467 + struct request_queue *q; 1468 + 1469 + /* 1470 + * Before new mapping is established, hotadded cpu might already start 1471 + * handling requests. This doesn't break anything as we map offline 1472 + * CPUs to first hardware queue. 
We will re-init queue below to get 1473 + * optimal settings. 1474 + */ 1475 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1476 + action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1477 + return NOTIFY_OK; 1478 + 1479 + mutex_lock(&all_q_mutex); 1480 + list_for_each_entry(q, &all_q_list, all_q_node) 1481 + blk_mq_queue_reinit(q); 1482 + mutex_unlock(&all_q_mutex); 1483 + return NOTIFY_OK; 1484 + } 1485 + 1486 + static int __init blk_mq_init(void) 1487 + { 1488 + unsigned int i; 1489 + 1490 + for_each_possible_cpu(i) 1491 + init_llist_head(&per_cpu(ipi_lists, i)); 1492 + 1493 + blk_mq_cpu_init(); 1494 + 1495 + /* Must be called after percpu_counter_hotcpu_callback() */ 1496 + hotcpu_notifier(blk_mq_queue_reinit_notify, -10); 1497 + 1498 + return 0; 1499 + } 1500 + subsys_initcall(blk_mq_init);

+52

block/blk-mq.h

··· 1 + #ifndef INT_BLK_MQ_H 2 + #define INT_BLK_MQ_H 3 + 4 + struct blk_mq_ctx { 5 + struct { 6 + spinlock_t lock; 7 + struct list_head rq_list; 8 + } ____cacheline_aligned_in_smp; 9 + 10 + unsigned int cpu; 11 + unsigned int index_hw; 12 + unsigned int ipi_redirect; 13 + 14 + /* incremented at dispatch time */ 15 + unsigned long rq_dispatched[2]; 16 + unsigned long rq_merged; 17 + 18 + /* incremented at completion time */ 19 + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; 20 + 21 + struct request_queue *queue; 22 + struct kobject kobj; 23 + }; 24 + 25 + void __blk_mq_end_io(struct request *rq, int error); 26 + void blk_mq_complete_request(struct request *rq, int error); 27 + void blk_mq_run_request(struct request *rq, bool run_queue, bool async); 28 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29 + void blk_mq_init_flush(struct request_queue *q); 30 + 31 + /* 32 + * CPU hotplug helpers 33 + */ 34 + struct blk_mq_cpu_notifier; 35 + void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 36 + void (*fn)(void *, unsigned long, unsigned int), 37 + void *data); 38 + void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 39 + void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 40 + void blk_mq_cpu_init(void); 41 + DECLARE_PER_CPU(struct llist_head, ipi_lists); 42 + 43 + /* 44 + * CPU -> queue mappings 45 + */ 46 + struct blk_mq_reg; 47 + extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); 48 + extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 49 + 50 + void blk_mq_add_timer(struct request *rq); 51 + 52 + #endif

+13

block/blk-sysfs.c

··· 7 7 #include <linux/bio.h> 8 8 #include <linux/blkdev.h> 9 9 #include <linux/blktrace_api.h> 10 + #include <linux/blk-mq.h> 10 11 11 12 #include "blk.h" 12 13 #include "blk-cgroup.h" ··· 543 542 if (q->queue_tags) 544 543 __blk_queue_free_tags(q); 545 544 545 + percpu_counter_destroy(&q->mq_usage_counter); 546 + 547 + if (q->mq_ops) 548 + blk_mq_free_queue(q); 549 + 546 550 blk_trace_shutdown(q); 547 551 548 552 bdi_destroy(&q->backing_dev_info); ··· 581 575 * bypass from queue allocation. 582 576 */ 583 577 blk_queue_bypass_end(q); 578 + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 584 579 585 580 ret = blk_trace_init_sysfs(dev); 586 581 if (ret) ··· 594 587 } 595 588 596 589 kobject_uevent(&q->kobj, KOBJ_ADD); 590 + 591 + if (q->mq_ops) 592 + blk_mq_register_disk(disk); 597 593 598 594 if (!q->request_fn) 599 595 return 0; ··· 619 609 620 610 if (WARN_ON(!q)) 621 611 return; 612 + 613 + if (q->mq_ops) 614 + blk_mq_unregister_disk(disk); 622 615 623 616 if (q->request_fn) 624 617 elv_unregister_queue(q);

+47 -27

block/blk-timeout.c

··· 7 7 #include <linux/fault-inject.h> 8 8 9 9 #include "blk.h" 10 + #include "blk-mq.h" 10 11 11 12 #ifdef CONFIG_FAIL_IO_TIMEOUT 12 13 ··· 89 88 ret = q->rq_timed_out_fn(req); 90 89 switch (ret) { 91 90 case BLK_EH_HANDLED: 92 - __blk_complete_request(req); 91 + /* Can we use req->errors here? */ 92 + if (q->mq_ops) 93 + blk_mq_complete_request(req, req->errors); 94 + else 95 + __blk_complete_request(req); 93 96 break; 94 97 case BLK_EH_RESET_TIMER: 95 - blk_add_timer(req); 98 + if (q->mq_ops) 99 + blk_mq_add_timer(req); 100 + else 101 + blk_add_timer(req); 102 + 96 103 blk_clear_rq_complete(req); 97 104 break; 98 105 case BLK_EH_NOT_HANDLED: ··· 117 108 } 118 109 } 119 110 111 + void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 112 + unsigned int *next_set) 113 + { 114 + if (time_after_eq(jiffies, rq->deadline)) { 115 + list_del_init(&rq->timeout_list); 116 + 117 + /* 118 + * Check if we raced with end io completion 119 + */ 120 + if (!blk_mark_rq_complete(rq)) 121 + blk_rq_timed_out(rq); 122 + } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { 123 + *next_timeout = rq->deadline; 124 + *next_set = 1; 125 + } 126 + } 127 + 120 128 void blk_rq_timed_out_timer(unsigned long data) 121 129 { 122 130 struct request_queue *q = (struct request_queue *) data; ··· 143 117 144 118 spin_lock_irqsave(q->queue_lock, flags); 145 119 146 - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { 147 - if (time_after_eq(jiffies, rq->deadline)) { 148 - list_del_init(&rq->timeout_list); 149 - 150 - /* 151 - * Check if we raced with end io completion 152 - */ 153 - if (blk_mark_rq_complete(rq)) 154 - continue; 155 - blk_rq_timed_out(rq); 156 - } else if (!next_set || time_after(next, rq->deadline)) { 157 - next = rq->deadline; 158 - next_set = 1; 159 - } 160 - } 120 + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) 121 + blk_rq_check_expired(rq, &next, &next_set); 161 122 162 123 if (next_set) 163 124 
mod_timer(&q->timeout, round_jiffies_up(next)); ··· 170 157 } 171 158 EXPORT_SYMBOL_GPL(blk_abort_request); 172 159 173 - /** 174 - * blk_add_timer - Start timeout timer for a single request 175 - * @req: request that is about to start running. 176 - * 177 - * Notes: 178 - * Each request has its own timer, and as it is added to the queue, we 179 - * set up the timer. When the request completes, we cancel the timer. 180 - */ 181 - void blk_add_timer(struct request *req) 160 + void __blk_add_timer(struct request *req, struct list_head *timeout_list) 182 161 { 183 162 struct request_queue *q = req->q; 184 163 unsigned long expiry; ··· 188 183 req->timeout = q->rq_timeout; 189 184 190 185 req->deadline = jiffies + req->timeout; 191 - list_add_tail(&req->timeout_list, &q->timeout_list); 186 + if (timeout_list) 187 + list_add_tail(&req->timeout_list, timeout_list); 192 188 193 189 /* 194 190 * If the timer isn't already pending or this timeout is earlier ··· 201 195 if (!timer_pending(&q->timeout) || 202 196 time_before(expiry, q->timeout.expires)) 203 197 mod_timer(&q->timeout, expiry); 198 + 199 + } 200 + 201 + /** 202 + * blk_add_timer - Start timeout timer for a single request 203 + * @req: request that is about to start running. 204 + * 205 + * Notes: 206 + * Each request has its own timer, and as it is added to the queue, we 207 + * set up the timer. When the request completes, we cancel the timer. 208 + */ 209 + void blk_add_timer(struct request *req) 210 + { 211 + __blk_add_timer(req, &req->q->timeout_list); 204 212 } 205 213

+17

block/blk.h

··· 10 10 #define BLK_BATCH_REQ 32 11 11 12 12 extern struct kmem_cache *blk_requestq_cachep; 13 + extern struct kmem_cache *request_cachep; 13 14 extern struct kobj_type blk_queue_ktype; 14 15 extern struct ida blk_queue_ida; 15 16 ··· 35 34 unsigned int nr_bytes, unsigned int bidi_bytes); 36 35 37 36 void blk_rq_timed_out_timer(unsigned long data); 37 + void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 38 + unsigned int *next_set); 39 + void __blk_add_timer(struct request *req, struct list_head *timeout_list); 38 40 void blk_delete_timer(struct request *); 39 41 void blk_add_timer(struct request *); 42 + 43 + 44 + bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 45 + struct bio *bio); 46 + bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 47 + struct bio *bio); 48 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 49 + unsigned int *request_count); 50 + 51 + void blk_account_io_start(struct request *req, bool new_io); 52 + void blk_account_io_completion(struct request *req, unsigned int bytes); 53 + void blk_account_io_done(struct request *req); 40 54 41 55 /* 42 56 * Internal atomic flags for request handling 43 57 */ 44 58 enum rq_atomic_flags { 45 59 REQ_ATOM_COMPLETE = 0, 60 + REQ_ATOM_STARTED, 46 61 }; 47 62 48 63 /*

+3

drivers/block/Kconfig

··· 15 15 16 16 if BLK_DEV 17 17 18 + config BLK_DEV_NULL_BLK 19 + tristate "Null test block driver" 20 + 18 21 config BLK_DEV_FD 19 22 tristate "Normal floppy disk support" 20 23 depends on ARCH_MAY_HAVE_PC_FDC

+1

drivers/block/Makefile

··· 41 41 obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ 42 42 43 43 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ 44 + obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o 44 45 45 46 nvme-y := nvme-core.o nvme-scsi.o 46 47 swim_mod-y := swim.o swim_asm.o

+2 -2

drivers/block/floppy.c

··· 2886 2886 return; 2887 2887 2888 2888 if (WARN(atomic_read(&usage_count) == 0, 2889 - "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%x\n", 2889 + "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n", 2890 2890 current_req, (long)blk_rq_pos(current_req), current_req->cmd_type, 2891 - current_req->cmd_flags)) 2891 + (unsigned long long) current_req->cmd_flags)) 2892 2892 return; 2893 2893 2894 2894 if (test_and_set_bit(0, &fdc_busy)) {

+635

drivers/block/null_blk.c

··· 1 + #include <linux/module.h> 2 + #include <linux/moduleparam.h> 3 + #include <linux/sched.h> 4 + #include <linux/fs.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/init.h> 7 + #include <linux/slab.h> 8 + #include <linux/blk-mq.h> 9 + #include <linux/hrtimer.h> 10 + 11 + struct nullb_cmd { 12 + struct list_head list; 13 + struct llist_node ll_list; 14 + struct call_single_data csd; 15 + struct request *rq; 16 + struct bio *bio; 17 + unsigned int tag; 18 + struct nullb_queue *nq; 19 + }; 20 + 21 + struct nullb_queue { 22 + unsigned long *tag_map; 23 + wait_queue_head_t wait; 24 + unsigned int queue_depth; 25 + 26 + struct nullb_cmd *cmds; 27 + }; 28 + 29 + struct nullb { 30 + struct list_head list; 31 + unsigned int index; 32 + struct request_queue *q; 33 + struct gendisk *disk; 34 + struct hrtimer timer; 35 + unsigned int queue_depth; 36 + spinlock_t lock; 37 + 38 + struct nullb_queue *queues; 39 + unsigned int nr_queues; 40 + }; 41 + 42 + static LIST_HEAD(nullb_list); 43 + static struct mutex lock; 44 + static int null_major; 45 + static int nullb_indexes; 46 + 47 + struct completion_queue { 48 + struct llist_head list; 49 + struct hrtimer timer; 50 + }; 51 + 52 + /* 53 + * These are per-cpu for now, they will need to be configured by the 54 + * complete_queues parameter and appropriately mapped. 
55 + */ 56 + static DEFINE_PER_CPU(struct completion_queue, completion_queues); 57 + 58 + enum { 59 + NULL_IRQ_NONE = 0, 60 + NULL_IRQ_SOFTIRQ = 1, 61 + NULL_IRQ_TIMER = 2, 62 + 63 + NULL_Q_BIO = 0, 64 + NULL_Q_RQ = 1, 65 + NULL_Q_MQ = 2, 66 + }; 67 + 68 + static int submit_queues = 1; 69 + module_param(submit_queues, int, S_IRUGO); 70 + MODULE_PARM_DESC(submit_queues, "Number of submission queues"); 71 + 72 + static int home_node = NUMA_NO_NODE; 73 + module_param(home_node, int, S_IRUGO); 74 + MODULE_PARM_DESC(home_node, "Home node for the device"); 75 + 76 + static int queue_mode = NULL_Q_MQ; 77 + module_param(queue_mode, int, S_IRUGO); 78 + MODULE_PARM_DESC(queue_mode, "Use blk-mq interface (0=bio,1=rq,2=multiqueue)"); /* description must be keyed by the declared parameter name */ 79 + 80 + static int gb = 250; 81 + module_param(gb, int, S_IRUGO); 82 + MODULE_PARM_DESC(gb, "Size in GB"); 83 + 84 + static int bs = 512; 85 + module_param(bs, int, S_IRUGO); 86 + MODULE_PARM_DESC(bs, "Block size (in bytes)"); 87 + 88 + static int nr_devices = 2; 89 + module_param(nr_devices, int, S_IRUGO); 90 + MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 91 + 92 + static int irqmode = NULL_IRQ_SOFTIRQ; 93 + module_param(irqmode, int, S_IRUGO); 94 + MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); 95 + 96 + static int completion_nsec = 10000; 97 + module_param(completion_nsec, int, S_IRUGO); 98 + MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); 99 + 100 + static int hw_queue_depth = 64; 101 + module_param(hw_queue_depth, int, S_IRUGO); 102 + MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); 103 + 104 + static bool use_per_node_hctx = true; 105 + module_param(use_per_node_hctx, bool, S_IRUGO); 106 + MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. 
Default: true"); 107 + 108 + static void put_tag(struct nullb_queue *nq, unsigned int tag) 109 + { 110 + clear_bit_unlock(tag, nq->tag_map); 111 + 112 + if (waitqueue_active(&nq->wait)) 113 + wake_up(&nq->wait); 114 + } 115 + 116 + static unsigned int get_tag(struct nullb_queue *nq) 117 + { 118 + unsigned int tag; 119 + 120 + do { 121 + tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); 122 + if (tag >= nq->queue_depth) 123 + return -1U; 124 + } while (test_and_set_bit_lock(tag, nq->tag_map)); 125 + 126 + return tag; 127 + } 128 + 129 + static void free_cmd(struct nullb_cmd *cmd) 130 + { 131 + put_tag(cmd->nq, cmd->tag); 132 + } 133 + 134 + static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) 135 + { 136 + struct nullb_cmd *cmd; 137 + unsigned int tag; 138 + 139 + tag = get_tag(nq); 140 + if (tag != -1U) { 141 + cmd = &nq->cmds[tag]; 142 + cmd->tag = tag; 143 + cmd->nq = nq; 144 + return cmd; 145 + } 146 + 147 + return NULL; 148 + } 149 + 150 + static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) 151 + { 152 + struct nullb_cmd *cmd; 153 + DEFINE_WAIT(wait); 154 + 155 + cmd = __alloc_cmd(nq); 156 + if (cmd || !can_wait) 157 + return cmd; 158 + 159 + do { 160 + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); 161 + cmd = __alloc_cmd(nq); 162 + if (cmd) 163 + break; 164 + 165 + io_schedule(); 166 + } while (1); 167 + 168 + finish_wait(&nq->wait, &wait); 169 + return cmd; 170 + } 171 + 172 + static void end_cmd(struct nullb_cmd *cmd) 173 + { 174 + if (cmd->rq) { 175 + if (queue_mode == NULL_Q_MQ) 176 + blk_mq_end_io(cmd->rq, 0); 177 + else { 178 + INIT_LIST_HEAD(&cmd->rq->queuelist); 179 + blk_end_request_all(cmd->rq, 0); 180 + } 181 + } else if (cmd->bio) 182 + bio_endio(cmd->bio, 0); 183 + 184 + if (queue_mode != NULL_Q_MQ) 185 + free_cmd(cmd); 186 + } 187 + 188 + static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) 189 + { 190 + struct completion_queue *cq; 191 + struct llist_node *entry; 192 + struct 
nullb_cmd *cmd; 193 + 194 + cq = &per_cpu(completion_queues, smp_processor_id()); 195 + 196 + while ((entry = llist_del_all(&cq->list)) != NULL) { 197 + do { 198 + cmd = container_of(entry, struct nullb_cmd, ll_list); 199 + end_cmd(cmd); 200 + entry = entry->next; 201 + } while (entry); 202 + } 203 + 204 + return HRTIMER_NORESTART; 205 + } 206 + 207 + static void null_cmd_end_timer(struct nullb_cmd *cmd) 208 + { 209 + struct completion_queue *cq = &per_cpu(completion_queues, get_cpu()); 210 + 211 + cmd->ll_list.next = NULL; 212 + if (llist_add(&cmd->ll_list, &cq->list)) { 213 + ktime_t kt = ktime_set(0, completion_nsec); 214 + 215 + hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL); 216 + } 217 + 218 + put_cpu(); 219 + } 220 + 221 + static void null_softirq_done_fn(struct request *rq) 222 + { 223 + blk_end_request_all(rq, 0); 224 + } 225 + 226 + #if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) 227 + 228 + static void null_ipi_cmd_end_io(void *data) 229 + { 230 + struct completion_queue *cq; 231 + struct llist_node *entry, *next; 232 + struct nullb_cmd *cmd; 233 + 234 + cq = &per_cpu(completion_queues, smp_processor_id()); 235 + 236 + entry = llist_del_all(&cq->list); 237 + 238 + while (entry) { 239 + next = entry->next; 240 + cmd = llist_entry(entry, struct nullb_cmd, ll_list); 241 + end_cmd(cmd); 242 + entry = next; 243 + } 244 + } 245 + 246 + static void null_cmd_end_ipi(struct nullb_cmd *cmd) 247 + { 248 + struct call_single_data *data = &cmd->csd; 249 + int cpu = get_cpu(); 250 + struct completion_queue *cq = &per_cpu(completion_queues, cpu); 251 + 252 + cmd->ll_list.next = NULL; 253 + 254 + if (llist_add(&cmd->ll_list, &cq->list)) { 255 + data->func = null_ipi_cmd_end_io; 256 + data->flags = 0; 257 + __smp_call_function_single(cpu, data, 0); 258 + } 259 + 260 + put_cpu(); 261 + } 262 + 263 + #endif /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ 264 + 265 + static inline void null_handle_cmd(struct nullb_cmd *cmd) 266 + { 267 + /* Complete 
IO by inline, softirq or timer */ 268 + switch (irqmode) { 269 + case NULL_IRQ_NONE: 270 + end_cmd(cmd); 271 + break; 272 + case NULL_IRQ_SOFTIRQ: 273 + #if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) 274 + null_cmd_end_ipi(cmd); 275 + #else 276 + end_cmd(cmd); 277 + #endif 278 + break; 279 + case NULL_IRQ_TIMER: 280 + null_cmd_end_timer(cmd); 281 + break; 282 + } 283 + } 284 + 285 + static struct nullb_queue *nullb_to_queue(struct nullb *nullb) 286 + { 287 + int index = 0; 288 + 289 + if (nullb->nr_queues != 1) 290 + index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); 291 + 292 + return &nullb->queues[index]; 293 + } 294 + 295 + static void null_queue_bio(struct request_queue *q, struct bio *bio) 296 + { 297 + struct nullb *nullb = q->queuedata; 298 + struct nullb_queue *nq = nullb_to_queue(nullb); 299 + struct nullb_cmd *cmd; 300 + 301 + cmd = alloc_cmd(nq, 1); 302 + cmd->bio = bio; 303 + 304 + null_handle_cmd(cmd); 305 + } 306 + 307 + static int null_rq_prep_fn(struct request_queue *q, struct request *req) 308 + { 309 + struct nullb *nullb = q->queuedata; 310 + struct nullb_queue *nq = nullb_to_queue(nullb); 311 + struct nullb_cmd *cmd; 312 + 313 + cmd = alloc_cmd(nq, 0); 314 + if (cmd) { 315 + cmd->rq = req; 316 + req->special = cmd; 317 + return BLKPREP_OK; 318 + } 319 + 320 + return BLKPREP_DEFER; 321 + } 322 + 323 + static void null_request_fn(struct request_queue *q) 324 + { 325 + struct request *rq; 326 + 327 + while ((rq = blk_fetch_request(q)) != NULL) { 328 + struct nullb_cmd *cmd = rq->special; 329 + 330 + spin_unlock_irq(q->queue_lock); 331 + null_handle_cmd(cmd); 332 + spin_lock_irq(q->queue_lock); 333 + } 334 + } 335 + 336 + static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) 337 + { 338 + struct nullb_cmd *cmd = rq->special; 339 + 340 + cmd->rq = rq; 341 + cmd->nq = hctx->driver_data; 342 + 343 + null_handle_cmd(cmd); 344 + return BLK_MQ_RQ_QUEUE_OK; 345 + } 346 + 347 + 
static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index) 348 + { 349 + return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, 350 + hctx_index); 351 + } 352 + 353 + static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) 354 + { 355 + kfree(hctx); 356 + } 357 + 358 + static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 359 + unsigned int index) 360 + { 361 + struct nullb *nullb = data; 362 + struct nullb_queue *nq = &nullb->queues[index]; 363 + 364 + init_waitqueue_head(&nq->wait); 365 + nq->queue_depth = nullb->queue_depth; 366 + nullb->nr_queues++; 367 + hctx->driver_data = nq; 368 + 369 + return 0; 370 + } 371 + 372 + static struct blk_mq_ops null_mq_ops = { 373 + .queue_rq = null_queue_rq, 374 + .map_queue = blk_mq_map_queue, 375 + .init_hctx = null_init_hctx, 376 + }; 377 + 378 + static struct blk_mq_reg null_mq_reg = { 379 + .ops = &null_mq_ops, 380 + .queue_depth = 64, 381 + .cmd_size = sizeof(struct nullb_cmd), 382 + .flags = BLK_MQ_F_SHOULD_MERGE, 383 + }; 384 + 385 + static void null_del_dev(struct nullb *nullb) 386 + { 387 + list_del_init(&nullb->list); 388 + 389 + del_gendisk(nullb->disk); 390 + if (queue_mode == NULL_Q_MQ) 391 + blk_mq_free_queue(nullb->q); 392 + else 393 + blk_cleanup_queue(nullb->q); 394 + put_disk(nullb->disk); 395 + kfree(nullb); 396 + } 397 + 398 + static int null_open(struct block_device *bdev, fmode_t mode) 399 + { 400 + return 0; 401 + } 402 + 403 + static void null_release(struct gendisk *disk, fmode_t mode) 404 + { 405 + } 406 + 407 + static const struct block_device_operations null_fops = { 408 + .owner = THIS_MODULE, 409 + .open = null_open, 410 + .release = null_release, 411 + }; 412 + 413 + static int setup_commands(struct nullb_queue *nq) 414 + { 415 + struct nullb_cmd *cmd; 416 + int i, tag_size; 417 + 418 + nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL); 419 + if (!nq->cmds) 420 + return 1; 421 + 422 + tag_size = 
ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; 423 + nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL); 424 + if (!nq->tag_map) { 425 + kfree(nq->cmds); 426 + return 1; 427 + } 428 + 429 + for (i = 0; i < nq->queue_depth; i++) { 430 + cmd = &nq->cmds[i]; 431 + INIT_LIST_HEAD(&cmd->list); 432 + cmd->ll_list.next = NULL; 433 + cmd->tag = -1U; 434 + } 435 + 436 + return 0; 437 + } 438 + 439 + static void cleanup_queue(struct nullb_queue *nq) 440 + { 441 + kfree(nq->tag_map); 442 + kfree(nq->cmds); 443 + } 444 + 445 + static void cleanup_queues(struct nullb *nullb) 446 + { 447 + int i; 448 + 449 + for (i = 0; i < nullb->nr_queues; i++) 450 + cleanup_queue(&nullb->queues[i]); 451 + 452 + kfree(nullb->queues); 453 + } 454 + 455 + static int setup_queues(struct nullb *nullb) 456 + { 457 + struct nullb_queue *nq; 458 + int i; 459 + 460 + nullb->queues = kzalloc(submit_queues * sizeof(*nq), GFP_KERNEL); 461 + if (!nullb->queues) 462 + return 1; 463 + 464 + nullb->nr_queues = 0; 465 + nullb->queue_depth = hw_queue_depth; 466 + 467 + if (queue_mode == NULL_Q_MQ) 468 + return 0; 469 + 470 + for (i = 0; i < submit_queues; i++) { 471 + nq = &nullb->queues[i]; 472 + init_waitqueue_head(&nq->wait); 473 + nq->queue_depth = hw_queue_depth; 474 + if (setup_commands(nq)) 475 + break; 476 + nullb->nr_queues++; 477 + } 478 + 479 + if (i == submit_queues) 480 + return 0; 481 + 482 + cleanup_queues(nullb); 483 + return 1; 484 + } 485 + 486 + static int null_add_dev(void) 487 + { 488 + struct gendisk *disk; 489 + struct nullb *nullb; 490 + sector_t size; 491 + 492 + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); 493 + if (!nullb) 494 + return -ENOMEM; 495 + 496 + spin_lock_init(&nullb->lock); 497 + 498 + if (setup_queues(nullb)) 499 + goto err; 500 + 501 + if (queue_mode == NULL_Q_MQ) { 502 + null_mq_reg.numa_node = home_node; 503 + null_mq_reg.queue_depth = hw_queue_depth; 504 + 505 + if (use_per_node_hctx) { 506 + null_mq_reg.ops->alloc_hctx = 
null_alloc_hctx; 507 + null_mq_reg.ops->free_hctx = null_free_hctx; 508 + 509 + null_mq_reg.nr_hw_queues = nr_online_nodes; 510 + } else { 511 + null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; 512 + null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; 513 + 514 + null_mq_reg.nr_hw_queues = submit_queues; 515 + } 516 + 517 + nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); 518 + } else if (queue_mode == NULL_Q_BIO) { 519 + nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); 520 + blk_queue_make_request(nullb->q, null_queue_bio); 521 + } else { 522 + nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); 523 + blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 524 + if (nullb->q) 525 + blk_queue_softirq_done(nullb->q, null_softirq_done_fn); 526 + } 527 + 528 + if (!nullb->q) 529 + goto queue_fail; 530 + 531 + nullb->q->queuedata = nullb; 532 + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); 533 + 534 + disk = nullb->disk = alloc_disk_node(1, home_node); 535 + if (!disk) { 536 + queue_fail: 537 + if (queue_mode == NULL_Q_MQ) 538 + blk_mq_free_queue(nullb->q); 539 + else 540 + blk_cleanup_queue(nullb->q); 541 + cleanup_queues(nullb); 542 + err: 543 + kfree(nullb); 544 + return -ENOMEM; 545 + } 546 + 547 + mutex_lock(&lock); 548 + list_add_tail(&nullb->list, &nullb_list); 549 + nullb->index = nullb_indexes++; 550 + mutex_unlock(&lock); 551 + 552 + blk_queue_logical_block_size(nullb->q, bs); 553 + blk_queue_physical_block_size(nullb->q, bs); 554 + 555 + size = gb * 1024 * 1024 * 1024ULL; 556 + sector_div(size, bs); 557 + set_capacity(disk, size); 558 + 559 + disk->flags |= GENHD_FL_EXT_DEVT; 560 + disk->major = null_major; 561 + disk->first_minor = nullb->index; 562 + disk->fops = &null_fops; 563 + disk->private_data = nullb; 564 + disk->queue = nullb->q; 565 + sprintf(disk->disk_name, "nullb%d", nullb->index); 566 + add_disk(disk); 567 + return 0; 568 + } 569 + 570 + static int __init null_init(void) 571 + { 572 + unsigned int 
i; 573 + 574 + #if !defined(CONFIG_SMP) || !defined(CONFIG_USE_GENERIC_SMP_HELPERS) 575 + if (irqmode == NULL_IRQ_SOFTIRQ) { 576 + pr_warn("null_blk: softirq completions not available.\n"); 577 + pr_warn("null_blk: using direct completions.\n"); 578 + irqmode = NULL_IRQ_NONE; 579 + } 580 + #endif 581 + 582 + if (submit_queues > nr_cpu_ids) 583 + submit_queues = nr_cpu_ids; 584 + else if (!submit_queues) 585 + submit_queues = 1; 586 + 587 + mutex_init(&lock); 588 + 589 + /* Initialize a separate list for each CPU for issuing softirqs */ 590 + for_each_possible_cpu(i) { 591 + struct completion_queue *cq = &per_cpu(completion_queues, i); 592 + 593 + init_llist_head(&cq->list); 594 + 595 + if (irqmode != NULL_IRQ_TIMER) 596 + continue; 597 + 598 + hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 599 + cq->timer.function = null_cmd_timer_expired; 600 + } 601 + 602 + null_major = register_blkdev(0, "nullb"); 603 + if (null_major < 0) 604 + return null_major; 605 + 606 + for (i = 0; i < nr_devices; i++) { 607 + if (null_add_dev()) { 608 + unregister_blkdev(null_major, "nullb"); 609 + return -EINVAL; 610 + } 611 + } 612 + 613 + pr_info("null: module loaded\n"); 614 + return 0; 615 + } 616 + 617 + static void __exit null_exit(void) 618 + { 619 + struct nullb *nullb; 620 + 621 + unregister_blkdev(null_major, "nullb"); 622 + 623 + mutex_lock(&lock); 624 + while (!list_empty(&nullb_list)) { 625 + nullb = list_entry(nullb_list.next, struct nullb, list); 626 + null_del_dev(nullb); 627 + } 628 + mutex_unlock(&lock); 629 + } 630 + 631 + module_init(null_init); 632 + module_exit(null_exit); 633 + 634 + MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>"); 635 + MODULE_LICENSE("GPL");

+1 -1

drivers/scsi/sd.c

··· 1002 1002 SCpnt->cmnd[0] = READ_6; 1003 1003 SCpnt->sc_data_direction = DMA_FROM_DEVICE; 1004 1004 } else { 1005 - scmd_printk(KERN_ERR, SCpnt, "Unknown command %x\n", rq->cmd_flags); 1005 + scmd_printk(KERN_ERR, SCpnt, "Unknown command %llx\n", (unsigned long long) rq->cmd_flags); 1006 1006 goto out; 1007 1007 } 1008 1008

+2

include/linux/bio.h

··· 420 420 bl->head = bl->tail = NULL; 421 421 } 422 422 423 + #define BIO_EMPTY_LIST { NULL, NULL } 424 + 423 425 #define bio_list_for_each(bio, bl) \ 424 426 for (bio = (bl)->head; bio; bio = bio->bi_next) 425 427

+183

include/linux/blk-mq.h

··· 1 + #ifndef BLK_MQ_H 2 + #define BLK_MQ_H 3 + 4 + #include <linux/blkdev.h> 5 + 6 + struct blk_mq_tags; 7 + 8 + struct blk_mq_cpu_notifier { 9 + struct list_head list; 10 + void *data; 11 + void (*notify)(void *data, unsigned long action, unsigned int cpu); 12 + }; 13 + 14 + struct blk_mq_hw_ctx { 15 + struct { 16 + spinlock_t lock; 17 + struct list_head dispatch; 18 + } ____cacheline_aligned_in_smp; 19 + 20 + unsigned long state; /* BLK_MQ_S_* flags */ 21 + struct delayed_work delayed_work; 22 + 23 + unsigned long flags; /* BLK_MQ_F_* flags */ 24 + 25 + struct request_queue *queue; 26 + unsigned int queue_num; 27 + 28 + void *driver_data; 29 + 30 + unsigned int nr_ctx; 31 + struct blk_mq_ctx **ctxs; 32 + unsigned int nr_ctx_map; 33 + unsigned long *ctx_map; 34 + 35 + struct request **rqs; 36 + struct list_head page_list; 37 + struct blk_mq_tags *tags; 38 + 39 + unsigned long queued; 40 + unsigned long run; 41 + #define BLK_MQ_MAX_DISPATCH_ORDER 10 42 + unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 43 + 44 + unsigned int queue_depth; 45 + unsigned int numa_node; 46 + unsigned int cmd_size; /* per-request extra data */ 47 + 48 + struct blk_mq_cpu_notifier cpu_notifier; 49 + struct kobject kobj; 50 + }; 51 + 52 + struct blk_mq_reg { 53 + struct blk_mq_ops *ops; 54 + unsigned int nr_hw_queues; 55 + unsigned int queue_depth; 56 + unsigned int reserved_tags; 57 + unsigned int cmd_size; /* per-request extra data */ 58 + int numa_node; 59 + unsigned int timeout; 60 + unsigned int flags; /* BLK_MQ_F_* */ 61 + }; 62 + 63 + typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); 64 + typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); 65 + typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int); 66 + typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 67 + typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 68 + typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, 
unsigned int); 69 + 70 + struct blk_mq_ops { 71 + /* 72 + * Queue request 73 + */ 74 + queue_rq_fn *queue_rq; 75 + 76 + /* 77 + * Map to specific hardware queue 78 + */ 79 + map_queue_fn *map_queue; 80 + 81 + /* 82 + * Called on request timeout 83 + */ 84 + rq_timed_out_fn *timeout; 85 + 86 + /* 87 + * Override for hctx allocations (should probably go) 88 + */ 89 + alloc_hctx_fn *alloc_hctx; 90 + free_hctx_fn *free_hctx; 91 + 92 + /* 93 + * Called when the block layer side of a hardware queue has been 94 + * set up, allowing the driver to allocate/init matching structures. 95 + * Ditto for exit/teardown. 96 + */ 97 + init_hctx_fn *init_hctx; 98 + exit_hctx_fn *exit_hctx; 99 + }; 100 + 101 + enum { 102 + BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ 103 + BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ 104 + BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ 105 + 106 + BLK_MQ_F_SHOULD_MERGE = 1 << 0, 107 + BLK_MQ_F_SHOULD_SORT = 1 << 1, 108 + BLK_MQ_F_SHOULD_IPI = 1 << 2, 109 + 110 + BLK_MQ_S_STOPPED = 1 << 0, 111 + 112 + BLK_MQ_MAX_DEPTH = 2048, 113 + }; 114 + 115 + struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); 116 + void blk_mq_free_queue(struct request_queue *); 117 + int blk_mq_register_disk(struct gendisk *); 118 + void blk_mq_unregister_disk(struct gendisk *); 119 + void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 120 + 121 + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 122 + 123 + void blk_mq_insert_request(struct request_queue *, struct request *, bool); 124 + void blk_mq_run_queues(struct request_queue *q, bool async); 125 + void blk_mq_free_request(struct request *rq); 126 + bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 127 + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); 128 + struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, 
gfp_t gfp); 129 + struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); 130 + 131 + struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 132 + struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); 133 + void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); 134 + 135 + void blk_mq_end_io(struct request *rq, int error); 136 + 137 + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 138 + void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 139 + void blk_mq_stop_hw_queues(struct request_queue *q); 140 + void blk_mq_start_stopped_hw_queues(struct request_queue *q); 141 + 142 + /* 143 + * Driver command data is immediately after the request. So subtract request 144 + * size to get back to the original request. 145 + */ 146 + static inline struct request *blk_mq_rq_from_pdu(void *pdu) 147 + { 148 + return pdu - sizeof(struct request); 149 + } 150 + static inline void *blk_mq_rq_to_pdu(struct request *rq) 151 + { 152 + return (void *) rq + sizeof(*rq); 153 + } 154 + 155 + static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, 156 + unsigned int tag) 157 + { 158 + return hctx->rqs[tag]; 159 + } 160 + 161 + #define queue_for_each_hw_ctx(q, hctx, i) \ 162 + for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \ 163 + (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i]) 164 + 165 + #define queue_for_each_ctx(q, ctx, i) \ 166 + for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \ 167 + (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i))) 168 + 169 + #define hctx_for_each_ctx(hctx, ctx, i) \ 170 + for ((i) = 0, ctx = (hctx)->ctxs[0]; \ 171 + (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)]) 172 + 173 + #define blk_ctx_sum(q, sum) \ 174 + ({ \ 175 + struct blk_mq_ctx *__x; \ 176 + unsigned int __ret = 0, __i; \ 177 + \ 178 + queue_for_each_ctx((q), __x, __i) \ 179 + __ret += sum; \ 180 + __ret; \ 181 + }) 182 + 183 + #endif

+34 -32

include/linux/blk_types.h

··· 178 178 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 179 179 __REQ_KERNEL, /* direct IO to kernel pages */ 180 180 __REQ_PM, /* runtime pm request */ 181 + __REQ_END, /* last of chain of requests */ 181 182 __REQ_NR_BITS, /* stops here */ 182 183 }; 183 184 184 - #define REQ_WRITE (1 << __REQ_WRITE) 185 - #define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) 186 - #define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) 187 - #define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) 188 - #define REQ_SYNC (1 << __REQ_SYNC) 189 - #define REQ_META (1 << __REQ_META) 190 - #define REQ_PRIO (1 << __REQ_PRIO) 191 - #define REQ_DISCARD (1 << __REQ_DISCARD) 192 - #define REQ_WRITE_SAME (1 << __REQ_WRITE_SAME) 193 - #define REQ_NOIDLE (1 << __REQ_NOIDLE) 185 + #define REQ_WRITE (1ULL << __REQ_WRITE) 186 + #define REQ_FAILFAST_DEV (1ULL << __REQ_FAILFAST_DEV) 187 + #define REQ_FAILFAST_TRANSPORT (1ULL << __REQ_FAILFAST_TRANSPORT) 188 + #define REQ_FAILFAST_DRIVER (1ULL << __REQ_FAILFAST_DRIVER) 189 + #define REQ_SYNC (1ULL << __REQ_SYNC) 190 + #define REQ_META (1ULL << __REQ_META) 191 + #define REQ_PRIO (1ULL << __REQ_PRIO) 192 + #define REQ_DISCARD (1ULL << __REQ_DISCARD) 193 + #define REQ_WRITE_SAME (1ULL << __REQ_WRITE_SAME) 194 + #define REQ_NOIDLE (1ULL << __REQ_NOIDLE) 194 195 195 196 #define REQ_FAILFAST_MASK \ 196 197 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) ··· 207 206 #define REQ_NOMERGE_FLAGS \ 208 207 (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) 209 208 210 - #define REQ_RAHEAD (1 << __REQ_RAHEAD) 211 - #define REQ_THROTTLED (1 << __REQ_THROTTLED) 209 + #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) 210 + #define REQ_THROTTLED (1ULL << __REQ_THROTTLED) 212 211 213 - #define REQ_SORTED (1 << __REQ_SORTED) 214 - #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 215 - #define REQ_FUA (1 << __REQ_FUA) 216 - #define REQ_NOMERGE (1 << __REQ_NOMERGE) 217 - #define REQ_STARTED (1 << __REQ_STARTED) 218 
- #define REQ_DONTPREP (1 << __REQ_DONTPREP) 219 - #define REQ_QUEUED (1 << __REQ_QUEUED) 220 - #define REQ_ELVPRIV (1 << __REQ_ELVPRIV) 221 - #define REQ_FAILED (1 << __REQ_FAILED) 222 - #define REQ_QUIET (1 << __REQ_QUIET) 223 - #define REQ_PREEMPT (1 << __REQ_PREEMPT) 224 - #define REQ_ALLOCED (1 << __REQ_ALLOCED) 225 - #define REQ_COPY_USER (1 << __REQ_COPY_USER) 226 - #define REQ_FLUSH (1 << __REQ_FLUSH) 227 - #define REQ_FLUSH_SEQ (1 << __REQ_FLUSH_SEQ) 228 - #define REQ_IO_STAT (1 << __REQ_IO_STAT) 229 - #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 230 - #define REQ_SECURE (1 << __REQ_SECURE) 231 - #define REQ_KERNEL (1 << __REQ_KERNEL) 232 - #define REQ_PM (1 << __REQ_PM) 212 + #define REQ_SORTED (1ULL << __REQ_SORTED) 213 + #define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER) 214 + #define REQ_FUA (1ULL << __REQ_FUA) 215 + #define REQ_NOMERGE (1ULL << __REQ_NOMERGE) 216 + #define REQ_STARTED (1ULL << __REQ_STARTED) 217 + #define REQ_DONTPREP (1ULL << __REQ_DONTPREP) 218 + #define REQ_QUEUED (1ULL << __REQ_QUEUED) 219 + #define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV) 220 + #define REQ_FAILED (1ULL << __REQ_FAILED) 221 + #define REQ_QUIET (1ULL << __REQ_QUIET) 222 + #define REQ_PREEMPT (1ULL << __REQ_PREEMPT) 223 + #define REQ_ALLOCED (1ULL << __REQ_ALLOCED) 224 + #define REQ_COPY_USER (1ULL << __REQ_COPY_USER) 225 + #define REQ_FLUSH (1ULL << __REQ_FLUSH) 226 + #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) 227 + #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) 228 + #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) 229 + #define REQ_SECURE (1ULL << __REQ_SECURE) 230 + #define REQ_KERNEL (1ULL << __REQ_KERNEL) 231 + #define REQ_PM (1ULL << __REQ_PM) 232 + #define REQ_END (1ULL << __REQ_END) 233 233 234 234 #endif /* __LINUX_BLK_TYPES_H */

+49 -11

include/linux/blkdev.h

··· 8 8 #include <linux/major.h> 9 9 #include <linux/genhd.h> 10 10 #include <linux/list.h> 11 + #include <linux/llist.h> 11 12 #include <linux/timer.h> 12 13 #include <linux/workqueue.h> 13 14 #include <linux/pagemap.h> ··· 95 94 * as well! 96 95 */ 97 96 struct request { 98 - struct list_head queuelist; 99 - struct call_single_data csd; 97 + union { 98 + struct list_head queuelist; 99 + struct llist_node ll_list; 100 + }; 101 + union { 102 + struct call_single_data csd; 103 + struct work_struct mq_flush_data; 104 + }; 100 105 101 106 struct request_queue *q; 107 + struct blk_mq_ctx *mq_ctx; 102 108 103 - unsigned int cmd_flags; 109 + u64 cmd_flags; 104 110 enum rq_cmd_type_bits cmd_type; 105 111 unsigned long atomic_flags; 106 112 ··· 168 160 169 161 unsigned short ioprio; 170 162 171 - int ref_count; 172 - 173 163 void *special; /* opaque pointer available for LLD use */ 174 164 char *buffer; /* kaddr of the current segment if available */ 175 165 ··· 220 214 }; 221 215 222 216 #include <linux/elevator.h> 217 + 218 + struct blk_queue_ctx; 223 219 224 220 typedef void (request_fn_proc) (struct request_queue *q); 225 221 typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); ··· 321 313 dma_drain_needed_fn *dma_drain_needed; 322 314 lld_busy_fn *lld_busy_fn; 323 315 316 + struct blk_mq_ops *mq_ops; 317 + 318 + unsigned int *mq_map; 319 + 320 + /* sw queues */ 321 + struct blk_mq_ctx *queue_ctx; 322 + unsigned int nr_queues; 323 + 324 + /* hw dispatch queues */ 325 + struct blk_mq_hw_ctx **queue_hw_ctx; 326 + unsigned int nr_hw_queues; 327 + 324 328 /* 325 329 * Dispatch queue sorting 326 330 */ ··· 380 360 * queue kobject 381 361 */ 382 362 struct kobject kobj; 363 + 364 + /* 365 + * mq queue kobject 366 + */ 367 + struct kobject mq_kobj; 383 368 384 369 #ifdef CONFIG_PM_RUNTIME 385 370 struct device *dev; ··· 450 425 unsigned long flush_pending_since; 451 426 struct list_head flush_queue[2]; 452 427 struct list_head flush_data_in_flight; 453 - 
struct request flush_rq; 428 + union { 429 + struct request flush_rq; 430 + struct { 431 + spinlock_t mq_flush_lock; 432 + struct work_struct mq_flush_work; 433 + }; 434 + }; 454 435 455 436 struct mutex sysfs_lock; 456 437 ··· 468 437 struct bsg_class_device bsg_dev; 469 438 #endif 470 439 471 - #ifdef CONFIG_BLK_CGROUP 472 - struct list_head all_q_node; 473 - #endif 474 440 #ifdef CONFIG_BLK_DEV_THROTTLING 475 441 /* Throttle data */ 476 442 struct throtl_data *td; 477 443 #endif 478 444 struct rcu_head rcu_head; 445 + wait_queue_head_t mq_freeze_wq; 446 + struct percpu_counter mq_usage_counter; 447 + struct list_head all_q_node; 479 448 }; 480 449 481 450 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ ··· 498 467 #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 499 468 #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 500 469 #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 470 + #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ 501 471 502 472 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 503 473 (1 << QUEUE_FLAG_STACKABLE) | \ ··· 571 539 #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) 572 540 #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 573 541 #define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) 542 + #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) 574 543 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 575 544 #define blk_queue_noxmerges(q) \ 576 545 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) ··· 603 570 604 571 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 605 572 606 - #define rq_data_dir(rq) ((rq)->cmd_flags & 1) 573 + #define rq_data_dir(rq) (((rq)->cmd_flags & 1) != 0) 607 574 608 575 static inline unsigned int blk_queue_cluster(struct request_queue *q) 609 576 { ··· 1046 1013 struct blk_plug { 1047 1014 unsigned long 
magic; /* detect uninitialized use-cases */ 1048 1015 struct list_head list; /* requests */ 1016 + struct list_head mq_list; /* blk-mq requests */ 1049 1017 struct list_head cb_list; /* md requires an unplug callback */ 1050 1018 }; 1051 1019 #define BLK_MAX_REQUEST_COUNT 16 ··· 1084 1050 { 1085 1051 struct blk_plug *plug = tsk->plug; 1086 1052 1087 - return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list)); 1053 + return plug && 1054 + (!list_empty(&plug->list) || 1055 + !list_empty(&plug->mq_list) || 1056 + !list_empty(&plug->cb_list)); 1088 1057 } 1089 1058 1090 1059 /* ··· 1362 1325 1363 1326 struct work_struct; 1364 1327 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1328 + int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); 1365 1329 1366 1330 #ifdef CONFIG_BLK_CGROUP 1367 1331 /*

+22 -1

include/linux/percpu_ida.h

··· 16 16 * percpu_ida_init() 17 17 */ 18 18 unsigned nr_tags; 19 + unsigned percpu_max_size; 20 + unsigned percpu_batch_size; 19 21 20 22 struct percpu_ida_cpu __percpu *tag_cpu; 21 23 ··· 53 51 } ____cacheline_aligned_in_smp; 54 52 }; 55 53 54 + /* 55 + * Number of tags we move between the percpu freelist and the global freelist at 56 + * a time 57 + */ 58 + #define IDA_DEFAULT_PCPU_BATCH_MOVE 32U 59 + /* Max size of percpu freelist, */ 60 + #define IDA_DEFAULT_PCPU_SIZE ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2) 61 + 56 62 int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp); 57 63 void percpu_ida_free(struct percpu_ida *pool, unsigned tag); 58 64 59 65 void percpu_ida_destroy(struct percpu_ida *pool); 60 - int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags); 66 + int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, 67 + unsigned long max_size, unsigned long batch_size); 68 + static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) 69 + { 70 + return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE, 71 + IDA_DEFAULT_PCPU_BATCH_MOVE); 72 + } 61 73 74 + typedef int (*percpu_ida_cb)(unsigned, void *); 75 + int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, 76 + void *data); 77 + 78 + unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu); 62 79 #endif /* __PERCPU_IDA_H__ */

+6 -1

kernel/smp.c

··· 18 18 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS 19 19 enum { 20 20 CSD_FLAG_LOCK = 0x01, 21 + CSD_FLAG_WAIT = 0x02, 21 22 }; 22 23 23 24 struct call_function_data { ··· 125 124 126 125 static void csd_unlock(struct call_single_data *csd) 127 126 { 128 - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 127 + WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); 129 128 130 129 /* 131 130 * ensure we're all done before releasing data: ··· 146 145 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 147 146 unsigned long flags; 148 147 int ipi; 148 + 149 + if (wait) 150 + csd->flags |= CSD_FLAG_WAIT; 149 151 150 152 raw_spin_lock_irqsave(&dst->lock, flags); 151 153 ipi = list_empty(&dst->list); ··· 344 340 } 345 341 put_cpu(); 346 342 } 343 + EXPORT_SYMBOL_GPL(__smp_call_function_single); 347 344 348 345 /** 349 346 * smp_call_function_many(): Run a function on a set of other CPUs.

+9 -6

lib/percpu_counter.c

··· 60 60 void percpu_counter_set(struct percpu_counter *fbc, s64 amount) 61 61 { 62 62 int cpu; 63 + unsigned long flags; 63 64 64 - raw_spin_lock(&fbc->lock); 65 + raw_spin_lock_irqsave(&fbc->lock, flags); 65 66 for_each_possible_cpu(cpu) { 66 67 s32 *pcount = per_cpu_ptr(fbc->counters, cpu); 67 68 *pcount = 0; 68 69 } 69 70 fbc->count = amount; 70 - raw_spin_unlock(&fbc->lock); 71 + raw_spin_unlock_irqrestore(&fbc->lock, flags); 71 72 } 72 73 EXPORT_SYMBOL(percpu_counter_set); 73 74 ··· 79 78 preempt_disable(); 80 79 count = __this_cpu_read(*fbc->counters) + amount; 81 80 if (count >= batch || count <= -batch) { 82 - raw_spin_lock(&fbc->lock); 81 + unsigned long flags; 82 + raw_spin_lock_irqsave(&fbc->lock, flags); 83 83 fbc->count += count; 84 - raw_spin_unlock(&fbc->lock); 84 + raw_spin_unlock_irqrestore(&fbc->lock, flags); 85 85 __this_cpu_write(*fbc->counters, 0); 86 86 } else { 87 87 __this_cpu_write(*fbc->counters, count); ··· 99 97 { 100 98 s64 ret; 101 99 int cpu; 100 + unsigned long flags; 102 101 103 - raw_spin_lock(&fbc->lock); 102 + raw_spin_lock_irqsave(&fbc->lock, flags); 104 103 ret = fbc->count; 105 104 for_each_online_cpu(cpu) { 106 105 s32 *pcount = per_cpu_ptr(fbc->counters, cpu); 107 106 ret += *pcount; 108 107 } 109 - raw_spin_unlock(&fbc->lock); 108 + raw_spin_unlock_irqrestore(&fbc->lock, flags); 110 109 return ret; 111 110 } 112 111 EXPORT_SYMBOL(__percpu_counter_sum);

+72 -17

lib/percpu_ida.c

··· 30 30 #include <linux/spinlock.h> 31 31 #include <linux/percpu_ida.h> 32 32 33 - /* 34 - * Number of tags we move between the percpu freelist and the global freelist at 35 - * a time 36 - */ 37 - #define IDA_PCPU_BATCH_MOVE 32U 38 - 39 - /* Max size of percpu freelist, */ 40 - #define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2) 41 - 42 33 struct percpu_ida_cpu { 43 34 /* 44 35 * Even though this is percpu, we need a lock for tag stealing by remote ··· 69 78 struct percpu_ida_cpu *remote; 70 79 71 80 for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags); 72 - cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2; 81 + cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2; 73 82 cpus_have_tags--) { 74 83 cpu = cpumask_next(cpu, &pool->cpus_have_tags); 75 84 ··· 114 123 { 115 124 move_tags(tags->freelist, &tags->nr_free, 116 125 pool->freelist, &pool->nr_free, 117 - min(pool->nr_free, IDA_PCPU_BATCH_MOVE)); 126 + min(pool->nr_free, pool->percpu_batch_size)); 118 127 } 119 128 120 129 static inline unsigned alloc_local_tag(struct percpu_ida *pool, ··· 236 245 wake_up(&pool->wait); 237 246 } 238 247 239 - if (nr_free == IDA_PCPU_SIZE) { 248 + if (nr_free == pool->percpu_max_size) { 240 249 spin_lock(&pool->lock); 241 250 242 251 /* 243 252 * Global lock held and irqs disabled, don't need percpu 244 253 * lock 245 254 */ 246 - if (tags->nr_free == IDA_PCPU_SIZE) { 255 + if (tags->nr_free == pool->percpu_max_size) { 247 256 move_tags(pool->freelist, &pool->nr_free, 248 257 tags->freelist, &tags->nr_free, 249 - IDA_PCPU_BATCH_MOVE); 258 + pool->percpu_batch_size); 250 259 251 260 wake_up(&pool->wait); 252 261 } ··· 283 292 * Allocation is percpu, but sharding is limited by nr_tags - for best 284 293 * performance, the workload should not span more cpus than nr_tags / 128. 
285 294 */ 286 - int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) 295 + int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags, 296 + unsigned long max_size, unsigned long batch_size) 287 297 { 288 298 unsigned i, cpu, order; 289 299 ··· 293 301 init_waitqueue_head(&pool->wait); 294 302 spin_lock_init(&pool->lock); 295 303 pool->nr_tags = nr_tags; 304 + pool->percpu_max_size = max_size; 305 + pool->percpu_batch_size = batch_size; 296 306 297 307 /* Guard against overflow */ 298 308 if (nr_tags > (unsigned) INT_MAX + 1) { ··· 313 319 pool->nr_free = nr_tags; 314 320 315 321 pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) + 316 - IDA_PCPU_SIZE * sizeof(unsigned), 322 + pool->percpu_max_size * sizeof(unsigned), 317 323 sizeof(unsigned)); 318 324 if (!pool->tag_cpu) 319 325 goto err; ··· 326 332 percpu_ida_destroy(pool); 327 333 return -ENOMEM; 328 334 } 329 - EXPORT_SYMBOL_GPL(percpu_ida_init); 335 + EXPORT_SYMBOL_GPL(__percpu_ida_init); 336 + 337 + /** 338 + * percpu_ida_for_each_free - iterate free ids of a pool 339 + * @pool: pool to iterate 340 + * @fn: iterate callback function 341 + * @data: parameter for @fn 342 + * 343 + * Note, this doesn't guarantee to iterate all free ids strictly. Some free 344 + * ids might be missed, some might be iterated more than once, and some might 345 + * be iterated and not free soon. 
346 + */ 347 + int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, 348 + void *data) 349 + { 350 + unsigned long flags; 351 + struct percpu_ida_cpu *remote; 352 + unsigned cpu, i, err = 0; 353 + 354 + local_irq_save(flags); 355 + for_each_possible_cpu(cpu) { 356 + remote = per_cpu_ptr(pool->tag_cpu, cpu); 357 + spin_lock(&remote->lock); 358 + for (i = 0; i < remote->nr_free; i++) { 359 + err = fn(remote->freelist[i], data); 360 + if (err) 361 + break; 362 + } 363 + spin_unlock(&remote->lock); 364 + if (err) 365 + goto out; 366 + } 367 + 368 + spin_lock(&pool->lock); 369 + for (i = 0; i < pool->nr_free; i++) { 370 + err = fn(pool->freelist[i], data); 371 + if (err) 372 + break; 373 + } 374 + spin_unlock(&pool->lock); 375 + out: 376 + local_irq_restore(flags); 377 + return err; 378 + } 379 + EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); 380 + 381 + /** 382 + * percpu_ida_free_tags - return free tags number of a specific cpu or global pool 383 + * @pool: pool related 384 + * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids 385 + * 386 + * Note: this just returns a snapshot of free tags number. 387 + */ 388 + unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu) 389 + { 390 + struct percpu_ida_cpu *remote; 391 + if (cpu == nr_cpu_ids) 392 + return pool->nr_free; 393 + remote = per_cpu_ptr(pool->tag_cpu, cpu); 394 + return remote->nr_free; 395 + } 396 + EXPORT_SYMBOL_GPL(percpu_ida_free_tags);