dm mpath: change to be request based

This patch converts the dm-multipath target from bio-based to request-based.

Basically, the patch just converts the I/O unit from struct bio
to struct request.
In the course of the conversion, it also changes the I/O queueing
mechanism. The change to the I/O queueing is described in detail
below.

I/O queueing mechanism change
-----------------------------
In I/O submission, map_io(), there is no mechanism change from
bio-based, since the clone request can be retried as it is (see the
remap sketch after this paragraph).
However, in I/O completion, do_end_io(), there is a mechanism change
from bio-based, since the clone request is not ready for retry.
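For illustration, the submission-side remap, condensed from the map_io()
hunk in the patch below (locking, the queueing branch and the error cases
are omitted; this is not a complete function), just retargets the clone
at the chosen path's request queue:

    /* condensed from map_io() below: retarget the clone at the path */
    struct block_device *bdev = pgpath->path.dev->bdev;

    clone->q = bdev_get_queue(bdev);    /* dispatch on the path's queue */
    clone->rq_disk = bdev->bd_disk;
    r = DM_MAPIO_REMAPPED;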

In do_end_io() of bio-based, the clone bio holds all the memory needed
for resubmission, so the target driver can queue it and resubmit it
later without any memory allocation.
That mechanism has almost no overhead.

On the other hand, in do_end_io() of request-based, the clone request
doesn't have clone bios, so the target driver can't resubmit it as it
is. Resubmitting the clone request would require allocating memory for
clone bios, which incurs some overhead.
To avoid that overhead just for queueing, the target driver doesn't
queue the clone request inside itself.
Instead, it asks dm core to queue and remap the original request of
the clone request, since then the only queueing overhead is freeing
the memory of the clone request.

As a result, the target driver no longer needs to record/restore the
original request's information in order to resubmit the clone request,
so dm_bio_details is removed from dm_mpath_io. A condensed view of the
new completion path is sketched below.
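The sketch below is condensed from the new do_end_io() in the patch
(locking and the -EOPNOTSUPP case are omitted); the handler now only
chooses between completing, failing, or handing the original request
back to dm core for requeueing:

    int r = DM_ENDIO_REQUEUE;       /* default: dm core requeues the original */

    if (!error && !clone->errors)
            return 0;               /* I/O complete */

    if (mpio->pgpath)
            fail_path(mpio->pgpath);

    if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
            r = -EIO;               /* no usable path and not queueing: fail */

    return r;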

multipath_busy()
----------------
The target driver returns "busy" only when both of the following hold:
o the target driver would map I/Os if its map function were called now
and
o the mapped I/Os would then have to wait on the underlying device's
  queue because that device is congested.

In all other cases, the target driver doesn't return "busy".
Otherwise, dm core would hold back the I/Os and the target driver
couldn't do what it wants with them (e.g. when the target driver can't
map I/Os now, it may want to kill them instead). The decision is
sketched below.
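Condensed from multipath_busy() in the patch below (locking and the
priority group selection are omitted), the check reduces to:

    /* "busy" only if the pg has active paths and all of them are busy;
     * one non-busy active path means the path selector can still make
     * progress, and a pg without active paths will be replaced at the
     * next mapping, so I/Os must not be held back in either case.
     */
    busy = 1;
    list_for_each_entry(pgpath, &pg->pgpaths, list)
            if (pgpath->is_active) {
                    has_active = 1;
                    if (!__pgpath_busy(pgpath)) {
                            busy = 0;
                            break;
                    }
            }

    if (!has_active)
            busy = 0;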

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Authored by Kiyoshi Ueda and committed by Alasdair G Kergon (f40c67f0, 523d9297)

+128 -65
drivers/md/dm-mpath.c
···
 #include <linux/device-mapper.h>

 #include "dm-path-selector.h"
-#include "dm-bio-record.h"
 #include "dm-uevent.h"

 #include <linux/ctype.h>
···
         unsigned pg_init_count;         /* Number of times pg_init called */

         struct work_struct process_queued_ios;
-        struct bio_list queued_ios;
+        struct list_head queued_ios;
         unsigned queue_size;

         struct work_struct trigger_event;
···
  */
 struct dm_mpath_io {
         struct pgpath *pgpath;
-        struct dm_bio_details details;
         size_t nr_bytes;
 };

···
         m = kzalloc(sizeof(*m), GFP_KERNEL);
         if (m) {
                 INIT_LIST_HEAD(&m->priority_groups);
+                INIT_LIST_HEAD(&m->queued_ios);
                 spin_lock_init(&m->lock);
                 m->queue_io = 1;
                 INIT_WORK(&m->process_queued_ios, process_queued_ios);
···
                                  dm_noflush_suspending(m->ti));
 }

-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
                   struct dm_mpath_io *mpio, unsigned was_queued)
 {
         int r = DM_MAPIO_REMAPPED;
-        size_t nr_bytes = bio->bi_size;
+        size_t nr_bytes = blk_rq_bytes(clone);
         unsigned long flags;
         struct pgpath *pgpath;
+        struct block_device *bdev;

         spin_lock_irqsave(&m->lock, flags);
···
         if ((pgpath && m->queue_io) ||
             (!pgpath && m->queue_if_no_path)) {
                 /* Queue for the daemon to resubmit */
-                bio_list_add(&m->queued_ios, bio);
+                list_add_tail(&clone->queuelist, &m->queued_ios);
                 m->queue_size++;
                 if ((m->pg_init_required && !m->pg_init_in_progress) ||
                     !m->queue_io)
                         queue_work(kmultipathd, &m->process_queued_ios);
                 pgpath = NULL;
                 r = DM_MAPIO_SUBMITTED;
-        } else if (pgpath)
-                bio->bi_bdev = pgpath->path.dev->bdev;
-        else if (__must_push_back(m))
+        } else if (pgpath) {
+                bdev = pgpath->path.dev->bdev;
+                clone->q = bdev_get_queue(bdev);
+                clone->rq_disk = bdev->bd_disk;
+        } else if (__must_push_back(m))
                 r = DM_MAPIO_REQUEUE;
         else
                 r = -EIO;       /* Failed */
···
 {
         int r;
         unsigned long flags;
-        struct bio *bio = NULL, *next;
         struct dm_mpath_io *mpio;
         union map_info *info;
+        struct request *clone, *n;
+        LIST_HEAD(cl);

         spin_lock_irqsave(&m->lock, flags);
-        bio = bio_list_get(&m->queued_ios);
+        list_splice_init(&m->queued_ios, &cl);
         spin_unlock_irqrestore(&m->lock, flags);

-        while (bio) {
-                next = bio->bi_next;
-                bio->bi_next = NULL;
+        list_for_each_entry_safe(clone, n, &cl, queuelist) {
+                list_del_init(&clone->queuelist);

-                info = dm_get_mapinfo(bio);
+                info = dm_get_rq_mapinfo(clone);
                 mpio = info->ptr;

-                r = map_io(m, bio, mpio, 1);
-                if (r < 0)
-                        bio_endio(bio, r);
-                else if (r == DM_MAPIO_REMAPPED)
-                        generic_make_request(bio);
-                else if (r == DM_MAPIO_REQUEUE)
-                        bio_endio(bio, -EIO);
-
-                bio = next;
+                r = map_io(m, clone, mpio, 1);
+                if (r < 0) {
+                        mempool_free(mpio, m->mpio_pool);
+                        dm_kill_unmapped_request(clone, r);
+                } else if (r == DM_MAPIO_REMAPPED)
+                        dm_dispatch_request(clone);
+                else if (r == DM_MAPIO_REQUEUE) {
+                        mempool_free(mpio, m->mpio_pool);
+                        dm_requeue_unmapped_request(clone);
+                }
         }
 }
···
 }

 /*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
  */
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
                          union map_info *map_context)
 {
         int r;
         struct dm_mpath_io *mpio;
         struct multipath *m = (struct multipath *) ti->private;

-        mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-        dm_bio_record(&mpio->details, bio);
+        mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+        if (!mpio)
+                /* ENOMEM, requeue */
+                return DM_MAPIO_REQUEUE;
+        memset(mpio, 0, sizeof(*mpio));

         map_context->ptr = mpio;
-        bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
-        r = map_io(m, bio, mpio, 0);
+        clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+        r = map_io(m, clone, mpio, 0);
         if (r < 0 || r == DM_MAPIO_REQUEUE)
                 mempool_free(mpio, m->mpio_pool);

···
 /*
  * end_io handling
  */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
                      int error, struct dm_mpath_io *mpio)
 {
+        /*
+         * We don't queue any clone request inside the multipath target
+         * during end I/O handling, since those clone requests don't have
+         * bio clones. If we queue them inside the multipath target,
+         * we need to make bio clones, that requires memory allocation.
+         * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+         *  don't have bio clones.)
+         * Instead of queueing the clone request here, we queue the original
+         * request into dm core, which will remake a clone request and
+         * clone bios for it and resubmit it later.
+         */
+        int r = DM_ENDIO_REQUEUE;
         unsigned long flags;

-        if (!error)
+        if (!error && !clone->errors)
                 return 0;       /* I/O complete */
-
-        if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-                return error;

         if (error == -EOPNOTSUPP)
                 return error;

-        spin_lock_irqsave(&m->lock, flags);
-        if (!m->nr_valid_paths) {
-                if (__must_push_back(m)) {
-                        spin_unlock_irqrestore(&m->lock, flags);
-                        return DM_ENDIO_REQUEUE;
-                } else if (!m->queue_if_no_path) {
-                        spin_unlock_irqrestore(&m->lock, flags);
-                        return -EIO;
-                } else {
-                        spin_unlock_irqrestore(&m->lock, flags);
-                        goto requeue;
-                }
-        }
-        spin_unlock_irqrestore(&m->lock, flags);
-
         if (mpio->pgpath)
                 fail_path(mpio->pgpath);

-      requeue:
-        dm_bio_restore(&mpio->details, bio);
-
-        /* queue for the daemon to resubmit or fail */
         spin_lock_irqsave(&m->lock, flags);
-        bio_list_add(&m->queued_ios, bio);
-        m->queue_size++;
-        if (!m->queue_io)
-                queue_work(kmultipathd, &m->process_queued_ios);
+        if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
+                r = -EIO;
         spin_unlock_irqrestore(&m->lock, flags);

-        return DM_ENDIO_INCOMPLETE;     /* io not complete */
+        return r;
 }

-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
                             int error, union map_info *map_context)
 {
         struct multipath *m = ti->private;
···
         struct path_selector *ps;
         int r;

-        r = do_end_io(m, bio, error, mpio);
+        r = do_end_io(m, clone, error, mpio);
         if (pgpath) {
                 ps = &pgpath->pg->ps;
                 if (ps->type->end_io)
                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
         }
-        if (r != DM_ENDIO_INCOMPLETE)
-                mempool_free(mpio, m->mpio_pool);
+        mempool_free(mpio, m->mpio_pool);

         return r;
 }
···
         return ret;
 }

+static int __pgpath_busy(struct pgpath *pgpath)
+{
+        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
+
+        return dm_underlying_device_busy(q);
+}
+
+/*
+ * We return "busy", only when we can map I/Os but underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy". Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
+{
+        int busy = 0, has_active = 0;
+        struct multipath *m = ti->private;
+        struct priority_group *pg;
+        struct pgpath *pgpath;
+        unsigned long flags;
+
+        spin_lock_irqsave(&m->lock, flags);
+
+        /* Guess which priority_group will be used at next mapping time */
+        if (unlikely(!m->current_pgpath && m->next_pg))
+                pg = m->next_pg;
+        else if (likely(m->current_pg))
+                pg = m->current_pg;
+        else
+                /*
+                 * We don't know which pg will be used at next mapping time.
+                 * We don't call __choose_pgpath() here to avoid to trigger
+                 * pg_init just by busy checking.
+                 * So we don't know whether underlying devices we will be using
+                 * at next mapping time are busy or not. Just try mapping.
+                 */
+                goto out;
+
+        /*
+         * If there is one non-busy active path at least, the path selector
+         * will be able to select it. So we consider such a pg as not busy.
+         */
+        busy = 1;
+        list_for_each_entry(pgpath, &pg->pgpaths, list)
+                if (pgpath->is_active) {
+                        has_active = 1;
+
+                        if (!__pgpath_busy(pgpath)) {
+                                busy = 0;
+                                break;
+                        }
+                }
+
+        if (!has_active)
+                /*
+                 * No active path in this pg, so this pg won't be used and
+                 * the current_pg will be changed at next mapping time.
+                 * We need to try mapping to determine it.
+                 */
+                busy = 0;
+
+out:
+        spin_unlock_irqrestore(&m->lock, flags);
+
+        return busy;
+}
+
 /*-----------------------------------------------------------------
  * Module setup
  *---------------------------------------------------------------*/
···
         .module = THIS_MODULE,
         .ctr = multipath_ctr,
         .dtr = multipath_dtr,
-        .map = multipath_map,
-        .end_io = multipath_end_io,
+        .map_rq = multipath_map,
+        .rq_end_io = multipath_end_io,
         .presuspend = multipath_presuspend,
         .resume = multipath_resume,
         .status = multipath_status,
         .message = multipath_message,
         .ioctl = multipath_ioctl,
         .iterate_devices = multipath_iterate_devices,
+        .busy = multipath_busy,
 };

 static int __init dm_multipath_init(void)