dm mpath: change to be request based

This patch converts the dm-multipath target from bio-based to request-based.

Basically, the patch converts the I/O unit from struct bio
to struct request.
In the course of the conversion, it also changes the I/O queueing
mechanism. The change in the I/O queueing is described in detail
below.

I/O queueing mechanism change
-----------------------------
In I/O submission, map_io(), there is no mechanism change from
the bio-based version, since the clone request is ready for retry
as it is. However, in I/O completion, do_end_io(), there is a
mechanism change, since the clone request is not ready for retry.
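
For illustration, the submission side of the new map_io() boils down
to the following (condensed from the patch below; remap_clone() is
only an illustrative name, not a function added by this patch):

static void remap_clone(struct request *clone, struct pgpath *pgpath)
{
        struct block_device *bdev = pgpath->path.dev->bdev;

        /* Point the clone at the chosen path's queue; nothing to allocate. */
        clone->q = bdev_get_queue(bdev);
        clone->rq_disk = bdev->bd_disk;
}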

In the bio-based do_end_io(), the clone bio has all the memory
needed for resubmission, so the target driver can queue it and
resubmit it later without any memory allocation.
This mechanism has almost no overhead.
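
Roughly, the bio-based error path looked like the sketch below
(condensed into a standalone helper from the code this patch removes;
requeue_bio_in_target() is only an illustrative name):

static int requeue_bio_in_target(struct multipath *m, struct bio *bio,
                                 struct dm_mpath_io *mpio)
{
        unsigned long flags;

        /* The bio recorded at map time still has everything needed. */
        dm_bio_restore(&mpio->details, bio);

        /* Queue it for the kmultipathd daemon to resubmit later. */
        spin_lock_irqsave(&m->lock, flags);
        bio_list_add(&m->queued_ios, bio);
        m->queue_size++;
        if (!m->queue_io)
                queue_work(kmultipathd, &m->process_queued_ios);
        spin_unlock_irqrestore(&m->lock, flags);

        return DM_ENDIO_INCOMPLETE;     /* keep the bio, not complete yet */
}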

In the request-based do_end_io(), on the other hand, the clone
request doesn't have clone bios, so the target driver can't resubmit
it as it is. Resubmitting the clone request would require allocating
memory for clone bios, which adds overhead.
To avoid that overhead just for queueing, the target driver doesn't
queue the clone request inside itself.
Instead, the target driver asks dm core to queue and remap the
original request of the clone request; the only queueing overhead
is then freeing the memory of the clone request.

As a result, the target driver no longer needs to record and restore
the information of the original request in order to resubmit the
clone request, so dm_bio_details is removed from dm_mpath_io.
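
Condensed from the new do_end_io() in the patch below (the -EOPNOTSUPP
special case is omitted here), the completion handler now only chooses
between completing, failing and handing the original request back to
dm core for requeueing; it neither restores nor queues anything itself:

static int do_end_io(struct multipath *m, struct request *clone,
                     int error, struct dm_mpath_io *mpio)
{
        int r = DM_ENDIO_REQUEUE;       /* let dm core requeue the original */
        unsigned long flags;

        if (!error && !clone->errors)
                return 0;               /* I/O complete */

        if (mpio->pgpath)
                fail_path(mpio->pgpath);

        spin_lock_irqsave(&m->lock, flags);
        if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
                r = -EIO;               /* no usable path and not queueing */
        spin_unlock_irqrestore(&m->lock, flags);

        return r;
}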

multipath_busy()
----------------
The target driver returns "busy" only when both of the following
hold if its map function is called now:
o the target driver will map the I/Os,
and
o the mapped I/Os will wait on the underlying devices' queues due to
  their congestion.

In other cases, the target driver doesn't return "busy".
Otherwise, dm core would hold back the I/Os and the target driver
couldn't do what it wants with them
(e.g. when the target driver can't map I/Os now, it may want to kill
them instead of queueing them).
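
The per-priority-group part of that check, condensed from the new
multipath_busy() in the patch below (pg_busy() is only an illustrative
helper; the patch open-codes this inside multipath_busy()):

static int pg_busy(struct priority_group *pg)
{
        struct pgpath *pgpath;
        int busy = 1, has_active = 0;

        /* One non-busy active path is enough for the path selector. */
        list_for_each_entry(pgpath, &pg->pgpaths, list)
                if (pgpath->is_active) {
                        has_active = 1;
                        if (!__pgpath_busy(pgpath)) {
                                busy = 0;
                                break;
                        }
                }

        /* No active path: this pg won't be used, so just try mapping. */
        return has_active ? busy : 0;
}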

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Authored by Kiyoshi Ueda, committed by Alasdair G Kergon (f40c67f0 523d9297)

+128 -65
drivers/md/dm-mpath.c
···
 #include <linux/device-mapper.h>
 
 #include "dm-path-selector.h"
-#include "dm-bio-record.h"
 #include "dm-uevent.h"
 
 #include <linux/ctype.h>
···
        unsigned pg_init_count;         /* Number of times pg_init called */
 
        struct work_struct process_queued_ios;
-       struct bio_list queued_ios;
+       struct list_head queued_ios;
        unsigned queue_size;
 
        struct work_struct trigger_event;
···
  */
 struct dm_mpath_io {
        struct pgpath *pgpath;
-       struct dm_bio_details details;
        size_t nr_bytes;
 };
 
···
        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (m) {
                INIT_LIST_HEAD(&m->priority_groups);
+               INIT_LIST_HEAD(&m->queued_ios);
                spin_lock_init(&m->lock);
                m->queue_io = 1;
                INIT_WORK(&m->process_queued_ios, process_queued_ios);
···
                dm_noflush_suspending(m->ti));
 }
 
-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
                  struct dm_mpath_io *mpio, unsigned was_queued)
 {
        int r = DM_MAPIO_REMAPPED;
-       size_t nr_bytes = bio->bi_size;
+       size_t nr_bytes = blk_rq_bytes(clone);
        unsigned long flags;
        struct pgpath *pgpath;
+       struct block_device *bdev;
 
        spin_lock_irqsave(&m->lock, flags);
···
        if ((pgpath && m->queue_io) ||
            (!pgpath && m->queue_if_no_path)) {
                /* Queue for the daemon to resubmit */
-               bio_list_add(&m->queued_ios, bio);
+               list_add_tail(&clone->queuelist, &m->queued_ios);
                m->queue_size++;
                if ((m->pg_init_required && !m->pg_init_in_progress) ||
                    !m->queue_io)
                        queue_work(kmultipathd, &m->process_queued_ios);
                pgpath = NULL;
                r = DM_MAPIO_SUBMITTED;
-       } else if (pgpath)
-               bio->bi_bdev = pgpath->path.dev->bdev;
-       else if (__must_push_back(m))
+       } else if (pgpath) {
+               bdev = pgpath->path.dev->bdev;
+               clone->q = bdev_get_queue(bdev);
+               clone->rq_disk = bdev->bd_disk;
+       } else if (__must_push_back(m))
                r = DM_MAPIO_REQUEUE;
        else
                r = -EIO;       /* Failed */
···
 {
        int r;
        unsigned long flags;
-       struct bio *bio = NULL, *next;
        struct dm_mpath_io *mpio;
        union map_info *info;
+       struct request *clone, *n;
+       LIST_HEAD(cl);
 
        spin_lock_irqsave(&m->lock, flags);
-       bio = bio_list_get(&m->queued_ios);
+       list_splice_init(&m->queued_ios, &cl);
        spin_unlock_irqrestore(&m->lock, flags);
 
-       while (bio) {
-               next = bio->bi_next;
-               bio->bi_next = NULL;
+       list_for_each_entry_safe(clone, n, &cl, queuelist) {
+               list_del_init(&clone->queuelist);
 
-               info = dm_get_mapinfo(bio);
+               info = dm_get_rq_mapinfo(clone);
                mpio = info->ptr;
 
-               r = map_io(m, bio, mpio, 1);
-               if (r < 0)
-                       bio_endio(bio, r);
-               else if (r == DM_MAPIO_REMAPPED)
-                       generic_make_request(bio);
-               else if (r == DM_MAPIO_REQUEUE)
-                       bio_endio(bio, -EIO);
-
-               bio = next;
+               r = map_io(m, clone, mpio, 1);
+               if (r < 0) {
+                       mempool_free(mpio, m->mpio_pool);
+                       dm_kill_unmapped_request(clone, r);
+               } else if (r == DM_MAPIO_REMAPPED)
+                       dm_dispatch_request(clone);
+               else if (r == DM_MAPIO_REQUEUE) {
+                       mempool_free(mpio, m->mpio_pool);
+                       dm_requeue_unmapped_request(clone);
+               }
        }
 }
···
 }
 
 /*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
  */
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
                         union map_info *map_context)
 {
        int r;
        struct dm_mpath_io *mpio;
        struct multipath *m = (struct multipath *) ti->private;
 
-       mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-       dm_bio_record(&mpio->details, bio);
+       mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+       if (!mpio)
+               /* ENOMEM, requeue */
+               return DM_MAPIO_REQUEUE;
+       memset(mpio, 0, sizeof(*mpio));
 
        map_context->ptr = mpio;
-       bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
-       r = map_io(m, bio, mpio, 0);
+       clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+       r = map_io(m, clone, mpio, 0);
        if (r < 0 || r == DM_MAPIO_REQUEUE)
                mempool_free(mpio, m->mpio_pool);
···
 /*
  * end_io handling
  */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
                     int error, struct dm_mpath_io *mpio)
 {
+       /*
+        * We don't queue any clone request inside the multipath target
+        * during end I/O handling, since those clone requests don't have
+        * bio clones.  If we queue them inside the multipath target,
+        * we need to make bio clones, that requires memory allocation.
+        * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+        *  don't have bio clones.)
+        * Instead of queueing the clone request here, we queue the original
+        * request into dm core, which will remake a clone request and
+        * clone bios for it and resubmit it later.
+        */
+       int r = DM_ENDIO_REQUEUE;
        unsigned long flags;
 
-       if (!error)
+       if (!error && !clone->errors)
                return 0;       /* I/O complete */
-
-       if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-               return error;
 
        if (error == -EOPNOTSUPP)
                return error;
 
-       spin_lock_irqsave(&m->lock, flags);
-       if (!m->nr_valid_paths) {
-               if (__must_push_back(m)) {
-                       spin_unlock_irqrestore(&m->lock, flags);
-                       return DM_ENDIO_REQUEUE;
-               } else if (!m->queue_if_no_path) {
-                       spin_unlock_irqrestore(&m->lock, flags);
-                       return -EIO;
-               } else {
-                       spin_unlock_irqrestore(&m->lock, flags);
-                       goto requeue;
-               }
-       }
-       spin_unlock_irqrestore(&m->lock, flags);
-
        if (mpio->pgpath)
                fail_path(mpio->pgpath);
 
-requeue:
-       dm_bio_restore(&mpio->details, bio);
-
-       /* queue for the daemon to resubmit or fail */
        spin_lock_irqsave(&m->lock, flags);
-       bio_list_add(&m->queued_ios, bio);
-       m->queue_size++;
-       if (!m->queue_io)
-               queue_work(kmultipathd, &m->process_queued_ios);
+       if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
+               r = -EIO;
        spin_unlock_irqrestore(&m->lock, flags);
 
-       return DM_ENDIO_INCOMPLETE;     /* io not complete */
+       return r;
 }
 
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
                            int error, union map_info *map_context)
 {
        struct multipath *m = ti->private;
···
        struct path_selector *ps;
        int r;
 
-       r = do_end_io(m, bio, error, mpio);
+       r = do_end_io(m, clone, error, mpio);
        if (pgpath) {
                ps = &pgpath->pg->ps;
                if (ps->type->end_io)
                        ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
        }
-       if (r != DM_ENDIO_INCOMPLETE)
-               mempool_free(mpio, m->mpio_pool);
+       mempool_free(mpio, m->mpio_pool);
 
        return r;
 }
···
        return ret;
 }
 
+static int __pgpath_busy(struct pgpath *pgpath)
+{
+       struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
+
+       return dm_underlying_device_busy(q);
+}
+
+/*
+ * We return "busy", only when we can map I/Os but underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy". Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
+{
+       int busy = 0, has_active = 0;
+       struct multipath *m = ti->private;
+       struct priority_group *pg;
+       struct pgpath *pgpath;
+       unsigned long flags;
+
+       spin_lock_irqsave(&m->lock, flags);
+
+       /* Guess which priority_group will be used at next mapping time */
+       if (unlikely(!m->current_pgpath && m->next_pg))
+               pg = m->next_pg;
+       else if (likely(m->current_pg))
+               pg = m->current_pg;
+       else
+               /*
+                * We don't know which pg will be used at next mapping time.
+                * We don't call __choose_pgpath() here to avoid to trigger
+                * pg_init just by busy checking.
+                * So we don't know whether underlying devices we will be using
+                * at next mapping time are busy or not. Just try mapping.
+                */
+               goto out;
+
+       /*
+        * If there is one non-busy active path at least, the path selector
+        * will be able to select it. So we consider such a pg as not busy.
+        */
+       busy = 1;
+       list_for_each_entry(pgpath, &pg->pgpaths, list)
+               if (pgpath->is_active) {
+                       has_active = 1;
+
+                       if (!__pgpath_busy(pgpath)) {
+                               busy = 0;
+                               break;
+                       }
+               }
+
+       if (!has_active)
+               /*
+                * No active path in this pg, so this pg won't be used and
+                * the current_pg will be changed at next mapping time.
+                * We need to try mapping to determine it.
+                */
+               busy = 0;
+
+out:
+       spin_unlock_irqrestore(&m->lock, flags);
+
+       return busy;
+}
+
 /*-----------------------------------------------------------------
  * Module setup
  *---------------------------------------------------------------*/
···
        .module = THIS_MODULE,
        .ctr = multipath_ctr,
        .dtr = multipath_dtr,
-       .map = multipath_map,
-       .end_io = multipath_end_io,
+       .map_rq = multipath_map,
+       .rq_end_io = multipath_end_io,
        .presuspend = multipath_presuspend,
        .resume = multipath_resume,
        .status = multipath_status,
        .message = multipath_message,
        .ioctl = multipath_ioctl,
        .iterate_devices = multipath_iterate_devices,
+       .busy = multipath_busy,
 };
 
 static int __init dm_multipath_init(void)