dm raid1: handle read failures

This patch adds the ability to respond to and record device failures
that occur during read operations. It also adds the ability to read
from mirror devices other than the primary, provided they are
in-sync.
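
A read can be served from any mirror whose error count is zero; the
patch below implements this in choose_mirror(), which walks the
mirror array and skips failed devices (shown here with comments
added):

	static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
	{
		struct mirror *m = get_default_mirror(ms);

		do {
			/* use the first mirror with no recorded errors */
			if (likely(!atomic_read(&m->error_count)))
				return m;

			/* step backwards, wrapping around the mirror array */
			if (m-- == ms->mirror)
				m += ms->nr_mirrors;
		} while (m != get_default_mirror(ms));

		/* every mirror has failed */
		return NULL;
	}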

There are essentially two read paths in mirroring: the direct path
and the queued path. When a read request is mapped, the direct path
is taken if the region is 'in-sync'; otherwise the queued path is
taken.
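
In mirror_map() that decision reduces to roughly the following
(abridged from the patch below, with comments added; 'r' holds the
dirty log's in-sync answer for the bio's region and error handling is
omitted):

	if (rw == WRITE) {
		/* Save region for mirror_end_io() handler */
		map_context->ll = bio_to_region(&ms->rh, bio);
		queue_bio(ms, bio, rw);
		return DM_MAPIO_SUBMITTED;
	}

	if (!r || (r == -EWOULDBLOCK)) {
		/* not in-sync: don't fast-track recovery for read-ahead */
		if (rw == READA)
			return -EWOULDBLOCK;

		/* queued path */
		queue_bio(ms, bio, rw);
		return DM_MAPIO_SUBMITTED;
	}

	/* direct path: the region is in-sync */
	m = choose_mirror(ms, bio->bi_sector);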

If the direct path is taken, we must record bio information so that
the read can be retried if it fails. The status of a direct read is
then discovered through mirror_end_io. If the read failed, we mark
the device from which the read was attempted as failed (so we don't
try to read from it again), restore the bio and try again.
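
Abridged from the patch below (with comments added; the mempool
failure and "no mirror left" cases are omitted), the record/retry
sequence looks like this:

	/* mirror_map(), direct path: remember how the bio looked */
	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
	if (likely(read_record)) {
		dm_bio_record(&read_record->details, bio);
		map_context->ptr = read_record;
		read_record->m = m;
	}
	map_bio(m, bio);

	/* mirror_end_io(), when that read comes back with an error */
	m = read_record->m;
	fail_mirror(m, DM_RAID1_READ_ERROR);

	if (default_ok(m) || mirror_available(ms, bio)) {
		/* undo the remap and hand the bio back to the daemon */
		dm_bio_restore(&read_record->details, bio);
		mempool_free(read_record, ms->read_record_pool);
		map_context->ptr = NULL;
		queue_bio(ms, bio, rw);
		return 1;
	}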

If the queued path is taken, we discover the result of the read in
'read_callback'. If the device failed, we mark it as failed and
attempt the read again, provided there is another device on which
this region is known to be 'in-sync'.
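
Abridged from read_callback() in the patch below (with comments
added; the warning and error messages are omitted):

	static void read_callback(unsigned long error, void *context)
	{
		struct bio *bio = context;
		struct mirror *m = bio_get_m(bio);

		bio_set_m(bio, NULL);

		if (likely(!error)) {
			bio_endio(bio, 0);
			return;
		}

		/* record the failure so this device is not used again */
		fail_mirror(m, DM_RAID1_READ_ERROR);

		/* retry through the daemon if an in-sync mirror remains */
		if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
			queue_bio(m->ms, bio, bio_rw(bio));
			return;
		}

		bio_endio(bio, -EIO);
	}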

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

+211 -45
drivers/md/dm-raid1.c
···
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
···
        struct bio_list failures;

        struct dm_io_client *io_client;
+       mempool_t *read_record_pool;

        /* recovery */
        region_t nr_regions;
···
        wake(rh->ms);
 }

+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+       struct mirror *m;
+       struct dm_bio_details details;
+};
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0

 /*
- * This is yucky. We squirrel the mirror_set struct away inside
- * bi_next for write buffers. This is safe since the bh
+ * This is yucky. We squirrel the mirror struct away inside
+ * bi_next for read/write buffers. This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-       return (struct mirror_set *) bio->bi_next;
+       return (struct mirror *) bio->bi_next;
 }

-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-       bio->bi_next = (struct bio *) ms;
+       bio->bi_next = (struct bio *) m;
 }

 static struct mirror *get_default_mirror(struct mirror_set *ms)
···
  *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-       /* FIXME: add read balancing */
-       return get_default_mirror(ms);
+       struct mirror *m = get_default_mirror(ms);
+
+       do {
+               if (likely(!atomic_read(&m->error_count)))
+                       return m;
+
+               if (m-- == ms->mirror)
+                       m += ms->nr_mirrors;
+       } while (m != get_default_mirror(ms));
+
+       return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+       struct mirror *default_mirror = get_default_mirror(m->ms);
+
+       return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+       region_t region = bio_to_region(&ms->rh, bio);
+
+       if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+               return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
+
+       return 0;
 }

 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+       return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
        bio->bi_bdev = m->dev->bdev;
-       bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+       bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+                      struct bio *bio)
+{
+       io->bdev = m->dev->bdev;
+       io->sector = map_sector(m, bio);
+       io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+       struct bio *bio = context;
+       struct mirror *m;
+
+       m = bio_get_m(bio);
+       bio_set_m(bio, NULL);
+
+       if (likely(!error)) {
+               bio_endio(bio, 0);
+               return;
+       }
+
+       fail_mirror(m, DM_RAID1_READ_ERROR);
+
+       if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+               DMWARN_LIMIT("Read failure on mirror device %s. "
+                            "Trying alternative device.",
+                            m->dev->name);
+               queue_bio(m->ms, bio, bio_rw(bio));
+               return;
+       }
+
+       DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+                   m->dev->name);
+       bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+       struct io_region io;
+       struct dm_io_request io_req = {
+               .bi_rw = READ,
+               .mem.type = DM_IO_BVEC,
+               .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+               .notify.fn = read_callback,
+               .notify.context = bio,
+               .client = m->ms->io_client,
+       };
+
+       map_region(&io, m, bio);
+       bio_set_m(bio, m);
+       (void) dm_io(&io_req, 1, &io, NULL);
 }

 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
···
        while ((bio = bio_list_pop(reads))) {
                region = bio_to_region(&ms->rh, bio);
+               m = get_default_mirror(ms);

                /*
                 * We can only read balance if the region is in sync.
                 */
-               if (rh_in_sync(&ms->rh, region, 1))
+               if (likely(rh_in_sync(&ms->rh, region, 1)))
                        m = choose_mirror(ms, bio->bi_sector);
-               else
-                       m = get_default_mirror(ms);
+               else if (m && atomic_read(&m->error_count))
+                       m = NULL;

-               map_bio(ms, m, bio);
-               generic_make_request(bio);
+               if (likely(m))
+                       read_async_bio(m, bio);
+               else
+                       bio_endio(bio, -EIO);
        }
 }
···
        int should_wake = 0;
        unsigned long flags;

-       ms = bio_get_ms(bio);
-       bio_set_ms(bio, NULL);
+       ms = bio_get_m(bio)->ms;
+       bio_set_m(bio, NULL);

        /*
         * NOTE: We don't decrement the pending count here,
···
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
        unsigned int i;
-       struct io_region io[KCOPYD_MAX_REGIONS+1];
+       struct io_region io[ms->nr_mirrors], *dest = io;
        struct mirror *m;
        struct dm_io_request io_req = {
                .bi_rw = WRITE,
···
                .client = ms->io_client,
        };

-       for (i = 0; i < ms->nr_mirrors; i++) {
-               m = ms->mirror + i;
-
-               io[i].bdev = m->dev->bdev;
-               io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-               io[i].count = bio->bi_size >> 9;
-       }
-
-       bio_set_ms(bio, ms);
+       for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+               map_region(dest++, m, bio);
+
+       /*
+        * Use default mirror because we only need it to retrieve the reference
+        * to the mirror set in write_callback().
+        */
+       bio_set_m(bio, get_default_mirror(ms));

        (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
···
                rh_delay(&ms->rh, bio);

        while ((bio = bio_list_pop(&nosync))) {
-               map_bio(ms, get_default_mirror(ms), bio);
+               map_bio(get_default_mirror(ms), bio);
                generic_make_request(bio);
        }
 }
···
        atomic_set(&ms->suspend, 0);
        atomic_set(&ms->default_mirror, DEFAULT_MIRROR);

+       len = sizeof(struct dm_raid1_read_record);
+       ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+                                                          len);
+       if (!ms->read_record_pool) {
+               ti->error = "Error creating mirror read_record_pool";
+               kfree(ms);
+               return NULL;
+       }
+
        ms->io_client = dm_io_client_create(DM_IO_PAGES);
        if (IS_ERR(ms->io_client)) {
                ti->error = "Error creating dm_io client";
+               mempool_destroy(ms->read_record_pool);
                kfree(ms);
                return NULL;
        }
···
        if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
                ti->error = "Error creating dirty region hash";
                dm_io_client_destroy(ms->io_client);
+               mempool_destroy(ms->read_record_pool);
                kfree(ms);
                return NULL;
        }
···
        dm_io_client_destroy(ms->io_client);
        rh_exit(&ms->rh);
+       mempool_destroy(ms->read_record_pool);
        kfree(ms);
 }
···
        int r, rw = bio_rw(bio);
        struct mirror *m;
        struct mirror_set *ms = ti->private;
-
-       map_context->ll = bio_to_region(&ms->rh, bio);
+       struct dm_raid1_read_record *read_record = NULL;

        if (rw == WRITE) {
+               /* Save region for mirror_end_io() handler */
+               map_context->ll = bio_to_region(&ms->rh, bio);
                queue_bio(ms, bio, rw);
                return DM_MAPIO_SUBMITTED;
        }
···
        if (r < 0 && r != -EWOULDBLOCK)
                return r;

-       if (r == -EWOULDBLOCK)  /* FIXME: ugly */
-               r = DM_MAPIO_SUBMITTED;
-
        /*
-        * We don't want to fast track a recovery just for a read
-        * ahead. So we just let it silently fail.
-        * FIXME: get rid of this.
+        * If region is not in-sync queue the bio.
         */
-       if (!r && rw == READA)
-               return -EIO;
+       if (!r || (r == -EWOULDBLOCK)) {
+               if (rw == READA)
+                       return -EWOULDBLOCK;

-       if (!r) {
-               /* Pass this io over to the daemon */
                queue_bio(ms, bio, rw);
                return DM_MAPIO_SUBMITTED;
        }

+       /*
+        * The region is in-sync and we can perform reads directly.
+        * Store enough information so we can retry if it fails.
+        */
        m = choose_mirror(ms, bio->bi_sector);
-       if (!m)
+       if (unlikely(!m))
                return -EIO;

-       map_bio(ms, m, bio);
+       read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+       if (likely(read_record)) {
+               dm_bio_record(&read_record->details, bio);
+               map_context->ptr = read_record;
+               read_record->m = m;
+       }
+
+       map_bio(m, bio);
+
        return DM_MAPIO_REMAPPED;
 }
···
 {
        int rw = bio_rw(bio);
        struct mirror_set *ms = (struct mirror_set *) ti->private;
-       region_t region = map_context->ll;
+       struct mirror *m = NULL;
+       struct dm_bio_details *bd = NULL;
+       struct dm_raid1_read_record *read_record = map_context->ptr;

        /*
         * We need to dec pending if this was a write.
         */
-       if (rw == WRITE)
-               rh_dec(&ms->rh, region);
+       if (rw == WRITE) {
+               rh_dec(&ms->rh, map_context->ll);
+               return error;
+       }

-       return 0;
+       if (error == -EOPNOTSUPP)
+               goto out;
+
+       if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+               goto out;
+
+       if (unlikely(error)) {
+               if (!read_record) {
+                       /*
+                        * There wasn't enough memory to record necessary
+                        * information for a retry or there was no other
+                        * mirror in-sync.
+                        */
+                       DMERR_LIMIT("Mirror read failed from %s.",
+                                   m->dev->name);
+                       return -EIO;
+               }
+               DMERR("Mirror read failed from %s. Trying alternative device.",
+                     m->dev->name);
+
+               m = read_record->m;
+               fail_mirror(m, DM_RAID1_READ_ERROR);
+
+               /*
+                * A failed read is requeued for another attempt using an intact
+                * mirror.
+                */
+               if (default_ok(m) || mirror_available(ms, bio)) {
+                       bd = &read_record->details;
+
+                       dm_bio_restore(bd, bio);
+                       mempool_free(read_record, ms->read_record_pool);
+                       map_context->ptr = NULL;
+                       queue_bio(ms, bio, rw);
+                       return 1;
+               }
+               DMERR("All replicated volumes dead, failing I/O");
+       }
+
+out:
+       if (read_record) {
+               mempool_free(read_record, ms->read_record_pool);
+               map_context->ptr = NULL;
+       }
+
+       return error;
 }

 static void mirror_presuspend(struct dm_target *ti)