dm raid1: handle read failures

This patch gives mirroring the ability to respond to and record device
failures that occur during read operations. It also adds the ability to
read from mirror devices other than the primary, provided they are
in-sync.

There are essentially two read paths in mirroring: the direct path and
the queued path. When a read request is mapped, the direct path is taken
if the region is 'in-sync'; otherwise the queued path is taken.
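
As a rough sketch (condensed from the mirror_map() hunk in the diff below,
with readahead handling and read-record bookkeeping omitted), the dispatch
looks like this, where r is the in-sync status of the region the bio maps to:

	if (!r || (r == -EWOULDBLOCK)) {
		/* Region not in-sync: hand the read to the daemon (queued path). */
		queue_bio(ms, bio, rw);
		return DM_MAPIO_SUBMITTED;
	}

	/* Region in-sync: remap and submit the read directly (direct path). */
	m = choose_mirror(ms, bio->bi_sector);
	if (unlikely(!m))
		return -EIO;

	map_bio(m, bio);
	return DM_MAPIO_REMAPPED;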

If the direct path is taken, we must record bio information so that the
read can be retried if it fails. The status of a direct read is then
discovered through mirror_end_io. If the read has failed, we mark the
device from which the read was attempted as failed (so we don't try to
read from it again), restore the bio and try again.
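
In outline (condensed from the mirror_map() and mirror_end_io() hunks in the
diff below), the record-and-retry cycle for the direct path is:

	/* mirror_map(): remember enough to replay the bio later. */
	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
	if (likely(read_record)) {
		dm_bio_record(&read_record->details, bio);
		map_context->ptr = read_record;
		read_record->m = m;
	}

	/* mirror_end_io(): on error, fail the mirror, restore the bio, requeue. */
	fail_mirror(read_record->m, DM_RAID1_READ_ERROR);
	dm_bio_restore(&read_record->details, bio);
	queue_bio(ms, bio, rw);

(The real mirror_end_io additionally checks, via default_ok/mirror_available,
that another in-sync mirror exists before requeueing, and frees the record.)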

If the queued path is taken, we discover the result of the read in
'read_callback'. If the device failed, we mark it as failed and retry the
read, provided there is another device on which this region is known to
be 'in-sync'.
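
Condensed from read_callback() in the diff below, the failure branch of the
queued path is roughly:

	fail_mirror(m, DM_RAID1_READ_ERROR);

	if (default_ok(m) || mirror_available(m->ms, bio)) {
		/* Another usable mirror exists for this region: retry via the daemon. */
		queue_bio(m->ms, bio, bio_rw(bio));
		return;
	}

	/* No in-sync mirror left to service the read. */
	bio_endio(bio, -EIO);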

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Authored by Jonathan Brassow and committed by Alasdair G Kergon (06386bbf, b80aa7a0).

+211 -45
drivers/md/dm-raid1.c
···

 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
···
 	struct bio_list failures;

 	struct dm_io_client *io_client;
+	mempool_t *read_record_pool;

 	/* recovery */
 	region_t nr_regions;
···
 	wake(rh->ms);
 }

+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+	struct mirror *m;
+	struct dm_bio_details details;
+};
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0

 /*
- * This is yucky.  We squirrel the mirror_set struct away inside
- * bi_next for write buffers.  This is safe since the bh
+ * This is yucky.  We squirrel the mirror struct away inside
+ * bi_next for read/write buffers.  This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-	return (struct mirror_set *) bio->bi_next;
+	return (struct mirror *) bio->bi_next;
 }

-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-	bio->bi_next = (struct bio *) ms;
+	bio->bi_next = (struct bio *) m;
 }

 static struct mirror *get_default_mirror(struct mirror_set *ms)
···
 *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-	/* FIXME: add read balancing */
-	return get_default_mirror(ms);
+	struct mirror *m = get_default_mirror(ms);
+
+	do {
+		if (likely(!atomic_read(&m->error_count)))
+			return m;
+
+		if (m-- == ms->mirror)
+			m += ms->nr_mirrors;
+	} while (m != get_default_mirror(ms));
+
+	return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+	struct mirror *default_mirror = get_default_mirror(m->ms);
+
+	return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+	region_t region = bio_to_region(&ms->rh, bio);
+
+	if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+		return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
+
+	return 0;
 }

 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+	return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+	bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+		       struct bio *bio)
+{
+	io->bdev = m->dev->bdev;
+	io->sector = map_sector(m, bio);
+	io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+	struct bio *bio = context;
+	struct mirror *m;
+
+	m = bio_get_m(bio);
+	bio_set_m(bio, NULL);
+
+	if (likely(!error)) {
+		bio_endio(bio, 0);
+		return;
+	}
+
+	fail_mirror(m, DM_RAID1_READ_ERROR);
+
+	if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+		DMWARN_LIMIT("Read failure on mirror device %s. "
+			     "Trying alternative device.",
+			     m->dev->name);
+		queue_bio(m->ms, bio, bio_rw(bio));
+		return;
+	}
+
+	DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+		    m->dev->name);
+	bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+	struct io_region io;
+	struct dm_io_request io_req = {
+		.bi_rw = READ,
+		.mem.type = DM_IO_BVEC,
+		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.notify.fn = read_callback,
+		.notify.context = bio,
+		.client = m->ms->io_client,
+	};
+
+	map_region(&io, m, bio);
+	bio_set_m(bio, m);
+	(void) dm_io(&io_req, 1, &io, NULL);
 }

 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
···

 	while ((bio = bio_list_pop(reads))) {
 		region = bio_to_region(&ms->rh, bio);
+		m = get_default_mirror(ms);

 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 1))
+		if (likely(rh_in_sync(&ms->rh, region, 1)))
 			m = choose_mirror(ms, bio->bi_sector);
-		else
-			m = get_default_mirror(ms);
+		else if (m && atomic_read(&m->error_count))
+			m = NULL;

-		map_bio(ms, m, bio);
-		generic_make_request(bio);
+		if (likely(m))
+			read_async_bio(m, bio);
+		else
+			bio_endio(bio, -EIO);
 	}
 }
···
 	int should_wake = 0;
 	unsigned long flags;

-	ms = bio_get_ms(bio);
-	bio_set_ms(bio, NULL);
+	ms = bio_get_m(bio)->ms;
+	bio_set_m(bio, NULL);

 	/*
 	 * NOTE: We don't decrement the pending count here,
···
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
-	struct io_region io[KCOPYD_MAX_REGIONS+1];
+	struct io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE,
···
 		.client = ms->io_client,
 	};

-	for (i = 0; i < ms->nr_mirrors; i++) {
-		m = ms->mirror + i;
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+		map_region(dest++, m, bio);

-		io[i].bdev = m->dev->bdev;
-		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
-	}
-
-	bio_set_ms(bio, ms);
+	/*
+	 * Use default mirror because we only need it to retrieve the reference
+	 * to the mirror set in write_callback().
+	 */
+	bio_set_m(bio, get_default_mirror(ms));

 	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
···
 		rh_delay(&ms->rh, bio);

 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, get_default_mirror(ms), bio);
+		map_bio(get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
···
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);

+	len = sizeof(struct dm_raid1_read_record);
+	ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+							   len);
+	if (!ms->read_record_pool) {
+		ti->error = "Error creating mirror read_record_pool";
+		kfree(ms);
+		return NULL;
+	}
+
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
 		ti->error = "Error creating dm_io client";
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
···
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "Error creating dirty region hash";
 		dm_io_client_destroy(ms->io_client);
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
···

 	dm_io_client_destroy(ms->io_client);
 	rh_exit(&ms->rh);
+	mempool_destroy(ms->read_record_pool);
 	kfree(ms);
 }
···
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-
-	map_context->ll = bio_to_region(&ms->rh, bio);
+	struct dm_raid1_read_record *read_record = NULL;

 	if (rw == WRITE) {
+		/* Save region for mirror_end_io() handler */
+		map_context->ll = bio_to_region(&ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
···
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;

-	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
-		r = DM_MAPIO_SUBMITTED;
-
 	/*
-	 * We don't want to fast track a recovery just for a read
-	 * ahead.  So we just let it silently fail.
-	 * FIXME: get rid of this.
+	 * If region is not in-sync queue the bio.
 	 */
-	if (!r && rw == READA)
-		return -EIO;
+	if (!r || (r == -EWOULDBLOCK)) {
+		if (rw == READA)
+			return -EWOULDBLOCK;

-	if (!r) {
-		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}

+	/*
+	 * The region is in-sync and we can perform reads directly.
+	 * Store enough information so we can retry if it fails.
+	 */
 	m = choose_mirror(ms, bio->bi_sector);
-	if (!m)
+	if (unlikely(!m))
 		return -EIO;

-	map_bio(ms, m, bio);
+	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+	if (likely(read_record)) {
+		dm_bio_record(&read_record->details, bio);
+		map_context->ptr = read_record;
+		read_record->m = m;
+	}
+
+	map_bio(m, bio);
+
 	return DM_MAPIO_REMAPPED;
 }
···
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
+	struct mirror *m = NULL;
+	struct dm_bio_details *bd = NULL;
+	struct dm_raid1_read_record *read_record = map_context->ptr;

 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		rh_dec(&ms->rh, map_context->ll);
+		return error;
+	}

-	return 0;
+	if (error == -EOPNOTSUPP)
+		goto out;
+
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		goto out;
+
+	if (unlikely(error)) {
+		if (!read_record) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry or there was no other
+			 * mirror in-sync.
+			 */
+			DMERR_LIMIT("Mirror read failed from %s.",
+				    m->dev->name);
+			return -EIO;
+		}
+		DMERR("Mirror read failed from %s. Trying alternative device.",
+		      m->dev->name);
+
+		m = read_record->m;
+		fail_mirror(m, DM_RAID1_READ_ERROR);
+
+		/*
+		 * A failed read is requeued for another attempt using an intact
+		 * mirror.
+		 */
+		if (default_ok(m) || mirror_available(ms, bio)) {
+			bd = &read_record->details;
+
+			dm_bio_restore(bd, bio);
+			mempool_free(read_record, ms->read_record_pool);
+			map_context->ptr = NULL;
+			queue_bio(ms, bio, rw);
+			return 1;
+		}
+		DMERR("All replicated volumes dead, failing I/O");
+	}
+
+out:
+	if (read_record) {
+		mempool_free(read_record, ms->read_record_pool);
+		map_context->ptr = NULL;
+	}
+
+	return error;
 }

 static void mirror_presuspend(struct dm_target *ti)