Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md/raid10: improve raid10 discard request

Now the discard request is split by chunk size, so it takes a long time
to finish mkfs on disks which support the discard function. This patch
improves the handling of raid10 discard requests. It uses a similar
approach to patch 29efc390b (md/md0: optimize raid0 discard handling).

But it's a little more complex than raid0, because raid10 has a different
layout. If raid10 uses the offset layout and the discard request is smaller
than the stripe size, there are some holes when we submit the discard bio
to the underlying disks.

For example: five disks (disk1 - disk5)
D01 D02 D03 D04 D05
D05 D01 D02 D03 D04
D06 D07 D08 D09 D10
D10 D06 D07 D08 D09
The discard bio just wants to discard from D03 to D10. For disk3, there is
a hole between D03 and D08. For disk4, there is a hole between D04 and D09.
D03 is a chunk, raid10_write_request can handle one chunk perfectly. So
the part that is not aligned with stripe size is still handled by
raid10_write_request.

If reshape is running when a discard bio comes in and the discard bio
spans the reshape position, raid10_write_request is responsible for
handling this discard bio.

I did a test with this patch set.
Without patch:
time mkfs.xfs /dev/md0
real	4m39.775s
user	0m0.000s
sys	0m0.298s

With patch:
time mkfs.xfs /dev/md0
real	0m0.105s
user	0m0.000s
sys	0m0.007s

nvme3n1 259:1 0 477G 0 disk
└─nvme3n1p1 259:10 0 50G 0 part
nvme4n1 259:2 0 477G 0 disk
└─nvme4n1p1 259:11 0 50G 0 part
nvme5n1 259:6 0 477G 0 disk
└─nvme5n1p1 259:12 0 50G 0 part
nvme2n1 259:9 0 477G 0 disk
└─nvme2n1p1 259:15 0 50G 0 part
nvme0n1 259:13 0 477G 0 disk
└─nvme0n1p1 259:14 0 50G 0 part

Reviewed-by: Coly Li <colyli@suse.de>
Reviewed-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Tested-by: Adrian Huang <ahuang12@lenovo.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>

authored by

Xiao Ni and committed by
Song Liu
d30588b2 f2e7e269

+262 -1
+262 -1
drivers/md/raid10.c
··· 1518 1518 raid10_write_request(mddev, bio, r10_bio); 1519 1519 } 1520 1520 1521 + static void raid10_end_discard_request(struct bio *bio) 1522 + { 1523 + struct r10bio *r10_bio = bio->bi_private; 1524 + struct r10conf *conf = r10_bio->mddev->private; 1525 + struct md_rdev *rdev = NULL; 1526 + int dev; 1527 + int slot, repl; 1528 + 1529 + /* 1530 + * We don't care the return value of discard bio 1531 + */ 1532 + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 1533 + set_bit(R10BIO_Uptodate, &r10_bio->state); 1534 + 1535 + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1536 + if (repl) 1537 + rdev = conf->mirrors[dev].replacement; 1538 + if (!rdev) { 1539 + /* 1540 + * raid10_remove_disk uses smp_mb to make sure rdev is set to 1541 + * replacement before setting replacement to NULL. It can read 1542 + * rdev first without barrier protect even replacment is NULL 1543 + */ 1544 + smp_rmb(); 1545 + rdev = conf->mirrors[dev].rdev; 1546 + } 1547 + 1548 + if (atomic_dec_and_test(&r10_bio->remaining)) { 1549 + md_write_end(r10_bio->mddev); 1550 + raid_end_bio_io(r10_bio); 1551 + } 1552 + 1553 + rdev_dec_pending(rdev, conf->mddev); 1554 + } 1555 + 1556 + /* 1557 + * There are some limitations to handle discard bio 1558 + * 1st, the discard size is bigger than stripe_size*2. 
1559 + * 2st, if the discard bio spans reshape progress, we use the old way to 1560 + * handle discard bio 1561 + */ 1562 + static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) 1563 + { 1564 + struct r10conf *conf = mddev->private; 1565 + struct geom *geo = &conf->geo; 1566 + struct r10bio *r10_bio; 1567 + struct bio *split; 1568 + int disk; 1569 + sector_t chunk; 1570 + unsigned int stripe_size; 1571 + unsigned int stripe_data_disks; 1572 + sector_t split_size; 1573 + sector_t bio_start, bio_end; 1574 + sector_t first_stripe_index, last_stripe_index; 1575 + sector_t start_disk_offset; 1576 + unsigned int start_disk_index; 1577 + sector_t end_disk_offset; 1578 + unsigned int end_disk_index; 1579 + unsigned int remainder; 1580 + 1581 + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1582 + return -EAGAIN; 1583 + 1584 + wait_barrier(conf); 1585 + 1586 + /* 1587 + * Check reshape again to avoid reshape happens after checking 1588 + * MD_RECOVERY_RESHAPE and before wait_barrier 1589 + */ 1590 + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 1591 + goto out; 1592 + 1593 + if (geo->near_copies) 1594 + stripe_data_disks = geo->raid_disks / geo->near_copies + 1595 + geo->raid_disks % geo->near_copies; 1596 + else 1597 + stripe_data_disks = geo->raid_disks; 1598 + 1599 + stripe_size = stripe_data_disks << geo->chunk_shift; 1600 + 1601 + bio_start = bio->bi_iter.bi_sector; 1602 + bio_end = bio_end_sector(bio); 1603 + 1604 + /* 1605 + * Maybe one discard bio is smaller than strip size or across one 1606 + * stripe and discard region is larger than one stripe size. For far 1607 + * offset layout, if the discard region is not aligned with stripe 1608 + * size, there is hole when we submit discard bio to member disk. 
1609 + * For simplicity, we only handle discard bio which discard region 1610 + * is bigger than stripe_size * 2 1611 + */ 1612 + if (bio_sectors(bio) < stripe_size*2) 1613 + goto out; 1614 + 1615 + /* 1616 + * Keep bio aligned with strip size. 1617 + */ 1618 + div_u64_rem(bio_start, stripe_size, &remainder); 1619 + if (remainder) { 1620 + split_size = stripe_size - remainder; 1621 + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1622 + bio_chain(split, bio); 1623 + allow_barrier(conf); 1624 + /* Resend the fist split part */ 1625 + submit_bio_noacct(split); 1626 + wait_barrier(conf); 1627 + } 1628 + div_u64_rem(bio_end, stripe_size, &remainder); 1629 + if (remainder) { 1630 + split_size = bio_sectors(bio) - remainder; 1631 + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); 1632 + bio_chain(split, bio); 1633 + allow_barrier(conf); 1634 + /* Resend the second split part */ 1635 + submit_bio_noacct(bio); 1636 + bio = split; 1637 + wait_barrier(conf); 1638 + } 1639 + 1640 + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 1641 + r10_bio->mddev = mddev; 1642 + r10_bio->state = 0; 1643 + r10_bio->sectors = 0; 1644 + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); 1645 + 1646 + wait_blocked_dev(mddev, r10_bio); 1647 + 1648 + r10_bio->master_bio = bio; 1649 + 1650 + bio_start = bio->bi_iter.bi_sector; 1651 + bio_end = bio_end_sector(bio); 1652 + 1653 + /* 1654 + * Raid10 uses chunk as the unit to store data. It's similar like raid0. 1655 + * One stripe contains the chunks from all member disk (one chunk from 1656 + * one disk at the same HBA address). 
For layout detail, see 'man md 4' 1657 + */ 1658 + chunk = bio_start >> geo->chunk_shift; 1659 + chunk *= geo->near_copies; 1660 + first_stripe_index = chunk; 1661 + start_disk_index = sector_div(first_stripe_index, geo->raid_disks); 1662 + if (geo->far_offset) 1663 + first_stripe_index *= geo->far_copies; 1664 + start_disk_offset = (bio_start & geo->chunk_mask) + 1665 + (first_stripe_index << geo->chunk_shift); 1666 + 1667 + chunk = bio_end >> geo->chunk_shift; 1668 + chunk *= geo->near_copies; 1669 + last_stripe_index = chunk; 1670 + end_disk_index = sector_div(last_stripe_index, geo->raid_disks); 1671 + if (geo->far_offset) 1672 + last_stripe_index *= geo->far_copies; 1673 + end_disk_offset = (bio_end & geo->chunk_mask) + 1674 + (last_stripe_index << geo->chunk_shift); 1675 + 1676 + rcu_read_lock(); 1677 + for (disk = 0; disk < geo->raid_disks; disk++) { 1678 + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); 1679 + struct md_rdev *rrdev = rcu_dereference( 1680 + conf->mirrors[disk].replacement); 1681 + 1682 + r10_bio->devs[disk].bio = NULL; 1683 + r10_bio->devs[disk].repl_bio = NULL; 1684 + 1685 + if (rdev && (test_bit(Faulty, &rdev->flags))) 1686 + rdev = NULL; 1687 + if (rrdev && (test_bit(Faulty, &rrdev->flags))) 1688 + rrdev = NULL; 1689 + if (!rdev && !rrdev) 1690 + continue; 1691 + 1692 + if (rdev) { 1693 + r10_bio->devs[disk].bio = bio; 1694 + atomic_inc(&rdev->nr_pending); 1695 + } 1696 + if (rrdev) { 1697 + r10_bio->devs[disk].repl_bio = bio; 1698 + atomic_inc(&rrdev->nr_pending); 1699 + } 1700 + } 1701 + rcu_read_unlock(); 1702 + 1703 + atomic_set(&r10_bio->remaining, 1); 1704 + for (disk = 0; disk < geo->raid_disks; disk++) { 1705 + sector_t dev_start, dev_end; 1706 + struct bio *mbio, *rbio = NULL; 1707 + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); 1708 + struct md_rdev *rrdev = rcu_dereference( 1709 + conf->mirrors[disk].replacement); 1710 + 1711 + /* 1712 + * Now start to calculate the start and end address 
for each disk. 1713 + * The space between dev_start and dev_end is the discard region. 1714 + * 1715 + * For dev_start, it needs to consider three conditions: 1716 + * 1st, the disk is before start_disk, you can imagine the disk in 1717 + * the next stripe. So the dev_start is the start address of next 1718 + * stripe. 1719 + * 2st, the disk is after start_disk, it means the disk is at the 1720 + * same stripe of first disk 1721 + * 3st, the first disk itself, we can use start_disk_offset directly 1722 + */ 1723 + if (disk < start_disk_index) 1724 + dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; 1725 + else if (disk > start_disk_index) 1726 + dev_start = first_stripe_index * mddev->chunk_sectors; 1727 + else 1728 + dev_start = start_disk_offset; 1729 + 1730 + if (disk < end_disk_index) 1731 + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; 1732 + else if (disk > end_disk_index) 1733 + dev_end = last_stripe_index * mddev->chunk_sectors; 1734 + else 1735 + dev_end = end_disk_offset; 1736 + 1737 + /* 1738 + * It only handles discard bio which size is >= stripe size, so 1739 + * dev_end > dev_start all the time 1740 + */ 1741 + if (r10_bio->devs[disk].bio) { 1742 + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); 1743 + mbio->bi_end_io = raid10_end_discard_request; 1744 + mbio->bi_private = r10_bio; 1745 + r10_bio->devs[disk].bio = mbio; 1746 + r10_bio->devs[disk].devnum = disk; 1747 + atomic_inc(&r10_bio->remaining); 1748 + md_submit_discard_bio(mddev, rdev, mbio, 1749 + dev_start + choose_data_offset(r10_bio, rdev), 1750 + dev_end - dev_start); 1751 + bio_endio(mbio); 1752 + } 1753 + if (r10_bio->devs[disk].repl_bio) { 1754 + rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); 1755 + rbio->bi_end_io = raid10_end_discard_request; 1756 + rbio->bi_private = r10_bio; 1757 + r10_bio->devs[disk].repl_bio = rbio; 1758 + r10_bio->devs[disk].devnum = disk; 1759 + atomic_inc(&r10_bio->remaining); 1760 + md_submit_discard_bio(mddev, rrdev, rbio, 
1761 + dev_start + choose_data_offset(r10_bio, rrdev), 1762 + dev_end - dev_start); 1763 + bio_endio(rbio); 1764 + } 1765 + } 1766 + 1767 + if (atomic_dec_and_test(&r10_bio->remaining)) { 1768 + md_write_end(r10_bio->mddev); 1769 + raid_end_bio_io(r10_bio); 1770 + } 1771 + 1772 + return 0; 1773 + out: 1774 + allow_barrier(conf); 1775 + return -EAGAIN; 1776 + } 1777 + 1521 1778 static bool raid10_make_request(struct mddev *mddev, struct bio *bio) 1522 1779 { 1523 1780 struct r10conf *conf = mddev->private; ··· 1788 1531 1789 1532 if (!md_write_start(mddev, bio)) 1790 1533 return false; 1534 + 1535 + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 1536 + if (!raid10_handle_discard(mddev, bio)) 1537 + return true; 1791 1538 1792 1539 /* 1793 1540 * If this request crosses a chunk boundary, we need to split ··· 4032 3771 4033 3772 if (mddev->queue) { 4034 3773 blk_queue_max_discard_sectors(mddev->queue, 4035 - mddev->chunk_sectors); 3774 + UINT_MAX); 4036 3775 blk_queue_max_write_same_sectors(mddev->queue, 0); 4037 3776 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 4038 3777 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);