Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm snapshot: Make exception tables scalable

Use list_bl to implement the exception hash tables' buckets. This change
permits concurrent access to distinct buckets by multiple threads.

Also, implement helper functions to lock and unlock the exception tables
based on the chunk number of the exception at hand.

We retain the global locking, by means of down_write(), which is
replaced by the next commit.

Still, we must acquire the per-bucket spinlocks when accessing the hash
tables, since list_bl does not allow modification on unlocked lists.

Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Authored by Nikos Tsironis and committed by Mike Snitzer
f79ae415 4ad8d880

+116 -24
+2 -1
drivers/md/dm-exception-store.h
··· 11 11 #define _LINUX_DM_EXCEPTION_STORE 12 12 13 13 #include <linux/blkdev.h> 14 + #include <linux/list_bl.h> 14 15 #include <linux/device-mapper.h> 15 16 16 17 /* ··· 28 27 * chunk within the device. 29 28 */ 30 29 struct dm_exception { 31 - struct list_head hash_list; 30 + struct hlist_bl_node hash_list; 32 31 33 32 chunk_t old_chunk; 34 33 chunk_t new_chunk;
+114 -23
drivers/md/dm-snap.c
··· 13 13 #include <linux/init.h> 14 14 #include <linux/kdev_t.h> 15 15 #include <linux/list.h> 16 + #include <linux/list_bl.h> 16 17 #include <linux/mempool.h> 17 18 #include <linux/module.h> 18 19 #include <linux/slab.h> ··· 45 44 struct dm_exception_table { 46 45 uint32_t hash_mask; 47 46 unsigned hash_shift; 48 - struct list_head *table; 47 + struct hlist_bl_head *table; 49 48 }; 50 49 51 50 struct dm_snapshot { ··· 619 618 * The lowest hash_shift bits of the chunk number are ignored, allowing 620 619 * some consecutive chunks to be grouped together. 621 620 */ 621 + static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); 622 + 623 + /* Lock to protect access to the completed and pending exception hash tables. */ 624 + struct dm_exception_table_lock { 625 + struct hlist_bl_head *complete_slot; 626 + struct hlist_bl_head *pending_slot; 627 + }; 628 + 629 + static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, 630 + struct dm_exception_table_lock *lock) 631 + { 632 + struct dm_exception_table *complete = &s->complete; 633 + struct dm_exception_table *pending = &s->pending; 634 + 635 + lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; 636 + lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; 637 + } 638 + 639 + static void dm_exception_table_lock(struct dm_exception_table_lock *lock) 640 + { 641 + hlist_bl_lock(lock->complete_slot); 642 + hlist_bl_lock(lock->pending_slot); 643 + } 644 + 645 + static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) 646 + { 647 + hlist_bl_unlock(lock->pending_slot); 648 + hlist_bl_unlock(lock->complete_slot); 649 + } 650 + 622 651 static int dm_exception_table_init(struct dm_exception_table *et, 623 652 uint32_t size, unsigned hash_shift) 624 653 { ··· 656 625 657 626 et->hash_shift = hash_shift; 658 627 et->hash_mask = size - 1; 659 - et->table = dm_vcalloc(size, sizeof(struct list_head)); 628 + et->table = dm_vcalloc(size, 
sizeof(struct hlist_bl_head)); 660 629 if (!et->table) 661 630 return -ENOMEM; 662 631 663 632 for (i = 0; i < size; i++) 664 - INIT_LIST_HEAD(et->table + i); 633 + INIT_HLIST_BL_HEAD(et->table + i); 665 634 666 635 return 0; 667 636 } ··· 669 638 static void dm_exception_table_exit(struct dm_exception_table *et, 670 639 struct kmem_cache *mem) 671 640 { 672 - struct list_head *slot; 673 - struct dm_exception *ex, *next; 641 + struct hlist_bl_head *slot; 642 + struct dm_exception *ex; 643 + struct hlist_bl_node *pos, *n; 674 644 int i, size; 675 645 676 646 size = et->hash_mask + 1; 677 647 for (i = 0; i < size; i++) { 678 648 slot = et->table + i; 679 649 680 - list_for_each_entry_safe (ex, next, slot, hash_list) 650 + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) 681 651 kmem_cache_free(mem, ex); 682 652 } 683 653 ··· 692 660 693 661 static void dm_remove_exception(struct dm_exception *e) 694 662 { 695 - list_del(&e->hash_list); 663 + hlist_bl_del(&e->hash_list); 696 664 } 697 665 698 666 /* ··· 702 670 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, 703 671 chunk_t chunk) 704 672 { 705 - struct list_head *slot; 673 + struct hlist_bl_head *slot; 674 + struct hlist_bl_node *pos; 706 675 struct dm_exception *e; 707 676 708 677 slot = &et->table[exception_hash(et, chunk)]; 709 - list_for_each_entry (e, slot, hash_list) 678 + hlist_bl_for_each_entry(e, pos, slot, hash_list) 710 679 if (chunk >= e->old_chunk && 711 680 chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) 712 681 return e; ··· 754 721 static void dm_insert_exception(struct dm_exception_table *eh, 755 722 struct dm_exception *new_e) 756 723 { 757 - struct list_head *l; 724 + struct hlist_bl_head *l; 725 + struct hlist_bl_node *pos; 758 726 struct dm_exception *e = NULL; 759 727 760 728 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; ··· 765 731 goto out; 766 732 767 733 /* List is ordered by old_chunk */ 768 - list_for_each_entry_reverse(e, l, 
hash_list) { 734 + hlist_bl_for_each_entry(e, pos, l, hash_list) { 769 735 /* Insert after an existing chunk? */ 770 736 if (new_e->old_chunk == (e->old_chunk + 771 737 dm_consecutive_chunk_count(e) + 1) && ··· 786 752 return; 787 753 } 788 754 789 - if (new_e->old_chunk > e->old_chunk) 755 + if (new_e->old_chunk < e->old_chunk) 790 756 break; 791 757 } 792 758 793 759 out: 794 - list_add(&new_e->hash_list, e ? &e->hash_list : l); 760 + if (!e) { 761 + /* 762 + * Either the table doesn't support consecutive chunks or slot 763 + * l is empty. 764 + */ 765 + hlist_bl_add_head(&new_e->hash_list, l); 766 + } else if (new_e->old_chunk < e->old_chunk) { 767 + /* Add before an existing exception */ 768 + hlist_bl_add_before(&new_e->hash_list, &e->hash_list); 769 + } else { 770 + /* Add to l's tail: e is the last exception in this slot */ 771 + hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); 772 + } 795 773 } 796 774 797 775 /* ··· 812 766 */ 813 767 static int dm_add_exception(void *context, chunk_t old, chunk_t new) 814 768 { 769 + struct dm_exception_table_lock lock; 815 770 struct dm_snapshot *s = context; 816 771 struct dm_exception *e; 817 772 ··· 825 778 /* Consecutive_count is implicitly initialised to zero */ 826 779 e->new_chunk = new; 827 780 781 + /* 782 + * Although there is no need to lock access to the exception tables 783 + * here, if we don't then hlist_bl_add_head(), called by 784 + * dm_insert_exception(), will complain about accessing the 785 + * corresponding list without locking it first. 
786 + */ 787 + dm_exception_table_lock_init(s, old, &lock); 788 + 789 + dm_exception_table_lock(&lock); 828 790 dm_insert_exception(&s->complete, e); 791 + dm_exception_table_unlock(&lock); 829 792 830 793 return 0; 831 794 } ··· 864 807 { 865 808 /* use a fixed size of 2MB */ 866 809 unsigned long mem = 2 * 1024 * 1024; 867 - mem /= sizeof(struct list_head); 810 + mem /= sizeof(struct hlist_bl_head); 868 811 869 812 return mem; 870 813 } ··· 1530 1473 struct bio *origin_bios = NULL; 1531 1474 struct bio *snapshot_bios = NULL; 1532 1475 struct bio *full_bio = NULL; 1476 + struct dm_exception_table_lock lock; 1533 1477 int error = 0; 1478 + 1479 + dm_exception_table_lock_init(s, pe->e.old_chunk, &lock); 1534 1480 1535 1481 if (!success) { 1536 1482 /* Read/write error - snapshot is unusable */ 1537 1483 down_write(&s->lock); 1538 1484 __invalidate_snapshot(s, -EIO); 1539 1485 error = 1; 1486 + 1487 + dm_exception_table_lock(&lock); 1540 1488 goto out; 1541 1489 } 1542 1490 ··· 1550 1488 down_write(&s->lock); 1551 1489 __invalidate_snapshot(s, -ENOMEM); 1552 1490 error = 1; 1491 + 1492 + dm_exception_table_lock(&lock); 1553 1493 goto out; 1554 1494 } 1555 1495 *e = pe->e; 1556 1496 1557 1497 down_write(&s->lock); 1498 + dm_exception_table_lock(&lock); 1558 1499 if (!s->valid) { 1559 1500 free_completed_exception(e); 1560 1501 error = 1; ··· 1575 1510 1576 1511 /* Wait for conflicting reads to drain */ 1577 1512 if (__chunk_is_tracked(s, pe->e.old_chunk)) { 1513 + dm_exception_table_unlock(&lock); 1578 1514 up_write(&s->lock); 1579 1515 __check_for_conflicting_io(s, pe->e.old_chunk); 1580 1516 down_write(&s->lock); 1517 + dm_exception_table_lock(&lock); 1581 1518 } 1582 1519 1583 1520 out: 1584 1521 /* Remove the in-flight exception from the list */ 1585 1522 dm_remove_exception(&pe->e); 1523 + 1524 + dm_exception_table_unlock(&lock); 1525 + 1586 1526 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1587 1527 origin_bios = bio_list_get(&pe->origin_bios); 1588 1528 
full_bio = pe->full_bio; ··· 1803 1733 int r = DM_MAPIO_REMAPPED; 1804 1734 chunk_t chunk; 1805 1735 struct dm_snap_pending_exception *pe = NULL; 1736 + struct dm_exception_table_lock lock; 1806 1737 1807 1738 init_tracked_chunk(bio); 1808 1739 ··· 1813 1742 } 1814 1743 1815 1744 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); 1745 + dm_exception_table_lock_init(s, chunk, &lock); 1816 1746 1817 1747 /* Full snapshots are not usable */ 1818 1748 /* To get here the table must be live so s->active is always set. */ ··· 1821 1749 return DM_MAPIO_KILL; 1822 1750 1823 1751 down_write(&s->lock); 1752 + dm_exception_table_lock(&lock); 1824 1753 1825 1754 if (!s->valid || (unlikely(s->snapshot_overflowed) && 1826 1755 bio_data_dir(bio) == WRITE)) { ··· 1844 1771 if (bio_data_dir(bio) == WRITE) { 1845 1772 pe = __lookup_pending_exception(s, chunk); 1846 1773 if (!pe) { 1774 + dm_exception_table_unlock(&lock); 1847 1775 up_write(&s->lock); 1848 1776 pe = alloc_pending_exception(s); 1849 1777 down_write(&s->lock); 1778 + dm_exception_table_lock(&lock); 1850 1779 1851 1780 if (!s->valid || s->snapshot_overflowed) { 1852 1781 free_pending_exception(pe); ··· 1865 1790 1866 1791 pe = __find_pending_exception(s, pe, chunk); 1867 1792 if (!pe) { 1793 + dm_exception_table_unlock(&lock); 1794 + 1868 1795 if (s->store->userspace_supports_overflow) { 1869 1796 s->snapshot_overflowed = 1; 1870 1797 DMERR("Snapshot overflowed: Unable to allocate exception."); 1871 1798 } else 1872 1799 __invalidate_snapshot(s, -ENOMEM); 1800 + up_write(&s->lock); 1801 + 1873 1802 r = DM_MAPIO_KILL; 1874 - goto out_unlock; 1803 + goto out; 1875 1804 } 1876 1805 } 1877 1806 ··· 1887 1808 bio->bi_iter.bi_size == 1888 1809 (s->store->chunk_size << SECTOR_SHIFT)) { 1889 1810 pe->started = 1; 1811 + dm_exception_table_unlock(&lock); 1890 1812 up_write(&s->lock); 1891 1813 start_full_bio(pe, bio); 1892 1814 goto out; ··· 1898 1818 if (!pe->started) { 1899 1819 /* this is protected by snap->lock */ 
1900 1820 pe->started = 1; 1821 + dm_exception_table_unlock(&lock); 1901 1822 up_write(&s->lock); 1902 1823 start_copy(pe); 1903 1824 goto out; ··· 1909 1828 } 1910 1829 1911 1830 out_unlock: 1831 + dm_exception_table_unlock(&lock); 1912 1832 up_write(&s->lock); 1913 1833 out: 1914 1834 return r; ··· 2211 2129 struct dm_snap_pending_exception *pe, *pe2; 2212 2130 struct dm_snap_pending_exception *pe_to_start_now = NULL; 2213 2131 struct dm_snap_pending_exception *pe_to_start_last = NULL; 2132 + struct dm_exception_table_lock lock; 2214 2133 chunk_t chunk; 2215 2134 2216 2135 /* Do all the snapshots on this origin */ ··· 2223 2140 if (dm_target_is_snapshot_merge(snap->ti)) 2224 2141 continue; 2225 2142 2226 - down_write(&snap->lock); 2227 - 2228 - /* Only deal with valid and active snapshots */ 2229 - if (!snap->valid || !snap->active) 2230 - goto next_snapshot; 2231 - 2232 2143 /* Nothing to do if writing beyond end of snapshot */ 2233 2144 if (sector >= dm_table_get_size(snap->ti->table)) 2234 - goto next_snapshot; 2145 + continue; 2235 2146 2236 2147 /* 2237 2148 * Remember, different snapshots can have 2238 2149 * different chunk sizes. 
2239 2150 */ 2240 2151 chunk = sector_to_chunk(snap->store, sector); 2152 + dm_exception_table_lock_init(snap, chunk, &lock); 2153 + 2154 + down_write(&snap->lock); 2155 + dm_exception_table_lock(&lock); 2156 + 2157 + /* Only deal with valid and active snapshots */ 2158 + if (!snap->valid || !snap->active) 2159 + goto next_snapshot; 2241 2160 2242 2161 pe = __lookup_pending_exception(snap, chunk); 2243 2162 if (!pe) { ··· 2252 2167 if (e) 2253 2168 goto next_snapshot; 2254 2169 2170 + dm_exception_table_unlock(&lock); 2255 2171 up_write(&snap->lock); 2256 2172 pe = alloc_pending_exception(snap); 2257 2173 down_write(&snap->lock); 2174 + dm_exception_table_lock(&lock); 2258 2175 2259 2176 if (!snap->valid) { 2260 2177 free_pending_exception(pe); ··· 2274 2187 2275 2188 pe = __insert_pending_exception(snap, pe, chunk); 2276 2189 if (!pe) { 2190 + dm_exception_table_unlock(&lock); 2277 2191 __invalidate_snapshot(snap, -ENOMEM); 2278 - goto next_snapshot; 2192 + up_write(&snap->lock); 2193 + 2194 + continue; 2279 2195 } 2280 2196 } else { 2281 2197 free_pending_exception(pe); ··· 2309 2219 } 2310 2220 2311 2221 next_snapshot: 2222 + dm_exception_table_unlock(&lock); 2312 2223 up_write(&snap->lock); 2313 2224 2314 2225 if (pe_to_start_now) {