Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm thin metadata: Fix ABBA deadlock by resetting dm_bufio_client

As described in commit 8111964f1b85 ("dm thin: Fix ABBA deadlock between
shrink_slab and dm_pool_abort_metadata"), ABBA deadlocks will be
triggered because shrinker_rwsem currently needs to held by
dm_pool_abort_metadata() as a side-effect of thin-pool metadata
operation failure.

The following three problem scenarios have been noticed:

1) Described by commit 8111964f1b85 ("dm thin: Fix ABBA deadlock between
shrink_slab and dm_pool_abort_metadata")

2) shrinker_rwsem and throttle->lock
P1(drop cache) P2(kworker)
drop_caches_sysctl_handler
drop_slab
shrink_slab
down_read(&shrinker_rwsem) - LOCK A
do_shrink_slab
super_cache_scan
prune_icache_sb
dispose_list
evict
ext4_evict_inode
ext4_clear_inode
ext4_discard_preallocations
ext4_mb_load_buddy_gfp
ext4_mb_init_cache
ext4_wait_block_bitmap
__ext4_error
ext4_handle_error
ext4_commit_super
...
dm_submit_bio
do_worker
throttle_work_update
down_write(&t->lock) -- LOCK B
process_deferred_bios
commit
metadata_operation_failed
dm_pool_abort_metadata
dm_block_manager_create
dm_bufio_client_create
register_shrinker
down_write(&shrinker_rwsem)
-- LOCK A
thin_map
thin_bio_map
thin_defer_bio_with_throttle
throttle_lock
down_read(&t->lock) - LOCK B

3) shrinker_rwsem and wait_on_buffer
P1(drop cache) P2(kworker)
drop_caches_sysctl_handler
drop_slab
shrink_slab
down_read(&shrinker_rwsem) - LOCK A
do_shrink_slab
...
ext4_wait_block_bitmap
__ext4_error
ext4_handle_error
jbd2_journal_abort
jbd2_journal_update_sb_errno
jbd2_write_superblock
submit_bh
// LOCK B
// RELEASE B
do_worker
throttle_work_update
down_write(&t->lock) - LOCK B
process_deferred_bios
process_bio
commit
metadata_operation_failed
dm_pool_abort_metadata
dm_block_manager_create
dm_bufio_client_create
register_shrinker
register_shrinker_prepared
down_write(&shrinker_rwsem) - LOCK A
bio_endio
wait_on_buffer
__wait_on_buffer

Fix these by resetting dm_bufio_client without holding shrinker_rwsem.

Fixes: 8111964f1b85 ("dm thin: Fix ABBA deadlock between shrink_slab and dm_pool_abort_metadata")
Cc: stable@vger.kernel.org
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>

authored by

Li Lingfeng and committed by
Mike Snitzer
d4830012 2a32897c

+47 -35
+7
drivers/md/dm-bufio.c
··· 2592 2592 } 2593 2593 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); 2594 2594 2595 + void dm_bufio_client_reset(struct dm_bufio_client *c) 2596 + { 2597 + drop_buffers(c); 2598 + flush_work(&c->shrink_work); 2599 + } 2600 + EXPORT_SYMBOL_GPL(dm_bufio_client_reset); 2601 + 2595 2602 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) 2596 2603 { 2597 2604 c->start = start;
+26 -34
drivers/md/dm-thin-metadata.c
··· 603 603 r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, 604 604 &pmd->tm, &pmd->metadata_sm); 605 605 if (r < 0) { 606 + pmd->tm = NULL; 607 + pmd->metadata_sm = NULL; 606 608 DMERR("tm_create_with_sm failed"); 607 609 return r; 608 610 } ··· 613 611 if (IS_ERR(pmd->data_sm)) { 614 612 DMERR("sm_disk_create failed"); 615 613 r = PTR_ERR(pmd->data_sm); 614 + pmd->data_sm = NULL; 616 615 goto bad_cleanup_tm; 617 616 } 618 617 ··· 644 641 645 642 bad_cleanup_nb_tm: 646 643 dm_tm_destroy(pmd->nb_tm); 644 + pmd->nb_tm = NULL; 647 645 bad_cleanup_data_sm: 648 646 dm_sm_destroy(pmd->data_sm); 647 + pmd->data_sm = NULL; 649 648 bad_cleanup_tm: 650 649 dm_tm_destroy(pmd->tm); 650 + pmd->tm = NULL; 651 651 dm_sm_destroy(pmd->metadata_sm); 652 + pmd->metadata_sm = NULL; 652 653 653 654 return r; 654 655 } ··· 718 711 sizeof(disk_super->metadata_space_map_root), 719 712 &pmd->tm, &pmd->metadata_sm); 720 713 if (r < 0) { 714 + pmd->tm = NULL; 715 + pmd->metadata_sm = NULL; 721 716 DMERR("tm_open_with_sm failed"); 722 717 goto bad_unlock_sblock; 723 718 } ··· 729 720 if (IS_ERR(pmd->data_sm)) { 730 721 DMERR("sm_disk_open failed"); 731 722 r = PTR_ERR(pmd->data_sm); 723 + pmd->data_sm = NULL; 732 724 goto bad_cleanup_tm; 733 725 } 734 726 ··· 756 746 757 747 bad_cleanup_data_sm: 758 748 dm_sm_destroy(pmd->data_sm); 749 + pmd->data_sm = NULL; 759 750 bad_cleanup_tm: 760 751 dm_tm_destroy(pmd->tm); 752 + pmd->tm = NULL; 761 753 dm_sm_destroy(pmd->metadata_sm); 754 + pmd->metadata_sm = NULL; 762 755 bad_unlock_sblock: 763 756 dm_bm_unlock(sblock); 764 757 ··· 808 795 bool destroy_bm) 809 796 { 810 797 dm_sm_destroy(pmd->data_sm); 798 + pmd->data_sm = NULL; 811 799 dm_sm_destroy(pmd->metadata_sm); 800 + pmd->metadata_sm = NULL; 812 801 dm_tm_destroy(pmd->nb_tm); 802 + pmd->nb_tm = NULL; 813 803 dm_tm_destroy(pmd->tm); 804 + pmd->tm = NULL; 814 805 if (destroy_bm) 815 806 dm_block_manager_destroy(pmd->bm); 816 807 } ··· 1022 1005 __func__, r); 1023 1006 } 1024 1007 pmd_write_unlock(pmd); 1025 - if (!pmd->fail_io) 1026 - __destroy_persistent_data_objects(pmd, true); 1008 + __destroy_persistent_data_objects(pmd, true); 1027 1009 1028 1010 kfree(pmd); 1029 1011 return 0; ··· 1893 1877 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) 1894 1878 { 1895 1879 int r = -EINVAL; 1896 - struct dm_block_manager *old_bm = NULL, *new_bm = NULL; 1897 1880 1898 1881 /* fail_io is double-checked with pmd->root_lock held below */ 1899 1882 if (unlikely(pmd->fail_io)) 1900 1883 return r; 1901 1884 1902 - /* 1903 - * Replacement block manager (new_bm) is created and old_bm destroyed outside of 1904 - * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of 1905 - * shrinker associated with the block manager's bufio client vs pmd root_lock). 1906 - * - must take shrinker_mutex without holding pmd->root_lock 1907 - */ 1908 - new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, 1909 - THIN_MAX_CONCURRENT_LOCKS); 1910 - 1911 1885 pmd_write_lock(pmd); 1912 1886 if (pmd->fail_io) { 1913 1887 pmd_write_unlock(pmd); 1914 - goto out; 1888 + return r; 1915 1889 } 1916 - 1917 1890 __set_abort_with_changes_flags(pmd); 1918 - __destroy_persistent_data_objects(pmd, false); 1919 - old_bm = pmd->bm; 1920 - if (IS_ERR(new_bm)) { 1921 - DMERR("could not create block manager during abort"); 1922 - pmd->bm = NULL; 1923 - r = PTR_ERR(new_bm); 1924 - goto out_unlock; 1925 - } 1926 1891 1927 - pmd->bm = new_bm; 1892 + /* destroy data_sm/metadata_sm/nb_tm/tm */ 1893 + __destroy_persistent_data_objects(pmd, false); 1894 + 1895 + /* reset bm */ 1896 + dm_block_manager_reset(pmd->bm); 1897 + 1898 + /* rebuild data_sm/metadata_sm/nb_tm/tm */ 1928 1899 r = __open_or_format_metadata(pmd, false); 1929 - if (r) { 1930 - pmd->bm = NULL; 1931 - goto out_unlock; 1932 - } 1933 - new_bm = NULL; 1934 - out_unlock: 1935 1900 if (r) 1936 1901 pmd->fail_io = true; 1937 1902 pmd_write_unlock(pmd); 1938 - dm_block_manager_destroy(old_bm); 1939 - out: 1940 - if (new_bm && !IS_ERR(new_bm)) 1941 - dm_block_manager_destroy(new_bm); 1942 - 1943 1903 return r; 1944 1904 } 1945 1905
+6
drivers/md/persistent-data/dm-block-manager.c
··· 421 421 } 422 422 EXPORT_SYMBOL_GPL(dm_block_manager_destroy); 423 423 424 + void dm_block_manager_reset(struct dm_block_manager *bm) 425 + { 426 + dm_bufio_client_reset(bm->bufio); 427 + } 428 + EXPORT_SYMBOL_GPL(dm_block_manager_reset); 429 + 424 430 unsigned int dm_bm_block_size(struct dm_block_manager *bm) 425 431 { 426 432 return dm_bufio_get_block_size(bm->bufio);
+1
drivers/md/persistent-data/dm-block-manager.h
··· 36 36 struct block_device *bdev, unsigned int block_size, 37 37 unsigned int max_held_per_thread); 38 38 void dm_block_manager_destroy(struct dm_block_manager *bm); 39 + void dm_block_manager_reset(struct dm_block_manager *bm); 39 40 40 41 unsigned int dm_bm_block_size(struct dm_block_manager *bm); 41 42 dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
+2 -1
drivers/md/persistent-data/dm-space-map.h
··· 77 77 78 78 static inline void dm_sm_destroy(struct dm_space_map *sm) 79 79 { 80 - sm->destroy(sm); 80 + if (sm) 81 + sm->destroy(sm); 81 82 } 82 83 83 84 static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+3
drivers/md/persistent-data/dm-transaction-manager.c
··· 199 199 200 200 void dm_tm_destroy(struct dm_transaction_manager *tm) 201 201 { 202 + if (!tm) 203 + return; 204 + 202 205 if (!tm->is_clone) 203 206 wipe_shadow_table(tm); 204 207
+2
include/linux/dm-bufio.h
··· 38 38 */ 39 39 void dm_bufio_client_destroy(struct dm_bufio_client *c); 40 40 41 + void dm_bufio_client_reset(struct dm_bufio_client *c); 42 + 41 43 /* 42 44 * Set the sector range. 43 45 * When this function is called, there must be no I/O in progress on the bufio