Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: avoid deadlock in fs reclaim with page writeback

Ext4 has a filesystem wide lock protecting ext4_writepages() calls to
avoid races with switching of journalled data flag or inode format. This
lock can however cause a deadlock like:

CPU0                            CPU1

ext4_writepages()
  percpu_down_read(sbi->s_writepages_rwsem);
                                ext4_change_inode_journal_flag()
                                  percpu_down_write(sbi->s_writepages_rwsem);
                                    - blocks, all readers block from now on
  ext4_do_writepages()
    ext4_init_io_end()
      kmem_cache_zalloc(io_end_cachep, GFP_KERNEL)
        fs_reclaim frees dentry...
          dentry_unlink_inode()
            iput() - last ref =>
              iput_final() - inode dirty =>
                write_inode_now()...
                  ext4_writepages() tries to acquire sbi->s_writepages_rwsem
                    and blocks forever

Make sure we cannot recurse into filesystem reclaim from writeback code
to avoid the deadlock.

Reported-by: syzbot+6898da502aef574c5f8a@syzkaller.appspotmail.com
Link: https://lore.kernel.org/all/0000000000004c66b405fa108e27@google.com
Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230504124723.20205-1-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Jan Kara and committed by
Theodore Ts'o
00d873c1 b87c7cdf

+40 -13
+24
fs/ext4/ext4.h
··· 1684 1684 return container_of(inode, struct ext4_inode_info, vfs_inode); 1685 1685 } 1686 1686 1687 + static inline int ext4_writepages_down_read(struct super_block *sb) 1688 + { 1689 + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); 1690 + return memalloc_nofs_save(); 1691 + } 1692 + 1693 + static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) 1694 + { 1695 + memalloc_nofs_restore(ctx); 1696 + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); 1697 + } 1698 + 1699 + static inline int ext4_writepages_down_write(struct super_block *sb) 1700 + { 1701 + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); 1702 + return memalloc_nofs_save(); 1703 + } 1704 + 1705 + static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) 1706 + { 1707 + memalloc_nofs_restore(ctx); 1708 + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); 1709 + } 1710 + 1687 1711 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) 1688 1712 { 1689 1713 return ino == EXT4_ROOT_INO ||
+10 -8
fs/ext4/inode.c
··· 2783 2783 .can_map = 1, 2784 2784 }; 2785 2785 int ret; 2786 + int alloc_ctx; 2786 2787 2787 2788 if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 2788 2789 return -EIO; 2789 2790 2790 - percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); 2791 + alloc_ctx = ext4_writepages_down_read(sb); 2791 2792 ret = ext4_do_writepages(&mpd); 2792 2793 /* 2793 2794 * For data=journal writeback we could have come across pages marked ··· 2797 2796 */ 2798 2797 if (!ret && mpd.journalled_more_data) 2799 2798 ret = ext4_do_writepages(&mpd); 2800 - percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); 2799 + ext4_writepages_up_read(sb, alloc_ctx); 2801 2800 2802 2801 return ret; 2803 2802 } ··· 2825 2824 long nr_to_write = wbc->nr_to_write; 2826 2825 struct inode *inode = mapping->host; 2827 2826 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2827 + int alloc_ctx; 2828 2828 2829 2829 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2830 2830 return -EIO; 2831 2831 2832 - percpu_down_read(&sbi->s_writepages_rwsem); 2832 + alloc_ctx = ext4_writepages_down_read(inode->i_sb); 2833 2833 trace_ext4_writepages(inode, wbc); 2834 2834 2835 2835 ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); 2836 2836 trace_ext4_writepages_result(inode, wbc, ret, 2837 2837 nr_to_write - wbc->nr_to_write); 2838 - percpu_up_read(&sbi->s_writepages_rwsem); 2838 + ext4_writepages_up_read(inode->i_sb, alloc_ctx); 2839 2839 return ret; 2840 2840 } 2841 2841 ··· 5930 5928 journal_t *journal; 5931 5929 handle_t *handle; 5932 5930 int err; 5933 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5931 + int alloc_ctx; 5934 5932 5935 5933 /* 5936 5934 * We have to be very careful here: changing a data block's ··· 5968 5966 } 5969 5967 } 5970 5968 5971 - percpu_down_write(&sbi->s_writepages_rwsem); 5969 + alloc_ctx = ext4_writepages_down_write(inode->i_sb); 5972 5970 jbd2_journal_lock_updates(journal); 5973 5971 5974 5972 /* ··· 5985 5983 err = jbd2_journal_flush(journal, 0); 5986 5984 
if (err < 0) { 5987 5985 jbd2_journal_unlock_updates(journal); 5988 - percpu_up_write(&sbi->s_writepages_rwsem); 5986 + ext4_writepages_up_write(inode->i_sb, alloc_ctx); 5989 5987 return err; 5990 5988 } 5991 5989 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); ··· 5993 5991 ext4_set_aops(inode); 5994 5992 5995 5993 jbd2_journal_unlock_updates(journal); 5996 - percpu_up_write(&sbi->s_writepages_rwsem); 5994 + ext4_writepages_up_write(inode->i_sb, alloc_ctx); 5997 5995 5998 5996 if (val) 5999 5997 filemap_invalidate_unlock(inode->i_mapping);
+6 -5
fs/ext4/migrate.c
··· 408 408 409 409 int ext4_ext_migrate(struct inode *inode) 410 410 { 411 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 412 411 handle_t *handle; 413 412 int retval = 0, i; 414 413 __le32 *i_data; ··· 417 418 unsigned long max_entries; 418 419 __u32 goal, tmp_csum_seed; 419 420 uid_t owner[2]; 421 + int alloc_ctx; 420 422 421 423 /* 422 424 * If the filesystem does not support extents, or the inode ··· 434 434 */ 435 435 return retval; 436 436 437 - percpu_down_write(&sbi->s_writepages_rwsem); 437 + alloc_ctx = ext4_writepages_down_write(inode->i_sb); 438 438 439 439 /* 440 440 * Worst case we can touch the allocation bitmaps and a block ··· 586 586 unlock_new_inode(tmp_inode); 587 587 iput(tmp_inode); 588 588 out_unlock: 589 - percpu_up_write(&sbi->s_writepages_rwsem); 589 + ext4_writepages_up_write(inode->i_sb, alloc_ctx); 590 590 return retval; 591 591 } 592 592 ··· 605 605 ext4_fsblk_t blk; 606 606 handle_t *handle; 607 607 int ret, ret2 = 0; 608 + int alloc_ctx; 608 609 609 610 if (!ext4_has_feature_extents(inode->i_sb) || 610 611 (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) ··· 622 621 if (test_opt(inode->i_sb, DELALLOC)) 623 622 ext4_alloc_da_blocks(inode); 624 623 625 - percpu_down_write(&sbi->s_writepages_rwsem); 624 + alloc_ctx = ext4_writepages_down_write(inode->i_sb); 626 625 627 626 handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); 628 627 if (IS_ERR(handle)) { ··· 666 665 ext4_journal_stop(handle); 667 666 up_write(&EXT4_I(inode)->i_data_sem); 668 667 out_unlock: 669 - percpu_up_write(&sbi->s_writepages_rwsem); 668 + ext4_writepages_up_write(inode->i_sb, alloc_ctx); 670 669 return ret; 671 670 }