Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: completed_io locking cleanup

Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
My diagnosis:
We already protect extent tree with i_data_sem, truncate and punch_hole
should wait for DIO, so the only data we have to protect is end_io->flags
modification, but only flush_completed_IO and end_io_work modify these
flags, and we can serialize them via i_completed_io_lock.

Currently all these games with mutex_trylock result in the following deadlock
truncate: kworker:
ext4_setattr ext4_end_io_work
mutex_lock(i_mutex)
inode_dio_wait(inode) ->BLOCK
DEADLOCK<- mutex_trylock()
inode_dio_done()
#TEST_CASE1_BEGIN
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END

Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286

This patch makes state machine simple and clean:

(1) xxx_end_io schedules final extent conversion simply by calling
ext4_add_complete_io(), which appends it to ei->i_completed_io_list
NOTE1: because of (2A) work should be queued only if
->i_completed_io_list was empty, otherwise the work is scheduled already.

(2) ext4_flush_completed_IO is responsible for handling all pending
end_io from ei->i_completed_io_list
Flushing sequence consists of following stages:
A) LOCKED: Atomically drain completed_io_list to local_list
B) Perform extents conversion
C) LOCKED: move converted io's to to_free list for final deletion
This logic depends on the context we were called from.
D) Final end_io context destruction
NOTE1: i_mutex is no longer required because end_io->flags modification
is protected by ei->i_completed_io_lock

Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to ext4_end_io and make it static
- Check flush completion status in ext4_ext_punch_hole(), because it is
not a good idea to punch blocks from a corrupted inode.

Changes since V3 (in response to Jan's comments):
Fall back to active flush_completed_IO() approach in order to prevent
performance issues with nolocked DIO reads.
Changes since V2:
Fix use-after-free caused by race truncate vs end_io_work

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by

Dmitry Monakhov and committed by
Theodore Ts'o
28a535f9 82e54229

+122 -170
+1 -2
fs/ext4/ext4.h
··· 186 186 #define EXT4_IO_END_ERROR 0x0002 187 187 #define EXT4_IO_END_QUEUED 0x0004 188 188 #define EXT4_IO_END_DIRECT 0x0008 189 - #define EXT4_IO_END_IN_FSYNC 0x0010 190 189 191 190 struct ext4_io_page { 192 191 struct page *p_page; ··· 2417 2418 2418 2419 /* page-io.c */ 2419 2420 extern int __init ext4_init_pageio(void); 2421 + extern void ext4_add_complete_io(ext4_io_end_t *io_end); 2420 2422 extern void ext4_exit_pageio(void); 2421 2423 extern void ext4_ioend_wait(struct inode *); 2422 2424 extern void ext4_free_io_end(ext4_io_end_t *io); 2423 2425 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2424 - extern int ext4_end_io_nolock(ext4_io_end_t *io); 2425 2426 extern void ext4_io_submit(struct ext4_io_submit *io); 2426 2427 extern int ext4_bio_write_page(struct ext4_io_submit *io, 2427 2428 struct page *page,
+3 -1
fs/ext4/extents.c
··· 4833 4833 } 4834 4834 4835 4835 /* finish any pending end_io work */ 4836 - ext4_flush_completed_IO(inode); 4836 + err = ext4_flush_completed_IO(inode); 4837 + if (err) 4838 + return err; 4837 4839 4838 4840 credits = ext4_writepage_trans_blocks(inode); 4839 4841 handle = ext4_journal_start(inode, credits);
-81
fs/ext4/fsync.c
··· 34 34 35 35 #include <trace/events/ext4.h> 36 36 37 - static void dump_completed_IO(struct inode * inode) 38 - { 39 - #ifdef EXT4FS_DEBUG 40 - struct list_head *cur, *before, *after; 41 - ext4_io_end_t *io, *io0, *io1; 42 - unsigned long flags; 43 - 44 - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ 45 - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); 46 - return; 47 - } 48 - 49 - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); 50 - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 51 - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ 52 - cur = &io->list; 53 - before = cur->prev; 54 - io0 = container_of(before, ext4_io_end_t, list); 55 - after = cur->next; 56 - io1 = container_of(after, ext4_io_end_t, list); 57 - 58 - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 59 - io, inode->i_ino, io0, io1); 60 - } 61 - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 62 - #endif 63 - } 64 - 65 - /* 66 - * This function is called from ext4_sync_file(). 67 - * 68 - * When IO is completed, the work to convert unwritten extents to 69 - * written is queued on workqueue but may not get immediately 70 - * scheduled. When fsync is called, we need to ensure the 71 - * conversion is complete before fsync returns. 72 - * The inode keeps track of a list of pending/completed IO that 73 - * might needs to do the conversion. This function walks through 74 - * the list and convert the related unwritten extents for completed IO 75 - * to written. 76 - * The function return the number of pending IOs on success. 
77 - */ 78 - int ext4_flush_completed_IO(struct inode *inode) 79 - { 80 - ext4_io_end_t *io; 81 - struct ext4_inode_info *ei = EXT4_I(inode); 82 - unsigned long flags; 83 - int ret = 0; 84 - int ret2 = 0; 85 - 86 - dump_completed_IO(inode); 87 - spin_lock_irqsave(&ei->i_completed_io_lock, flags); 88 - while (!list_empty(&ei->i_completed_io_list)){ 89 - io = list_entry(ei->i_completed_io_list.next, 90 - ext4_io_end_t, list); 91 - list_del_init(&io->list); 92 - io->flag |= EXT4_IO_END_IN_FSYNC; 93 - /* 94 - * Calling ext4_end_io_nolock() to convert completed 95 - * IO to written. 96 - * 97 - * When ext4_sync_file() is called, run_queue() may already 98 - * about to flush the work corresponding to this io structure. 99 - * It will be upset if it founds the io structure related 100 - * to the work-to-be schedule is freed. 101 - * 102 - * Thus we need to keep the io structure still valid here after 103 - * conversion finished. The io structure has a flag to 104 - * avoid double converting from both fsync and background work 105 - * queue work. 106 - */ 107 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 108 - ret = ext4_end_io_nolock(io); 109 - if (ret < 0) 110 - ret2 = ret; 111 - spin_lock_irqsave(&ei->i_completed_io_lock, flags); 112 - io->flag &= ~EXT4_IO_END_IN_FSYNC; 113 - } 114 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 115 - return (ret2 < 0) ? ret2 : 0; 116 - } 117 - 118 37 /* 119 38 * If we're not journaling and this is a just-created file, we have to 120 39 * sync our parent directory (if it was freshly created) since
+2 -4
fs/ext4/indirect.c
··· 807 807 808 808 retry: 809 809 if (rw == READ && ext4_should_dioread_nolock(inode)) { 810 - if (unlikely(!list_empty(&ei->i_completed_io_list))) { 811 - mutex_lock(&inode->i_mutex); 810 + if (unlikely(!list_empty(&ei->i_completed_io_list))) 812 811 ext4_flush_completed_IO(inode); 813 - mutex_unlock(&inode->i_mutex); 814 - } 812 + 815 813 ret = __blockdev_direct_IO(rw, iocb, inode, 816 814 inode->i_sb->s_bdev, iov, 817 815 offset, nr_segs,
+2 -23
fs/ext4/inode.c
··· 2881 2881 { 2882 2882 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 2883 2883 ext4_io_end_t *io_end = iocb->private; 2884 - struct workqueue_struct *wq; 2885 - unsigned long flags; 2886 - struct ext4_inode_info *ei; 2887 2884 2888 2885 /* if not async direct IO or dio with 0 bytes write, just return */ 2889 2886 if (!io_end || !size) ··· 2909 2912 io_end->iocb = iocb; 2910 2913 io_end->result = ret; 2911 2914 } 2912 - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 2913 2915 2914 - /* Add the io_end to per-inode completed aio dio list*/ 2915 - ei = EXT4_I(io_end->inode); 2916 - spin_lock_irqsave(&ei->i_completed_io_lock, flags); 2917 - list_add_tail(&io_end->list, &ei->i_completed_io_list); 2918 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 2919 - 2920 - /* queue the work to convert unwritten extents to written */ 2921 - queue_work(wq, &io_end->work); 2916 + ext4_add_complete_io(io_end); 2922 2917 } 2923 2918 2924 2919 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2925 2920 { 2926 2921 ext4_io_end_t *io_end = bh->b_private; 2927 - struct workqueue_struct *wq; 2928 2922 struct inode *inode; 2929 - unsigned long flags; 2930 2923 2931 2924 if (!test_clear_buffer_uninit(bh) || !io_end) 2932 2925 goto out; ··· 2935 2948 */ 2936 2949 inode = io_end->inode; 2937 2950 ext4_set_io_unwritten_flag(inode, io_end); 2938 - 2939 - /* Add the io_end to per-inode completed io list*/ 2940 - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2941 - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 2942 - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 2943 - 2944 - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 2945 - /* queue the work to convert unwritten extents to written */ 2946 - queue_work(wq, &io_end->work); 2951 + ext4_add_complete_io(io_end); 2947 2952 out: 2948 2953 bh->b_private = NULL; 2949 2954 bh->b_end_io = NULL;
+114 -59
fs/ext4/page-io.c
··· 71 71 int i; 72 72 73 73 BUG_ON(!io); 74 + BUG_ON(!list_empty(&io->list)); 74 75 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); 75 76 76 77 if (io->page) ··· 84 83 kmem_cache_free(io_end_cachep, io); 85 84 } 86 85 87 - /* 88 - * check a range of space and convert unwritten extents to written. 89 - * 90 - * Called with inode->i_mutex; we depend on this when we manipulate 91 - * io->flag, since we could otherwise race with ext4_flush_completed_IO() 92 - */ 93 - int ext4_end_io_nolock(ext4_io_end_t *io) 86 + /* check a range of space and convert unwritten extents to written. */ 87 + static int ext4_end_io(ext4_io_end_t *io) 94 88 { 95 89 struct inode *inode = io->inode; 96 90 loff_t offset = io->offset; 97 91 ssize_t size = io->size; 98 92 int ret = 0; 99 - 100 - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 101 93 102 94 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 103 95 "list->prev 0x%p\n", ··· 104 110 "(inode %lu, offset %llu, size %zd, error %d)", 105 111 inode->i_ino, offset, size, ret); 106 112 } 107 - io->flag &= ~EXT4_IO_END_UNWRITTEN; 108 113 if (io->iocb) 109 114 aio_complete(io->iocb, io->result, 0); 110 115 ··· 115 122 return ret; 116 123 } 117 124 125 + static void dump_completed_IO(struct inode *inode) 126 + { 127 + #ifdef EXT4FS_DEBUG 128 + struct list_head *cur, *before, *after; 129 + ext4_io_end_t *io, *io0, *io1; 130 + unsigned long flags; 131 + 132 + if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 133 + ext4_debug("inode %lu completed_io list is empty\n", 134 + inode->i_ino); 135 + return; 136 + } 137 + 138 + ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 139 + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 140 + cur = &io->list; 141 + before = cur->prev; 142 + io0 = container_of(before, ext4_io_end_t, list); 143 + after = cur->next; 144 + io1 = container_of(after, ext4_io_end_t, list); 145 + 146 + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 147 + io, 
inode->i_ino, io0, io1); 148 + } 149 + #endif 150 + } 151 + 152 + /* Add the io_end to per-inode completed end_io list. */ 153 + void ext4_add_complete_io(ext4_io_end_t *io_end) 154 + { 155 + struct ext4_inode_info *ei = EXT4_I(io_end->inode); 156 + struct workqueue_struct *wq; 157 + unsigned long flags; 158 + 159 + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 160 + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 161 + 162 + spin_lock_irqsave(&ei->i_completed_io_lock, flags); 163 + if (list_empty(&ei->i_completed_io_list)) { 164 + io_end->flag |= EXT4_IO_END_QUEUED; 165 + queue_work(wq, &io_end->work); 166 + } 167 + list_add_tail(&io_end->list, &ei->i_completed_io_list); 168 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 169 + } 170 + 171 + static int ext4_do_flush_completed_IO(struct inode *inode, 172 + ext4_io_end_t *work_io) 173 + { 174 + ext4_io_end_t *io; 175 + struct list_head unwritten, complete, to_free; 176 + unsigned long flags; 177 + struct ext4_inode_info *ei = EXT4_I(inode); 178 + int err, ret = 0; 179 + 180 + INIT_LIST_HEAD(&complete); 181 + INIT_LIST_HEAD(&to_free); 182 + 183 + spin_lock_irqsave(&ei->i_completed_io_lock, flags); 184 + dump_completed_IO(inode); 185 + list_replace_init(&ei->i_completed_io_list, &unwritten); 186 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 187 + 188 + while (!list_empty(&unwritten)) { 189 + io = list_entry(unwritten.next, ext4_io_end_t, list); 190 + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 191 + list_del_init(&io->list); 192 + 193 + err = ext4_end_io(io); 194 + if (unlikely(!ret && err)) 195 + ret = err; 196 + 197 + list_add_tail(&io->list, &complete); 198 + } 199 + /* It is important to update all flags for all end_io in one shot w/o 200 + * dropping the lock.*/ 201 + spin_lock_irqsave(&ei->i_completed_io_lock, flags); 202 + while (!list_empty(&complete)) { 203 + io = list_entry(complete.next, ext4_io_end_t, list); 204 + io->flag &= ~EXT4_IO_END_UNWRITTEN; 205 + /* end_io context 
can not be destroyed now because it still 206 + * used by queued worker. Worker thread will destroy it later */ 207 + if (io->flag & EXT4_IO_END_QUEUED) 208 + list_del_init(&io->list); 209 + else 210 + list_move(&io->list, &to_free); 211 + } 212 + /* If we are called from worker context, it is time to clear queued 213 + * flag, and destroy it's end_io if it was converted already */ 214 + if (work_io) { 215 + work_io->flag &= ~EXT4_IO_END_QUEUED; 216 + if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) 217 + list_add_tail(&work_io->list, &to_free); 218 + } 219 + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 220 + 221 + while (!list_empty(&to_free)) { 222 + io = list_entry(to_free.next, ext4_io_end_t, list); 223 + list_del_init(&io->list); 224 + ext4_free_io_end(io); 225 + } 226 + return ret; 227 + } 228 + 118 229 /* 119 230 * work on completed aio dio IO, to convert unwritten extents to extents 120 231 */ 121 232 static void ext4_end_io_work(struct work_struct *work) 122 233 { 123 - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 124 - struct inode *inode = io->inode; 125 - struct ext4_inode_info *ei = EXT4_I(inode); 126 - unsigned long flags; 234 + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 235 + ext4_do_flush_completed_IO(io->inode, io); 236 + } 127 237 128 - spin_lock_irqsave(&ei->i_completed_io_lock, flags); 129 - if (io->flag & EXT4_IO_END_IN_FSYNC) 130 - goto requeue; 131 - if (list_empty(&io->list)) { 132 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 133 - goto free; 134 - } 135 - 136 - if (!mutex_trylock(&inode->i_mutex)) { 137 - bool was_queued; 138 - requeue: 139 - was_queued = !!(io->flag & EXT4_IO_END_QUEUED); 140 - io->flag |= EXT4_IO_END_QUEUED; 141 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 142 - /* 143 - * Requeue the work instead of waiting so that the work 144 - * items queued after this can be processed. 
145 - */ 146 - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); 147 - /* 148 - * To prevent the ext4-dio-unwritten thread from keeping 149 - * requeueing end_io requests and occupying cpu for too long, 150 - * yield the cpu if it sees an end_io request that has already 151 - * been requeued. 152 - */ 153 - if (was_queued) 154 - yield(); 155 - return; 156 - } 157 - list_del_init(&io->list); 158 - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 159 - (void) ext4_end_io_nolock(io); 160 - mutex_unlock(&inode->i_mutex); 161 - free: 162 - ext4_free_io_end(io); 238 + int ext4_flush_completed_IO(struct inode *inode) 239 + { 240 + return ext4_do_flush_completed_IO(inode, NULL); 163 241 } 164 242 165 243 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) ··· 263 199 static void ext4_end_bio(struct bio *bio, int error) 264 200 { 265 201 ext4_io_end_t *io_end = bio->bi_private; 266 - struct workqueue_struct *wq; 267 202 struct inode *inode; 268 - unsigned long flags; 269 203 int i; 270 204 sector_t bi_sector = bio->bi_sector; 271 205 ··· 321 259 return; 322 260 } 323 261 324 - /* Add the io_end to per-inode completed io list*/ 325 - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 326 - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 327 - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 328 - 329 - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 330 - /* queue the work to convert unwritten extents to written */ 331 - queue_work(wq, &io_end->work); 262 + ext4_add_complete_io(io_end); 332 263 } 333 264 334 265 void ext4_io_submit(struct ext4_io_submit *io)