Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

writeback: Avoid iput() from flusher thread

Doing iput() from flusher thread (writeback_sb_inodes()) can create problems
because iput() can do a lot of work - for example truncate the inode if it's
the last iput on unlinked file. Some filesystems depend on flusher thread
progressing (e.g. because they need to flush delay allocated blocks to reduce
allocation uncertainty) and so flusher thread doing truncate creates
interesting dependencies and possibilities for deadlocks.

We get rid of iput() in flusher thread by using the fact that I_SYNC inode
flag effectively pins the inode in memory. So if we take care to either hold
i_lock or have I_SYNC set, we can get away without taking inode reference
in writeback_sb_inodes().

As a side effect of these changes, we also fix possible use-after-free in
wb_writeback() because inode_wait_for_writeback() call could try to reacquire
i_lock on the inode that was already free.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>

authored by

Jan Kara and committed by
Fengguang Wu
169ebd90 dbd5768f

+65 -23
+53 -13
fs/fs-writeback.c
··· 326 326 } 327 327 328 328 /* 329 - * Wait for writeback on an inode to complete. 329 + * Wait for writeback on an inode to complete. Called with i_lock held. 330 + * Caller must make sure inode cannot go away when we drop i_lock. 330 331 */ 331 - static void inode_wait_for_writeback(struct inode *inode) 332 + static void __inode_wait_for_writeback(struct inode *inode) 333 + __releases(inode->i_lock) 334 + __acquires(inode->i_lock) 332 335 { 333 336 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 334 337 wait_queue_head_t *wqh; ··· 342 339 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 343 340 spin_lock(&inode->i_lock); 344 341 } 342 + } 343 + 344 + /* 345 + * Wait for writeback on an inode to complete. Caller must have inode pinned. 346 + */ 347 + void inode_wait_for_writeback(struct inode *inode) 348 + { 349 + spin_lock(&inode->i_lock); 350 + __inode_wait_for_writeback(inode); 351 + spin_unlock(&inode->i_lock); 352 + } 353 + 354 + /* 355 + * Sleep until I_SYNC is cleared. This function must be called with i_lock 356 + * held and drops it. It is aimed for callers not holding any inode reference 357 + * so once i_lock is dropped, inode can go away. 358 + */ 359 + static void inode_sleep_on_writeback(struct inode *inode) 360 + __releases(inode->i_lock) 361 + { 362 + DEFINE_WAIT(wait); 363 + wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 364 + int sleep; 365 + 366 + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 367 + sleep = inode->i_state & I_SYNC; 368 + spin_unlock(&inode->i_lock); 369 + if (sleep) 370 + schedule(); 371 + finish_wait(wqh, &wait); 345 372 } 346 373 347 374 /* ··· 512 479 if (wbc->sync_mode != WB_SYNC_ALL) 513 480 goto out; 514 481 /* 515 - * It's a data-integrity sync. We must wait. 482 + * It's a data-integrity sync. We must wait. Since callers hold 483 + * inode reference or inode has I_WILL_FREE set, it cannot go 484 + * away under us. 516 485 */ 517 - inode_wait_for_writeback(inode); 486 + __inode_wait_for_writeback(inode); 518 487 } 519 488 WARN_ON(inode->i_state & I_SYNC); 520 489 /* ··· 655 620 } 656 621 spin_unlock(&wb->list_lock); 657 622 658 - __iget(inode); 659 623 /* 660 624 * We already requeued the inode if it had I_SYNC set and we 661 625 * are doing WB_SYNC_NONE writeback. So this catches only the 662 626 * WB_SYNC_ALL case. 663 627 */ 664 - if (inode->i_state & I_SYNC) 665 - inode_wait_for_writeback(inode); 628 + if (inode->i_state & I_SYNC) { 629 + /* Wait for I_SYNC. This function drops i_lock... */ 630 + inode_sleep_on_writeback(inode); 631 + /* Inode may be gone, start again */ 632 + continue; 633 + } 666 634 inode->i_state |= I_SYNC; 667 635 spin_unlock(&inode->i_lock); 636 + 668 637 write_chunk = writeback_chunk_size(wb->bdi, work); 669 638 wbc.nr_to_write = write_chunk; 670 639 wbc.pages_skipped = 0; 671 640 641 + /* 642 + * We use I_SYNC to pin the inode in memory. While it is set 643 + * evict_inode() will wait so the inode cannot be freed. 644 + */ 672 645 __writeback_single_inode(inode, wb, &wbc); 673 646 674 647 work->nr_pages -= write_chunk - wbc.nr_to_write; ··· 688 645 requeue_inode(inode, wb, &wbc); 689 646 inode_sync_complete(inode); 690 647 spin_unlock(&inode->i_lock); 691 - spin_unlock(&wb->list_lock); 692 - iput(inode); 693 - cond_resched(); 694 - spin_lock(&wb->list_lock); 648 + cond_resched_lock(&wb->list_lock); 695 649 /* 696 650 * bail out to wb_writeback() often enough to check 697 651 * background threshold and other termination conditions. ··· 883 843 inode = wb_inode(wb->b_more_io.prev); 884 844 spin_lock(&inode->i_lock); 885 845 spin_unlock(&wb->list_lock); 886 - inode_wait_for_writeback(inode); 887 - spin_unlock(&inode->i_lock); 846 + /* This function drops i_lock... */ 847 + inode_sleep_on_writeback(inode); 888 848 spin_lock(&wb->list_lock); 889 849 } 890 850 }
+7 -1
fs/inode.c
··· 530 530 531 531 inode_sb_list_del(inode); 532 532 533 - inode_sync_wait(inode); 533 + /* 534 + * Wait for flusher thread to be done with the inode so that filesystem 535 + * does not start destroying it while writeback is still running. Since 536 + * the inode has I_FREEING set, flusher thread won't start new work on 537 + * the inode. We just have to wait for running writeback to finish. 538 + */ 539 + inode_wait_for_writeback(inode); 534 540 535 541 if (op->evict_inode) { 536 542 op->evict_inode(inode);
+4 -3
include/linux/fs.h
··· 1753 1753 * anew. Other functions will just ignore such inodes, 1754 1754 * if appropriate. I_NEW is used for waiting. 1755 1755 * 1756 - * I_SYNC Synchonized write of dirty inode data. The bits is 1757 - * set during data writeback, and cleared with a wakeup 1758 - * on the bit address once it is done. 1756 + * I_SYNC Writeback of inode is running. The bit is set during 1757 + * data writeback, and cleared with a wakeup on the bit 1758 + * address once it is done. The bit is also used to pin 1759 + * the inode in memory for flusher thread. 1759 1760 * 1760 1761 * I_REFERENCED Marks the inode as recently references on the LRU list. 1761 1762 *
+1 -6
include/linux/writeback.h
··· 95 95 enum wb_reason reason); 96 96 long wb_do_writeback(struct bdi_writeback *wb, int force_wait); 97 97 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); 98 + void inode_wait_for_writeback(struct inode *inode); 98 99 99 100 /* writeback.h requires fs.h; it, too, is not included from here. */ 100 101 static inline void wait_on_inode(struct inode *inode) 101 102 { 102 103 might_sleep(); 103 104 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); 104 - } 105 - static inline void inode_sync_wait(struct inode *inode) 106 - { 107 - might_sleep(); 108 - wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, 109 - TASK_UNINTERRUPTIBLE); 110 105 } 111 106 112 107