Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: change how we wait for pending ordered extents

We have a mechanism to make sure we don't lose updates for ordered extents that
were logged in the transaction that is currently running. We add the ordered
extent to a transaction list and then the transaction waits on all the ordered
extents in that list. However on substantially large file systems this list
can be extremely large, and can give us soft lockups, since the ordered extents
don't remove themselves from the list when they do complete.

To fix this we simply add a counter to the transaction that is incremented any
time we have a logged extent that needs to be completed in the current
transaction. Then when the ordered extent finally completes it decrements the
per transaction counter and wakes up the transaction if we are the last ones.
This will eliminate the softlockup. Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>

authored by

Josef Bacik and committed by
Chris Mason
161c3549 a408365c

+59 -63
-20
fs/btrfs/disk-io.c
··· 4326 4326 return 0; 4327 4327 } 4328 4328 4329 - static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, 4330 - struct btrfs_fs_info *fs_info) 4331 - { 4332 - struct btrfs_ordered_extent *ordered; 4333 - 4334 - spin_lock(&fs_info->trans_lock); 4335 - while (!list_empty(&cur_trans->pending_ordered)) { 4336 - ordered = list_first_entry(&cur_trans->pending_ordered, 4337 - struct btrfs_ordered_extent, 4338 - trans_list); 4339 - list_del_init(&ordered->trans_list); 4340 - spin_unlock(&fs_info->trans_lock); 4341 - 4342 - btrfs_put_ordered_extent(ordered); 4343 - spin_lock(&fs_info->trans_lock); 4344 - } 4345 - spin_unlock(&fs_info->trans_lock); 4346 - } 4347 - 4348 4329 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4349 4330 struct btrfs_root *root) 4350 4331 { ··· 4337 4356 cur_trans->state = TRANS_STATE_UNBLOCKED; 4338 4357 wake_up(&root->fs_info->transaction_wait); 4339 4358 4340 - btrfs_free_pending_ordered(cur_trans, root->fs_info); 4341 4359 btrfs_destroy_delayed_inodes(root); 4342 4360 btrfs_assert_delayed_root_empty(root); 4343 4361
+49 -13
fs/btrfs/ordered-data.c
··· 490 490 491 491 spin_lock_irq(&log->log_extents_lock[index]); 492 492 while (!list_empty(&log->logged_list[index])) { 493 + struct inode *inode; 493 494 ordered = list_first_entry(&log->logged_list[index], 494 495 struct btrfs_ordered_extent, 495 496 log_list); 496 497 list_del_init(&ordered->log_list); 498 + inode = ordered->inode; 497 499 spin_unlock_irq(&log->log_extents_lock[index]); 498 500 499 501 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 500 502 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 501 - struct inode *inode = ordered->inode; 502 503 u64 start = ordered->file_offset; 503 504 u64 end = ordered->file_offset + ordered->len - 1; 504 505 ··· 510 509 &ordered->flags)); 511 510 512 511 /* 513 - * If our ordered extent completed it means it updated the 514 - * fs/subvol and csum trees already, so no need to make the 515 - * current transaction's commit wait for it, as we end up 516 - * holding memory unnecessarily and delaying the inode's iput 517 - * until the transaction commit (we schedule an iput for the 518 - * inode when the ordered extent's refcount drops to 0), which 519 - * prevents it from being evictable until the transaction 520 - * commits. 512 + * In order to keep us from losing our ordered extent 513 + * information when committing the transaction we have to make 514 + * sure that any logged extents are completed when we go to 515 + * commit the transaction. To do this we simply increase the 516 + * current transactions pending_ordered counter and decrement it 517 + * when the ordered extent completes. 
521 518 */ 522 - if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) 523 - btrfs_put_ordered_extent(ordered); 524 - else 525 - list_add_tail(&ordered->trans_list, &trans->ordered); 519 + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 520 + struct btrfs_ordered_inode_tree *tree; 526 521 522 + tree = &BTRFS_I(inode)->ordered_tree; 523 + spin_lock_irq(&tree->lock); 524 + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 525 + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 526 + atomic_inc(&trans->transaction->pending_ordered); 527 + } 528 + spin_unlock_irq(&tree->lock); 529 + } 530 + btrfs_put_ordered_extent(ordered); 527 531 spin_lock_irq(&log->log_extents_lock[index]); 528 532 } 529 533 spin_unlock_irq(&log->log_extents_lock[index]); ··· 590 584 struct btrfs_ordered_inode_tree *tree; 591 585 struct btrfs_root *root = BTRFS_I(inode)->root; 592 586 struct rb_node *node; 587 + bool dec_pending_ordered = false; 593 588 594 589 tree = &BTRFS_I(inode)->ordered_tree; 595 590 spin_lock_irq(&tree->lock); ··· 600 593 if (tree->last == node) 601 594 tree->last = NULL; 602 595 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 596 + if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) 597 + dec_pending_ordered = true; 603 598 spin_unlock_irq(&tree->lock); 599 + 600 + /* 601 + * The current running transaction is waiting on us, we need to let it 602 + * know that we're complete and wake it up. 603 + */ 604 + if (dec_pending_ordered) { 605 + struct btrfs_transaction *trans; 606 + 607 + /* 608 + * The checks for trans are just a formality, it should be set, 609 + * but if it isn't we don't want to deref/assert under the spin 610 + * lock, so be nice and check if trans is set, but ASSERT() so 611 + * if it isn't set a developer will notice. 
612 + */ 613 + spin_lock(&root->fs_info->trans_lock); 614 + trans = root->fs_info->running_transaction; 615 + if (trans) 616 + atomic_inc(&trans->use_count); 617 + spin_unlock(&root->fs_info->trans_lock); 618 + 619 + ASSERT(trans); 620 + if (trans) { 621 + if (atomic_dec_and_test(&trans->pending_ordered)) 622 + wake_up(&trans->pending_wait); 623 + btrfs_put_transaction(trans); 624 + } 625 + } 604 626 605 627 spin_lock(&root->ordered_extent_lock); 606 628 list_del_init(&entry->root_extent_list);
+2
fs/btrfs/ordered-data.h
··· 73 73 74 74 #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent 75 75 * in the logging code. */ 76 + #define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to 77 + * complete in the current transaction. */ 76 78 struct btrfs_ordered_extent { 77 79 /* logical offset in the file */ 78 80 u64 file_offset;
+6 -28
fs/btrfs/transaction.c
··· 232 232 extwriter_counter_init(cur_trans, type); 233 233 init_waitqueue_head(&cur_trans->writer_wait); 234 234 init_waitqueue_head(&cur_trans->commit_wait); 235 + init_waitqueue_head(&cur_trans->pending_wait); 235 236 cur_trans->state = TRANS_STATE_RUNNING; 236 237 /* 237 238 * One for this trans handle, one so it will live on until we ··· 240 239 */ 241 240 atomic_set(&cur_trans->use_count, 2); 242 241 cur_trans->have_free_bgs = 0; 242 + atomic_set(&cur_trans->pending_ordered, 0); 243 243 cur_trans->start_time = get_seconds(); 244 244 cur_trans->dirty_bg_run = 0; 245 245 ··· 268 266 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 269 267 INIT_LIST_HEAD(&cur_trans->pending_chunks); 270 268 INIT_LIST_HEAD(&cur_trans->switch_commits); 271 - INIT_LIST_HEAD(&cur_trans->pending_ordered); 272 269 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 273 270 INIT_LIST_HEAD(&cur_trans->io_bgs); 274 271 INIT_LIST_HEAD(&cur_trans->dropped_roots); ··· 552 551 h->can_flush_pending_bgs = true; 553 552 INIT_LIST_HEAD(&h->qgroup_ref_list); 554 553 INIT_LIST_HEAD(&h->new_bgs); 555 - INIT_LIST_HEAD(&h->ordered); 556 554 557 555 smp_mb(); 558 556 if (cur_trans->state >= TRANS_STATE_BLOCKED && ··· 783 783 784 784 if (!list_empty(&trans->new_bgs)) 785 785 btrfs_create_pending_block_groups(trans, root); 786 - 787 - if (!list_empty(&trans->ordered)) { 788 - spin_lock(&info->trans_lock); 789 - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); 790 - spin_unlock(&info->trans_lock); 791 - } 792 786 793 787 trans->delayed_ref_updates = 0; 794 788 if (!trans->sync) { ··· 1782 1788 } 1783 1789 1784 1790 static inline void 1785 - btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, 1786 - struct btrfs_fs_info *fs_info) 1791 + btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) 1787 1792 { 1788 - struct btrfs_ordered_extent *ordered; 1789 - 1790 - spin_lock(&fs_info->trans_lock); 1791 - while (!list_empty(&cur_trans->pending_ordered)) { 1792 - ordered = 
list_first_entry(&cur_trans->pending_ordered, 1793 - struct btrfs_ordered_extent, 1794 - trans_list); 1795 - list_del_init(&ordered->trans_list); 1796 - spin_unlock(&fs_info->trans_lock); 1797 - 1798 - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, 1799 - &ordered->flags)); 1800 - btrfs_put_ordered_extent(ordered); 1801 - spin_lock(&fs_info->trans_lock); 1802 - } 1803 - spin_unlock(&fs_info->trans_lock); 1793 + wait_event(cur_trans->pending_wait, 1794 + atomic_read(&cur_trans->pending_ordered) == 0); 1804 1795 } 1805 1796 1806 1797 int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ··· 1869 1890 } 1870 1891 1871 1892 spin_lock(&root->fs_info->trans_lock); 1872 - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); 1873 1893 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1874 1894 spin_unlock(&root->fs_info->trans_lock); 1875 1895 atomic_inc(&cur_trans->use_count); ··· 1927 1949 1928 1950 btrfs_wait_delalloc_flush(root->fs_info); 1929 1951 1930 - btrfs_wait_pending_ordered(cur_trans, root->fs_info); 1952 + btrfs_wait_pending_ordered(cur_trans); 1931 1953 1932 1954 btrfs_scrub_pause(root); 1933 1955 /*
+2 -2
fs/btrfs/transaction.h
··· 46 46 */ 47 47 atomic_t num_writers; 48 48 atomic_t use_count; 49 + atomic_t pending_ordered; 49 50 50 51 /* 51 52 * true if there is free bgs operations in this transaction ··· 60 59 unsigned long start_time; 61 60 wait_queue_head_t writer_wait; 62 61 wait_queue_head_t commit_wait; 62 + wait_queue_head_t pending_wait; 63 63 struct list_head pending_snapshots; 64 64 struct list_head pending_chunks; 65 - struct list_head pending_ordered; 66 65 struct list_head switch_commits; 67 66 struct list_head dirty_bgs; 68 67 struct list_head io_bgs; ··· 130 129 */ 131 130 struct btrfs_root *root; 132 131 struct seq_list delayed_ref_elem; 133 - struct list_head ordered; 134 132 struct list_head qgroup_ref_list; 135 133 struct list_head new_bgs; 136 134 };