fs: Lock the inode LRU list separately

Introduce the inode_lru_lock to protect the inode_lru list. This
lock is nested inside the inode->i_lock so that the inode can be
added to the LRU list in iput_final() without needing to deal with
lock inversions. This keeps iput_final() clean and neat.
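
As a rough sketch of that nesting (modelled on the retain path of
iput_final() at this point in the series; this is not the verbatim
function, the drop/unhash/free paths are elided, and
inode_lru_list_add() is the wrapper added in the hunk below):

        /* iput_final(), simplified: inode->i_lock is already held here */
        if (!drop && (sb->s_flags & MS_ACTIVE)) {
                inode->i_state |= I_REFERENCED;
                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
                        inode_lru_list_add(inode); /* takes inode_lru_lock inside i_lock */
                spin_unlock(&inode->i_lock);
                return;
        }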

Further, when marking the inode I_FREEING and removing it from the
LRU, do the LRU list manipulation inside the inode->i_lock to keep
it consistent with iput_final(). This also means that most of the
open-coded LRU list removal and unused-inode accounting can now use
the inode_lru_list_del() wrapper, which cleans the code up further.
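
For illustration, the disposal loops in evict_inodes() and
invalidate_inodes() reduce to the pattern sketched below (a
simplified fragment; the surrounding list walk and busy-inode
checks are elided):

        spin_lock(&inode->i_lock);
        /* ... new, referenced or busy inodes have already been skipped ... */
        inode->i_state |= I_FREEING;
        inode_lru_list_del(inode);         /* takes inode_lru_lock internally */
        spin_unlock(&inode->i_lock);
        list_add(&inode->i_lru, &dispose); /* reaped later by dispose_list() */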

However, this locking change means that the LRU traversal in
prune_icache() inverts this lock ordering and needs to use trylock
semantics on the inode->i_lock to avoid deadlocking. In these cases,
if we fail to lock the inode we move it to the back of the LRU to
prevent spinning on it.
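
A minimal sketch of that trylock pattern, condensed from the
prune_icache() hunk below (the reclaim checks and the freeable-list
handling are elided):

        spin_lock(&inode_lru_lock);
        for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                struct inode *inode;

                if (list_empty(&inode_lru))
                        break;

                inode = list_entry(inode_lru.prev, struct inode, i_lru);

                /*
                 * The walk takes inode_lru_lock before inode->i_lock, the
                 * reverse of the documented order, so only trylock is safe.
                 */
                if (!spin_trylock(&inode->i_lock)) {
                        /* move the inode to the back of the list so we don't spin */
                        list_move(&inode->i_lru, &inode_lru);
                        continue;
                }

                /* ... with inode->i_lock held, decide whether it can be reclaimed ... */
                spin_unlock(&inode->i_lock);
        }
        spin_unlock(&inode_lru_lock);

Rotating a contended inode to the other end of the list keeps the scan
bounded by nr_to_scan instead of busy-waiting on a single locked inode.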

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Authored by Dave Chinner, committed by Al Viro (02afc410 b2b2af8e)

+30 -9
fs/inode.c
···
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
+ * inode_lru_lock protects:
+ *   inode_lru, inode->i_lru
  *
  * Lock ordering:
  * inode_lock
  *   inode->i_lock
+ *     inode_lru_lock
  */

 /*
···
  */

 static LIST_HEAD(inode_lru);
+static DEFINE_SPINLOCK(inode_lru_lock);
 static struct hlist_head *inode_hashtable __read_mostly;

 /*
···

 static void inode_lru_list_add(struct inode *inode)
 {
+        spin_lock(&inode_lru_lock);
         if (list_empty(&inode->i_lru)) {
                 list_add(&inode->i_lru, &inode_lru);
                 inodes_stat.nr_unused++;
         }
+        spin_unlock(&inode_lru_lock);
 }

 static void inode_lru_list_del(struct inode *inode)
 {
+        spin_lock(&inode_lru_lock);
         if (!list_empty(&inode->i_lru)) {
                 list_del_init(&inode->i_lru);
                 inodes_stat.nr_unused--;
         }
+        spin_unlock(&inode_lru_lock);
 }

 static inline void __inode_sb_list_add(struct inode *inode)
···
                 }

                 inode->i_state |= I_FREEING;
-                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-                        inodes_stat.nr_unused--;
+                inode_lru_list_del(inode);
                 spin_unlock(&inode->i_lock);
-                list_move(&inode->i_lru, &dispose);
+                list_add(&inode->i_lru, &dispose);
         }
         spin_unlock(&inode_lock);
···
                 }

                 inode->i_state |= I_FREEING;
-                if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-                        inodes_stat.nr_unused--;
+                inode_lru_list_del(inode);
                 spin_unlock(&inode->i_lock);
-                list_move(&inode->i_lru, &dispose);
+                list_add(&inode->i_lru, &dispose);
         }
         spin_unlock(&inode_lock);
···

 /*
  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lru_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed. If the inode has metadata buffers attached to
···

         down_read(&iprune_sem);
         spin_lock(&inode_lock);
+        spin_lock(&inode_lru_lock);
         for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                 struct inode *inode;

···
                 inode = list_entry(inode_lru.prev, struct inode, i_lru);

                 /*
+                 * we are inverting the inode_lru_lock/inode->i_lock here,
+                 * so use a trylock. If we fail to get the lock, just move the
+                 * inode to the back of the list so we don't spin on it.
+                 */
+                if (!spin_trylock(&inode->i_lock)) {
+                        list_move(&inode->i_lru, &inode_lru);
+                        continue;
+                }
+
+                /*
                  * Referenced or dirty inodes are still in use. Give them
                  * another pass through the LRU as we canot reclaim them now.
                  */
-                spin_lock(&inode->i_lock);
                 if (atomic_read(&inode->i_count) ||
                     (inode->i_state & ~I_REFERENCED)) {
                         spin_unlock(&inode->i_lock);
···
                 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                         __iget(inode);
                         spin_unlock(&inode->i_lock);
+                        spin_unlock(&inode_lru_lock);
                         spin_unlock(&inode_lock);
                         if (remove_inode_buffers(inode))
                                 reap += invalidate_mapping_pages(&inode->i_data,
                                                                  0, -1);
                         iput(inode);
                         spin_lock(&inode_lock);
+                        spin_lock(&inode_lru_lock);

                         if (inode != list_entry(inode_lru.next,
                                                 struct inode, i_lru))
                                 continue;        /* wrong inode or list_empty */
-                        spin_lock(&inode->i_lock);
+                        /* avoid lock inversions with trylock */
+                        if (!spin_trylock(&inode->i_lock))
+                                continue;
                         if (!can_unuse(inode)) {
                                 spin_unlock(&inode->i_lock);
                                 continue;
···
                 __count_vm_events(KSWAPD_INODESTEAL, reap);
         else
                 __count_vm_events(PGINODESTEAL, reap);
+        spin_unlock(&inode_lru_lock);
         spin_unlock(&inode_lock);

         dispose_list(&freeable);