[XFS] Prevent a deadlock when xfslogd unpins inodes.

The previous fixes for the use after free in xfs_iunpin left a nasty log
deadlock when xfslogd unpinned the inode and dropped the last reference to
the inode. The ->clear_inode() method can issue transactions, and if the
log was full, the transaction could push on the log and get stuck trying
to push the inode it was currently unpinning.

To fix this, we provide xfs_iunpin a guarantee that it will always have a
valid xfs_inode <-> linux inode link or a particular flag will be set on
the inode. We then use log forces during lookup to ensure transactions are
completed before we recycle the inode. This ensures that xfs_iunpin will
never use the linux inode after it has been freed, and any lookup on an
inode on the reclaim list will wait until it is safe to attach a new linux
inode to the xfs inode.

SGI-PV: 956832
SGI-Modid: xfs-linux-melb:xfs-kern:27359a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Shailendra Tripathi <stripathi@agami.com>
Signed-off-by: Takenori Nagano <t-nagano@ah.jp.nec.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>

authored by David Chinner and committed by Tim Shimmin 4c60658e 7a18c386

+69 -35
+30
fs/xfs/xfs_iget.c
··· 237 238 goto again; 239 } 240 241 vn_trace_exit(vp, "xfs_iget.alloc", 242 (inst_t *)__return_address);
··· 237 238 goto again; 239 } 240 + ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 241 + 242 + /* 243 + * If lookup is racing with unlink, then we 244 + * should return an error immediately so we 245 + * don't remove it from the reclaim list and 246 + * potentially leak the inode. 247 + */ 248 + if ((ip->i_d.di_mode == 0) && 249 + !(flags & XFS_IGET_CREATE)) { 250 + read_unlock(&ih->ih_lock); 251 + return ENOENT; 252 + } 253 + 254 + /* 255 + * There may be transactions sitting in the 256 + * incore log buffers or being flushed to disk 257 + * at this time. We can't clear the 258 + * XFS_IRECLAIMABLE flag until these 259 + * transactions have hit the disk, otherwise we 260 + * will void the guarantee the flag provides 261 + * xfs_iunpin() 262 + */ 263 + if (xfs_ipincount(ip)) { 264 + read_unlock(&ih->ih_lock); 265 + xfs_log_force(mp, 0, 266 + XFS_LOG_FORCE|XFS_LOG_SYNC); 267 + XFS_STATS_INC(xs_ig_frecycle); 268 + goto again; 269 + } 270 271 vn_trace_exit(vp, "xfs_iget.alloc", 272 (inst_t *)__return_address);
+25 -28
fs/xfs/xfs_inode.c
··· 2741 { 2742 ASSERT(atomic_read(&ip->i_pincount) > 0); 2743 2744 - if (atomic_dec_and_test(&ip->i_pincount)) { 2745 - /* 2746 - * If the inode is currently being reclaimed, the 2747 - * linux inode _and_ the xfs vnode may have been 2748 - * freed so we cannot reference either of them safely. 2749 - * Hence we should not try to do anything to them 2750 - * if the xfs inode is currently in the reclaim 2751 - * path. 2752 - * 2753 - * However, we still need to issue the unpin wakeup 2754 - * call as the inode reclaim may be blocked waiting for 2755 - * the inode to become unpinned. 2756 - */ 2757 - struct inode *inode = NULL; 2758 2759 - spin_lock(&ip->i_flags_lock); 2760 if (!__xfs_iflags_test(ip, XFS_IRECLAIM|XFS_IRECLAIMABLE)) { 2761 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 2762 2763 /* make sync come back and flush this inode */ 2764 - if (vp) { 2765 - inode = vn_to_inode(vp); 2766 - 2767 - if (!(inode->i_state & 2768 - (I_NEW|I_FREEING|I_CLEAR))) { 2769 - inode = igrab(inode); 2770 - if (inode) 2771 - mark_inode_dirty_sync(inode); 2772 - } else 2773 - inode = NULL; 2774 - } 2775 } 2776 spin_unlock(&ip->i_flags_lock); 2777 wake_up(&ip->i_ipin_wait); 2778 - if (inode) 2779 - iput(inode); 2780 } 2781 } 2782
··· 2741 { 2742 ASSERT(atomic_read(&ip->i_pincount) > 0); 2743 2744 + if (atomic_dec_and_lock(&ip->i_pincount, &ip->i_flags_lock)) { 2745 2746 + /* 2747 + * If the inode is currently being reclaimed, the link between 2748 + * the bhv_vnode and the xfs_inode will be broken after the 2749 + * XFS_IRECLAIM* flag is set. Hence, if these flags are not 2750 + * set, then we can move forward and mark the linux inode dirty 2751 + * knowing that it is still valid as it won't freed until after 2752 + * the bhv_vnode<->xfs_inode link is broken in xfs_reclaim. The 2753 + * i_flags_lock is used to synchronise the setting of the 2754 + * XFS_IRECLAIM* flags and the breaking of the link, and so we 2755 + * can execute atomically w.r.t to reclaim by holding this lock 2756 + * here. 2757 + * 2758 + * However, we still need to issue the unpin wakeup call as the 2759 + * inode reclaim may be blocked waiting for the inode to become 2760 + * unpinned. 2761 + */ 2762 + 2763 if (!__xfs_iflags_test(ip, XFS_IRECLAIM|XFS_IRECLAIMABLE)) { 2764 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 2765 + struct inode *inode = NULL; 2766 + 2767 + BUG_ON(vp == NULL); 2768 + inode = vn_to_inode(vp); 2769 + BUG_ON(inode->i_state & I_CLEAR); 2770 2771 /* make sync come back and flush this inode */ 2772 + if (!(inode->i_state & (I_NEW|I_FREEING))) 2773 + mark_inode_dirty_sync(inode); 2774 } 2775 spin_unlock(&ip->i_flags_lock); 2776 wake_up(&ip->i_ipin_wait); 2777 } 2778 } 2779
+14 -7
fs/xfs/xfs_vnodeops.c
··· 3827 */ 3828 xfs_synchronize_atime(ip); 3829 3830 - /* If we have nothing to flush with this inode then complete the 3831 - * teardown now, otherwise break the link between the xfs inode 3832 - * and the linux inode and clean up the xfs inode later. This 3833 - * avoids flushing the inode to disk during the delete operation 3834 - * itself. 3835 */ 3836 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 3837 xfs_ilock(ip, XFS_ILOCK_EXCL); ··· 3845 } else { 3846 xfs_mount_t *mp = ip->i_mount; 3847 3848 - /* Protect sync from us */ 3849 XFS_MOUNT_ILOCK(mp); 3850 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); 3851 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); 3852 - xfs_iflags_set(ip, XFS_IRECLAIMABLE); 3853 XFS_MOUNT_IUNLOCK(mp); 3854 } 3855 return 0;
··· 3827 */ 3828 xfs_synchronize_atime(ip); 3829 3830 + /* 3831 + * If we have nothing to flush with this inode then complete the 3832 + * teardown now, otherwise break the link between the xfs inode and the 3833 + * linux inode and clean up the xfs inode later. This avoids flushing 3834 + * the inode to disk during the delete operation itself. 3835 + * 3836 + * When breaking the link, we need to set the XFS_IRECLAIMABLE flag 3837 + * first to ensure that xfs_iunpin() will never see an xfs inode 3838 + * that has a linux inode being reclaimed. Synchronisation is provided 3839 + * by the i_flags_lock. 3840 */ 3841 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 3842 xfs_ilock(ip, XFS_ILOCK_EXCL); ··· 3840 } else { 3841 xfs_mount_t *mp = ip->i_mount; 3842 3843 + /* Protect sync and unpin from us */ 3844 XFS_MOUNT_ILOCK(mp); 3845 + spin_lock(&ip->i_flags_lock); 3846 + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 3847 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); 3848 + spin_unlock(&ip->i_flags_lock); 3849 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); 3850 XFS_MOUNT_IUNLOCK(mp); 3851 } 3852 return 0;