Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
xfs: xfs_swap_extents needs to handle dynamic fork offsets
xfs: fix missing error check in xfs_rtfree_range
xfs: fix stale inode flush avoidance
xfs: Remove inode iolock held check during allocation
xfs: reclaim all inodes by background tree walks
xfs: Avoid inodes in reclaim when flushing from inode cache
xfs: reclaim inodes under a write lock

 8 files changed, 201 insertions(+), 130 deletions(-)
+6 -8
fs/xfs/linux-2.6/xfs_super.c
···
 	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
 
 	/*
-	 * If we have nothing to flush with this inode then complete the
-	 * teardown now, otherwise delay the flush operation.
+	 * We always use background reclaim here because even if the
+	 * inode is clean, it still may be under IO and hence we have
+	 * to take the flush lock. The background reclaim path handles
+	 * this more efficiently than we can here, so simply let background
+	 * reclaim tear down all inodes.
 	 */
-	if (!xfs_inode_clean(ip)) {
-		xfs_inode_set_reclaim_tag(ip);
-		return;
-	}
-
 out_reclaim:
-	xfs_ireclaim(ip);
+	xfs_inode_set_reclaim_tag(ip);
 }
 
 /*
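
The xfs_super.c hunk above makes the inode destroy path unconditional: it only tags the inode for reclaim, and the background tree walk does the actual teardown. As a rough illustration of that tag-and-defer pattern, here is a minimal self-contained userspace sketch using pthreads; struct obj, obj_destroy() and reclaimer() are hypothetical stand-ins, not the XFS implementation:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* hypothetical object; the list link stands in for the radix-tree tag */
struct obj {
	int		id;
	struct obj	*next;
};

static struct obj *reclaim_list;
static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_wake = PTHREAD_COND_INITIALIZER;
static int done;

/* analogue of the new destroy path: tag for reclaim, never tear down here */
static void obj_destroy(struct obj *o)
{
	pthread_mutex_lock(&reclaim_lock);
	o->next = reclaim_list;
	reclaim_list = o;
	pthread_cond_signal(&reclaim_wake);
	pthread_mutex_unlock(&reclaim_lock);
}

/* analogue of the background reclaim walk doing the real teardown */
static void *reclaimer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&reclaim_lock);
	while (!done || reclaim_list) {
		while (reclaim_list) {
			struct obj *o = reclaim_list;

			reclaim_list = o->next;
			pthread_mutex_unlock(&reclaim_lock);
			printf("reclaimed obj %d\n", o->id);	/* flush/teardown work */
			free(o);
			pthread_mutex_lock(&reclaim_lock);
		}
		if (!done)
			pthread_cond_wait(&reclaim_wake, &reclaim_lock);
	}
	pthread_mutex_unlock(&reclaim_lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int i;

	pthread_create(&tid, NULL, reclaimer, NULL);
	for (i = 0; i < 4; i++) {
		struct obj *o = malloc(sizeof(*o));

		o->id = i;
		obj_destroy(o);		/* cheap in the destroy path */
	}
	pthread_mutex_lock(&reclaim_lock);
	done = 1;
	pthread_cond_broadcast(&reclaim_wake);
	pthread_mutex_unlock(&reclaim_lock);
	pthread_join(tid, NULL);
	return 0;
}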
+86 -97
fs/xfs/linux-2.6/xfs_sync.c
···
 	 * as the tree is sparse and a gang lookup walks to find
 	 * the number of objects requested.
 	 */
-	read_lock(&pag->pag_ici_lock);
 	if (tag == XFS_ICI_NO_TAG) {
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 				(void **)&ip, *first_index, 1);
···
 				(void **)&ip, *first_index, 1, tag);
 	}
 	if (!nr_found)
-		goto unlock;
+		return NULL;
 
 	/*
 	 * Update the index for the next lookup. Catch overflows
···
 	 */
 	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-		goto unlock;
-
+		return NULL;
 	return ip;
-
-unlock:
-	read_unlock(&pag->pag_ici_lock);
-	return NULL;
 }
 
 STATIC int
···
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive)
 {
 	struct xfs_perag	*pag = &mp->m_perag[ag];
 	uint32_t		first_index;
···
 		int		error = 0;
 		xfs_inode_t	*ip;
 
+		if (exclusive)
+			write_lock(&pag->pag_ici_lock);
+		else
+			read_lock(&pag->pag_ici_lock);
 		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-		if (!ip)
+		if (!ip) {
+			if (exclusive)
+				write_unlock(&pag->pag_ici_lock);
+			else
+				read_unlock(&pag->pag_ici_lock);
 			break;
+		}
 
+		/* execute releases pag->pag_ici_lock */
 		error = execute(ip, pag, flags);
 		if (error == EAGAIN) {
 			skipped++;
···
 		}
 		if (error)
 			last_error = error;
-		/*
-		 * bail out if the filesystem is corrupted.
-		 */
+
+		/* bail out if the filesystem is corrupted. */
 		if (error == EFSCORRUPTED)
 			break;
 
···
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive)
 {
 	int			error = 0;
 	int			last_error = 0;
···
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
 		if (!mp->m_perag[ag].pag_ici_init)
 			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+						exclusive);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
···
 	struct xfs_perag	*pag)
 {
 	struct inode		*inode = VFS_I(ip);
+	int			error = EFSCORRUPTED;
 
 	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		read_unlock(&pag->pag_ici_lock);
-		return EFSCORRUPTED;
-	}
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_unlock;
 
-	/*
-	 * If we can't get a reference on the inode, it must be in reclaim.
-	 * Leave it for the reclaim code to flush. Also avoid inodes that
-	 * haven't been fully initialised.
-	 */
-	if (!igrab(inode)) {
-		read_unlock(&pag->pag_ici_lock);
-		return ENOENT;
-	}
-	read_unlock(&pag->pag_ici_lock);
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	error = ENOENT;
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock;
 
-	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+	/* If we can't grab the inode, it must be on its way to reclaim. */
+	if (!igrab(inode))
+		goto out_unlock;
+
+	if (is_bad_inode(inode)) {
 		IRELE(ip);
-		return ENOENT;
+		goto out_unlock;
 	}
 
-	return 0;
+	/* inode is valid */
+	error = 0;
+out_unlock:
+	read_unlock(&pag->pag_ici_lock);
+	return error;
 }
 
 STATIC int
···
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG);
+				      XFS_ICI_NO_TAG, 0);
 	if (error)
 		return XFS_ERROR(error);
 
···
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG);
+				     XFS_ICI_NO_TAG, 0);
 }
 
 STATIC int
···
 	kthread_stop(mp->m_sync_task);
 }
 
-STATIC int
-xfs_reclaim_inode(
-	xfs_inode_t	*ip,
-	int		sync_mode)
-{
-	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
-	/* The hash lock here protects a thread in xfs_iget_core from
-	 * racing with us on linking the inode back with a vnode.
-	 * Once we have the XFS_IRECLAIM flag set it will not touch
-	 * us.
-	 */
-	write_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		return -EAGAIN;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(ip->i_mount, pag);
-
-	/*
-	 * If the inode is still dirty, then flush it out. If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
-
-	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
-	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_ireclaim(ip);
-	return 0;
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag *pag,
···
 }
 
 STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
-	int			flags)
+	int			sync_mode)
 {
-	/* ignore if already under reclaim */
-	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		read_unlock(&pag->pag_ici_lock);
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode. Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
 		return 0;
 	}
-	read_unlock(&pag->pag_ici_lock);
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
 
-	return xfs_reclaim_inode(ip, flags);
+	/*
+	 * If the inode is still dirty, then flush it out. If the inode
+	 * is not in the AIL, then it will be OK to flush it delwri as
+	 * long as xfs_iflush() does not keep any references to the inode.
+	 * We leave that decision up to xfs_iflush() since it has the
+	 * knowledge of whether it's OK to simply do a delwri flush of
+	 * the inode or whether we need to wait until the inode is
+	 * pulled from the AIL.
+	 * We get the flush lock regardless, though, just to make sure
+	 * we don't free it while it is being flushed.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_iflock(ip);
+
+	/*
+	 * In the case of a forced shutdown we rely on xfs_iflush() to
+	 * wait for the inode to be unpinned before returning an error.
+	 */
+	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+		/* synchronize with xfs_iflush_done */
+		xfs_iflock(ip);
+		xfs_ifunlock(ip);
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_ireclaim(ip);
+	return 0;
 }
 
 int
···
 	xfs_mount_t	*mp,
 	int		mode)
 {
-	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
-					XFS_ICI_RECLAIM_TAG);
+	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+					XFS_ICI_RECLAIM_TAG, 1);
 }
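
The xfs_sync.c changes above establish an unusual locking convention: xfs_inode_ag_walk() now takes pag->pag_ici_lock itself, shared or exclusive depending on the new argument, and every execute callback must drop that lock before returning. A minimal sketch of the same convention with a POSIX rwlock; walk(), execute_fn and print_item() are hypothetical names, not the kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static int tree[4] = { 1, 2, 3, 4 };	/* stands in for the per-AG radix tree */

/* like the iterator callbacks: must release tree_lock before returning */
typedef int (*execute_fn)(int item);

static int print_item(int item)
{
	int copy = item;			/* capture what we need under the lock */

	pthread_rwlock_unlock(&tree_lock);	/* callback drops the lock */
	printf("visited %d\n", copy);		/* blocking work with the lock dropped */
	return 0;
}

static int walk(execute_fn execute, int exclusive)
{
	int i, error;

	for (i = 0; i < 4; i++) {
		/* lock retaken per object, as in xfs_inode_ag_walk() */
		if (exclusive)
			pthread_rwlock_wrlock(&tree_lock);
		else
			pthread_rwlock_rdlock(&tree_lock);

		/* execute() releases tree_lock */
		error = execute(tree[i]);
		if (error)
			return error;
	}
	return 0;
}

int main(void)
{
	walk(print_item, 0);	/* shared walk: sync and quota callers */
	walk(print_item, 1);	/* exclusive walk: reclaim */
	return 0;
}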
+1 -1
fs/xfs/linux-2.6/xfs_sync.h
···
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag);
+	int flags, int tag, int write_lock);
 
 #endif
+1 -1
fs/xfs/quota/xfs_qm_syscalls.c
···
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
 }
 
 /*------------------------------------------------------------------------*/
+90 -16
fs/xfs/xfs_dfrag.c
···
 	return error;
 }
 
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking, so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. Basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip)	/* tmp inode */
+{
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return EINVAL;
+
+	/*
+	 * If the target inode has fewer extents than the temporary inode,
+	 * why did userspace call us?
+	 */
+	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+		return EINVAL;
+
+	/*
+	 * If the target inode is in extent form and the temp inode is in
+	 * btree form, we will end up with the target inode in the wrong
+	 * format, as we already know there are fewer extents in the temp
+	 * inode.
+	 */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+		return EINVAL;
+
+	/* Check temp in extent form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+		return EINVAL;
+
+	/* Check target in extent form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+		return EINVAL;
+
+	/* Check root block of temp in btree form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_BOFF(ip) &&
+	    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+		return EINVAL;
+
+	/* Check root block of target in btree form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_BOFF(tip) &&
+	    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+		return EINVAL;
+
+	return 0;
+}
+
 int
 xfs_swap_extents(
-	xfs_inode_t	*ip,
-	xfs_inode_t	*tip,
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
 	xfs_swapext_t	*sxp)
 {
 	xfs_mount_t	*mp;
···
 		goto out_unlock;
 	}
 
-	/* Should never get a local format */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
-	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		error = XFS_ERROR(EINVAL);
-		goto out_unlock;
-	}
-
 	if (VN_CACHED(VFS_I(tip)) != 0) {
 		error = xfs_flushinval_pages(tip, 0, -1,
 				FI_REMAPF_LOCKED);
···
 		goto out_unlock;
 	}
 
-	/*
-	 * If the target has extended attributes, the tmp file
-	 * must also in order to ensure the correct data fork
-	 * format.
-	 */
-	if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
-		error = XFS_ERROR(EINVAL);
+	/* check inode formats now that data is flushed */
+	error = xfs_swap_extents_check_format(ip, tip);
+	if (error) {
+		xfs_fs_cmn_err(CE_NOTE, mp,
+		    "%s: inode 0x%llx format is incompatible for exchanging.",
+				__FILE__, ip->i_ino);
 		goto out_unlock;
 	}
 
···
 	*tempifp = *ifp;	/* struct copy */
 	*ifp = *tifp;		/* struct copy */
 	*tifp = *tempifp;	/* struct copy */
+
+	/*
+	 * Fix the in-memory data fork values that are dependent on the fork
+	 * offset in the inode. We can't assume they remain the same as attr2
+	 * has dynamic fork offsets.
+	 */
+	ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+				(uint)sizeof(xfs_bmbt_rec_t);
+	tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+				(uint)sizeof(xfs_bmbt_rec_t);
 
 	/*
 	 * Fix the on-disk inode values
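
The if_ext_max fixup above is plain arithmetic: the in-core extent count limit is the data fork size divided by sizeof(xfs_bmbt_rec_t). The toy program below reproduces the 7-versus-6-extent scenario from the comment; the 16-byte record size and the 112/96-byte fork sizes are illustrative assumptions, not values taken from a real filesystem:

#include <stdio.h>

#define BMBT_REC_BYTES	16	/* sizeof(xfs_bmbt_rec_t): two 64-bit words */

/* analogue of XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / sizeof(xfs_bmbt_rec_t) */
static unsigned int ext_max(unsigned int data_fork_bytes)
{
	return data_fork_bytes / BMBT_REC_BYTES;
}

int main(void)
{
	/* hypothetical attr2 layouts: a bigger attr fork shrinks the data fork */
	unsigned int target_fork = 112;	/* room for 7 extent records */
	unsigned int temp_fork = 96;	/* room for 6 extent records */

	printf("target if_ext_max = %u\n", ext_max(target_fork));	/* 7 */
	printf("temp   if_ext_max = %u\n", ext_max(temp_fork));		/* 6 */

	/*
	 * A 7-extent file fits the target in extent format but needs a btree
	 * in the temp inode; swapping forks without recomputing if_ext_max
	 * (or without the format check above) would leave an inode claiming
	 * extent format while holding a btree root.
	 */
	return 0;
}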
-1
fs/xfs/xfs_iget.c
···
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
-	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 
+15 -6
fs/xfs/xfs_inode.c
···
 
 	/*
 	 * If the inode isn't dirty, then just release the inode flush lock and
-	 * do nothing. Treat stale inodes the same; we cannot rely on the
-	 * backing buffer remaining stale in cache for the remaining life of
-	 * the stale inode and so xfs_itobp() below may give us a buffer that
-	 * no longer contains inodes below. Doing this stale check here also
-	 * avoids forcing the log on pinned, stale inodes.
+	 * do nothing.
 	 */
-	if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+	if (xfs_inode_clean(ip)) {
 		xfs_ifunlock(ip);
 		return 0;
 	}
···
 		return EAGAIN;
 	}
 	xfs_iunpin_wait(ip);
+
+	/*
+	 * For stale inodes we cannot rely on the backing buffer remaining
+	 * stale in cache for the remaining life of the stale inode and so
+	 * xfs_itobp() below may give us a buffer that no longer contains
+	 * inodes below. We have to check this after ensuring the inode is
+	 * unpinned so that it is safe to reclaim the stale inode after the
+	 * flush call.
+	 */
+	if (xfs_iflags_test(ip, XFS_ISTALE)) {
+		xfs_ifunlock(ip);
+		return 0;
+	}
 
 	/*
 	 * This may have been unpinned because the filesystem is shutting
+2
fs/xfs/xfs_rtalloc.c
···
 	 */
 	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
 		&postblock);
+	if (error)
+		return error;
 	/*
 	 * If there are blocks not being freed at the front of the
 	 * old extent, add summary data for them to be allocated.