Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfs: inodes are new until the dentry cache is set up

Al Viro noticed a generic set of issues to do with filehandle lookup
racing with dentry cache setup. They involve a filehandle lookup
occurring while an inode is being created and the filehandle lookup
racing with the dentry creation for the real file. This can lead to
multiple dentries for the one path being instantiated. There are a
host of other issues around this same set of paths.

The underlying cause is that file handle lookup only waits on inode
cache instantiation rather than full dentry cache instantiation. XFS
is mostly immune to the problems discovered due to its own internal
inode cache, but there are a couple of corner cases where races can
happen.

We currently clear the XFS_INEW flag when the inode is fully set up
after insertion into the cache. Newly allocated inodes are inserted
locked and so aren't usable until the allocation transaction
commits. This, however, occurs before the dentry and security
information is fully initialised and hence the inode is unlocked and
available for lookups to find too early.

To solve the problem, only clear the XFS_INEW flag for newly created
inodes once the dentry is fully instantiated. This means lookups
will retry until the XFS_INEW flag is removed from the inode and
hence avoids the race conditions in question.

This also means that xfs_create(), xfs_create_tmpfile() and
xfs_symlink() need to finish the setup of the inode in their error
paths if we had allocated the inode but failed later in the creation
process. xfs_symlink(), in particular, needed a lot of help to make
its error handling match that of xfs_create().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

authored by

Dave Chinner and committed by
Dave Chinner
58c90473 c517d838

+90 -55
+2 -2
fs/xfs/xfs_icache.c
··· 439 439 *ipp = ip; 440 440 441 441 /* 442 - * If we have a real type for an on-disk inode, we can set ops(&unlock) 442 + * If we have a real type for an on-disk inode, we can setup the inode 443 443 * now. If it's a new inode being created, xfs_ialloc will handle it. 444 444 */ 445 445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 446 - xfs_setup_inode(ip); 446 + xfs_setup_existing_inode(ip); 447 447 return 0; 448 448 449 449 out_error_or_again:
+13 -9
fs/xfs/xfs_inode.c
··· 818 818 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 819 819 xfs_trans_log_inode(tp, ip, flags); 820 820 821 - /* now that we have an i_mode we can setup inode ops and unlock */ 821 + /* now that we have an i_mode we can setup the inode structure */ 822 822 xfs_setup_inode(ip); 823 823 824 824 *ipp = ip; ··· 1235 1235 xfs_trans_cancel(tp, cancel_flags); 1236 1236 out_release_inode: 1237 1237 /* 1238 - * Wait until after the current transaction is aborted to 1239 - * release the inode. This prevents recursive transactions 1240 - * and deadlocks from xfs_inactive. 1238 + * Wait until after the current transaction is aborted to finish the 1239 + * setup of the inode and release the inode. This prevents recursive 1240 + * transactions and deadlocks from xfs_inactive. 1241 1241 */ 1242 - if (ip) 1242 + if (ip) { 1243 + xfs_finish_inode_setup(ip); 1243 1244 IRELE(ip); 1245 + } 1244 1246 1245 1247 xfs_qm_dqrele(udqp); 1246 1248 xfs_qm_dqrele(gdqp); ··· 1347 1345 xfs_trans_cancel(tp, cancel_flags); 1348 1346 out_release_inode: 1349 1347 /* 1350 - * Wait until after the current transaction is aborted to 1351 - * release the inode. This prevents recursive transactions 1352 - * and deadlocks from xfs_inactive. 1348 + * Wait until after the current transaction is aborted to finish the 1349 + * setup of the inode and release the inode. This prevents recursive 1350 + * transactions and deadlocks from xfs_inactive. 1353 1351 */ 1354 - if (ip) 1352 + if (ip) { 1353 + xfs_finish_inode_setup(ip); 1355 1354 IRELE(ip); 1355 + } 1356 1356 1357 1357 xfs_qm_dqrele(udqp); 1358 1358 xfs_qm_dqrele(gdqp);
+22
fs/xfs/xfs_inode.h
··· 390 390 int xfs_iozero(struct xfs_inode *, loff_t, size_t); 391 391 392 392 393 + /* from xfs_iops.c */ 394 + /* 395 + * When setting up a newly allocated inode, we need to call 396 + * xfs_finish_inode_setup() once the inode is fully instantiated at 397 + * the VFS level to prevent the rest of the world seeing the inode 398 + * before we've completed instantiation. Otherwise we can do it 399 + * the moment the inode lookup is complete. 400 + */ 401 + extern void xfs_setup_inode(struct xfs_inode *ip); 402 + static inline void xfs_finish_inode_setup(struct xfs_inode *ip) 403 + { 404 + xfs_iflags_clear(ip, XFS_INEW); 405 + barrier(); 406 + unlock_new_inode(VFS_I(ip)); 407 + } 408 + 409 + static inline void xfs_setup_existing_inode(struct xfs_inode *ip) 410 + { 411 + xfs_setup_inode(ip); 412 + xfs_finish_inode_setup(ip); 413 + } 414 + 393 415 #define IHOLD(ip) \ 394 416 do { \ 395 417 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+10 -14
fs/xfs/xfs_iops.c
··· 187 187 else 188 188 d_instantiate(dentry, inode); 189 189 190 + xfs_finish_inode_setup(ip); 191 + 190 192 out_free_acl: 191 193 if (default_acl) 192 194 posix_acl_release(default_acl); ··· 197 195 return error; 198 196 199 197 out_cleanup_inode: 198 + xfs_finish_inode_setup(ip); 200 199 if (!tmpfile) 201 200 xfs_cleanup_inode(dir, inode, dentry); 202 201 iput(inode); ··· 370 367 goto out_cleanup_inode; 371 368 372 369 d_instantiate(dentry, inode); 370 + xfs_finish_inode_setup(cip); 373 371 return 0; 374 372 375 373 out_cleanup_inode: 374 + xfs_finish_inode_setup(cip); 376 375 xfs_cleanup_inode(dir, inode, dentry); 377 376 iput(inode); 378 377 out: ··· 1241 1236 } 1242 1237 1243 1238 /* 1244 - * Initialize the Linux inode, set up the operation vectors and 1245 - * unlock the inode. 1239 + * Initialize the Linux inode and set up the operation vectors. 1246 1240 * 1247 - * When reading existing inodes from disk this is called directly 1248 - * from xfs_iget, when creating a new inode it is called from 1249 - * xfs_ialloc after setting up the inode. 1250 - * 1251 - * We are always called with an uninitialised linux inode here. 1252 - * We need to initialise the necessary fields and take a reference 1253 - * on it. 1241 + * When reading existing inodes from disk this is called directly from xfs_iget, 1242 + * when creating a new inode it is called from xfs_ialloc after setting up the 1243 + * inode. These callers have different criteria for clearing XFS_INEW, so leave 1244 + * it up to the caller to deal with unlocking the inode appropriately. 1254 1245 */ 1255 1246 void 1256 1247 xfs_setup_inode( ··· 1333 1332 inode_has_no_xattr(inode); 1334 1333 cache_no_acl(inode); 1335 1334 } 1336 - 1337 - xfs_iflags_clear(ip, XFS_INEW); 1338 - barrier(); 1339 - 1340 - unlock_new_inode(inode); 1341 1335 }
-2
fs/xfs/xfs_iops.h
··· 25 25 26 26 extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 27 27 28 - extern void xfs_setup_inode(struct xfs_inode *); 29 - 30 28 /* 31 29 * Internal setattr interfaces. 32 30 */
+9 -4
fs/xfs/xfs_qm.c
··· 719 719 xfs_trans_t *tp; 720 720 int error; 721 721 int committed; 722 + bool need_alloc = true; 722 723 723 724 *ip = NULL; 724 725 /* ··· 748 747 return error; 749 748 mp->m_sb.sb_gquotino = NULLFSINO; 750 749 mp->m_sb.sb_pquotino = NULLFSINO; 750 + need_alloc = false; 751 751 } 752 752 } 753 753 ··· 760 758 return error; 761 759 } 762 760 763 - if (!*ip) { 761 + if (need_alloc) { 764 762 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, 765 763 &committed); 766 764 if (error) { ··· 796 794 spin_unlock(&mp->m_sb_lock); 797 795 xfs_log_sb(tp); 798 796 799 - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 797 + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 798 + if (error) { 799 + ASSERT(XFS_FORCED_SHUTDOWN(mp)); 800 800 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 801 - return error; 802 801 } 803 - return 0; 802 + if (need_alloc) 803 + xfs_finish_inode_setup(*ip); 804 + return error; 804 805 } 805 806 806 807
+34 -24
fs/xfs/xfs_symlink.c
··· 177 177 int pathlen; 178 178 struct xfs_bmap_free free_list; 179 179 xfs_fsblock_t first_block; 180 - bool unlock_dp_on_error = false; 180 + bool unlock_dp_on_error = false; 181 181 uint cancel_flags; 182 182 int committed; 183 183 xfs_fileoff_t first_fsb; ··· 221 221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 222 222 &udqp, &gdqp, &pdqp); 223 223 if (error) 224 - goto std_return; 224 + return error; 225 225 226 226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 227 227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; ··· 241 241 } 242 242 if (error) { 243 243 cancel_flags = 0; 244 - goto error_return; 244 + goto out_trans_cancel; 245 245 } 246 246 247 247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); ··· 252 252 */ 253 253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 254 254 error = -EPERM; 255 - goto error_return; 255 + goto out_trans_cancel; 256 256 } 257 257 258 258 /* ··· 261 261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, 262 262 pdqp, resblks, 1, 0); 263 263 if (error) 264 - goto error_return; 264 + goto out_trans_cancel; 265 265 266 266 /* 267 267 * Check for ability to enter directory entry, if no space reserved. ··· 269 269 if (!resblks) { 270 270 error = xfs_dir_canenter(tp, dp, link_name); 271 271 if (error) 272 - goto error_return; 272 + goto out_trans_cancel; 273 273 } 274 274 /* 275 275 * Initialize the bmap freelist prior to calling either ··· 282 282 */ 283 283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 284 284 prid, resblks > 0, &ip, NULL); 285 - if (error) { 286 - if (error == -ENOSPC) 287 - goto error_return; 288 - goto error1; 289 - } 285 + if (error) 286 + goto out_trans_cancel; 290 287 291 288 /* 292 - * An error after we've joined dp to the transaction will result in the 293 - * transaction cancel unlocking dp so don't do it explicitly in the 289 + * Now we join the directory inode to the transaction. 
We do not do it 290 + * earlier because xfs_dir_ialloc might commit the previous transaction 291 + * (and release all the locks). An error from here on will result in 292 + * the transaction cancel unlocking dp so don't do it explicitly in the 294 293 * error path. 295 294 */ 296 295 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); ··· 329 330 XFS_BMAPI_METADATA, &first_block, resblks, 330 331 mval, &nmaps, &free_list); 331 332 if (error) 332 - goto error2; 333 + goto out_bmap_cancel; 333 334 334 335 if (resblks) 335 336 resblks -= fs_blocks; ··· 347 348 BTOBB(byte_cnt), 0); 348 349 if (!bp) { 349 350 error = -ENOMEM; 350 - goto error2; 351 + goto out_bmap_cancel; 351 352 } 352 353 bp->b_ops = &xfs_symlink_buf_ops; 353 354 ··· 377 378 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 378 379 &first_block, &free_list, resblks); 379 380 if (error) 380 - goto error2; 381 + goto out_bmap_cancel; 381 382 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 382 383 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 383 384 ··· 391 392 } 392 393 393 394 error = xfs_bmap_finish(&tp, &free_list, &committed); 394 - if (error) { 395 - goto error2; 396 - } 395 + if (error) 396 + goto out_bmap_cancel; 397 + 397 398 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 399 + if (error) 400 + goto out_release_inode; 401 + 398 402 xfs_qm_dqrele(udqp); 399 403 xfs_qm_dqrele(gdqp); 400 404 xfs_qm_dqrele(pdqp); ··· 405 403 *ipp = ip; 406 404 return 0; 407 405 408 - error2: 409 - IRELE(ip); 410 - error1: 406 + out_bmap_cancel: 411 407 xfs_bmap_cancel(&free_list); 412 408 cancel_flags |= XFS_TRANS_ABORT; 413 - error_return: 409 + out_trans_cancel: 414 410 xfs_trans_cancel(tp, cancel_flags); 411 + out_release_inode: 412 + /* 413 + * Wait until after the current transaction is aborted to finish the 414 + * setup of the inode and release the inode. This prevents recursive 415 + * transactions and deadlocks from xfs_inactive. 
416 + */ 417 + if (ip) { 418 + xfs_finish_inode_setup(ip); 419 + IRELE(ip); 420 + } 421 + 415 422 xfs_qm_dqrele(udqp); 416 423 xfs_qm_dqrele(gdqp); 417 424 xfs_qm_dqrele(pdqp); 418 425 419 426 if (unlock_dp_on_error) 420 427 xfs_iunlock(dp, XFS_ILOCK_EXCL); 421 - std_return: 422 428 return error; 423 429 } 424 430