Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfs: stop holding ILOCK over filldir callbacks

The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.

The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.

Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.

Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.

The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>

authored by

Dave Chinner and committed by
Dave Chinner
dbad7c99 0952c818

+36 -19
+3
fs/xfs/libxfs/xfs_dir2.c
··· 362 362 struct xfs_da_args *args; 363 363 int rval; 364 364 int v; /* type-checking value */ 365 + int lock_mode; 365 366 366 367 ASSERT(S_ISDIR(dp->i_d.di_mode)); 367 368 XFS_STATS_INC(xs_dir_lookup); ··· 388 387 if (ci_name) 389 388 args->op_flags |= XFS_DA_OP_CILOOKUP; 390 389 390 + lock_mode = xfs_ilock_data_map_shared(dp); 391 391 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { 392 392 rval = xfs_dir2_sf_lookup(args); 393 393 goto out_check_rval; ··· 421 419 } 422 420 } 423 421 out_free: 422 + xfs_iunlock(dp, lock_mode); 424 423 kmem_free(args); 425 424 return rval; 426 425 }
+8 -3
fs/xfs/xfs_dir2_readdir.c
··· 171 171 int wantoff; /* starting block offset */ 172 172 xfs_off_t cook; 173 173 struct xfs_da_geometry *geo = args->geo; 174 + int lock_mode; 174 175 175 176 /* 176 177 * If the block number in the offset is out of range, we're done. ··· 179 178 if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) 180 179 return 0; 181 180 181 + lock_mode = xfs_ilock_data_map_shared(dp); 182 182 error = xfs_dir3_block_read(NULL, dp, &bp); 183 + xfs_iunlock(dp, lock_mode); 183 184 if (error) 184 185 return error; 185 186 ··· 532 529 * current buffer, need to get another one. 533 530 */ 534 531 if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { 532 + int lock_mode; 535 533 534 + lock_mode = xfs_ilock_data_map_shared(dp); 536 535 error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, 537 536 &curoff, &bp); 537 + xfs_iunlock(dp, lock_mode); 538 538 if (error || !map_info->map_valid) 539 539 break; 540 540 ··· 659 653 struct xfs_da_args args = { NULL }; 660 654 int rval; 661 655 int v; 662 - uint lock_mode; 663 656 664 657 trace_xfs_readdir(dp); 665 658 ··· 671 666 args.dp = dp; 672 667 args.geo = dp->i_mount->m_dir_geo; 673 668 674 - lock_mode = xfs_ilock_data_map_shared(dp); 669 + xfs_ilock(dp, XFS_IOLOCK_SHARED); 675 670 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 676 671 rval = xfs_dir2_sf_getdents(&args, ctx); 677 672 else if ((rval = xfs_dir2_isblock(&args, &v))) ··· 680 675 rval = xfs_dir2_block_getdents(&args, ctx); 681 676 else 682 677 rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); 683 - xfs_iunlock(dp, lock_mode); 678 + xfs_iunlock(dp, XFS_IOLOCK_SHARED); 684 679 685 680 return rval; 686 681 }
+21 -13
fs/xfs/xfs_inode.c
··· 663 663 { 664 664 xfs_ino_t inum; 665 665 int error; 666 - uint lock_mode; 667 666 668 667 trace_xfs_lookup(dp, name); 669 668 670 669 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 671 670 return -EIO; 672 671 673 - lock_mode = xfs_ilock_data_map_shared(dp); 672 + xfs_ilock(dp, XFS_IOLOCK_SHARED); 674 673 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 675 - xfs_iunlock(dp, lock_mode); 676 - 677 674 if (error) 678 - goto out; 675 + goto out_unlock; 679 676 680 677 error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); 681 678 if (error) 682 679 goto out_free_name; 683 680 681 + xfs_iunlock(dp, XFS_IOLOCK_SHARED); 684 682 return 0; 685 683 686 684 out_free_name: 687 685 if (ci_name) 688 686 kmem_free(ci_name->name); 689 - out: 687 + out_unlock: 688 + xfs_iunlock(dp, XFS_IOLOCK_SHARED); 690 689 *ipp = NULL; 691 690 return error; 692 691 } ··· 1182 1183 goto out_trans_cancel; 1183 1184 1184 1185 1185 - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 1186 + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | 1187 + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); 1186 1188 unlock_dp_on_error = true; 1187 1189 1188 1190 xfs_bmap_init(&free_list, &first_block); ··· 1222 1222 * the transaction cancel unlocking dp so don't do it explicitly in the 1223 1223 * error path. 1224 1224 */ 1225 - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 1225 + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1226 1226 unlock_dp_on_error = false; 1227 1227 1228 1228 error = xfs_dir_createname(tp, dp, name, ip->i_ino, ··· 1295 1295 xfs_qm_dqrele(pdqp); 1296 1296 1297 1297 if (unlock_dp_on_error) 1298 - xfs_iunlock(dp, XFS_ILOCK_EXCL); 1298 + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1299 1299 return error; 1300 1300 } 1301 1301 ··· 1443 1443 if (error) 1444 1444 goto error_return; 1445 1445 1446 + xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); 1446 1447 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); 1447 1448 1448 1449 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); 1449 - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); 1450 + xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1450 1451 1451 1452 /* 1452 1453 * If we are using project inheritance, we only allow hard link ··· 2550 2549 goto out_trans_cancel; 2551 2550 } 2552 2551 2552 + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); 2553 2553 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); 2554 2554 2555 - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2555 + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 2556 2556 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 2557 2557 2558 2558 /* ··· 2934 2932 * whether the target directory is the same as the source 2935 2933 * directory, we can lock from 2 to 4 inodes. 2936 2934 */ 2935 + if (!new_parent) 2936 + xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); 2937 + else 2938 + xfs_lock_two_inodes(src_dp, target_dp, 2939 + XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); 2940 + 2937 2941 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 2938 2942 2939 2943 /* ··· 2947 2939 * we can rely on either trans_commit or trans_cancel to unlock 2948 2940 * them. 2949 2941 */ 2950 - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 2942 + xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 2951 2943 if (new_parent) 2952 - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 2944 + xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 2953 2945 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 2954 2946 if (target_ip) 2955 2947 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+4 -3
fs/xfs/xfs_symlink.c
··· 240 240 if (error) 241 241 goto out_trans_cancel; 242 242 243 - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 243 + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | 244 + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); 244 245 unlock_dp_on_error = true; 245 246 246 247 /* ··· 289 288 * the transaction cancel unlocking dp so don't do it explicitly in the 290 289 * error path. 291 290 */ 292 - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 291 + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 293 292 unlock_dp_on_error = false; 294 293 295 294 /* ··· 422 421 xfs_qm_dqrele(pdqp); 423 422 424 423 if (unlock_dp_on_error) 425 - xfs_iunlock(dp, XFS_ILOCK_EXCL); 424 + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 426 425 return error; 427 426 } 428 427