Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Darrick Wong:
"For this cycle we have the usual pile of cleanups and bug fixes, some
performance improvements for online metadata scrubbing, massive
speedups in the directory entry creation code, some performance
improvement in the file ACL lookup code, a fix for a logging stall
during mount, and fixes for concurrency problems.

It has survived a couple of weeks of xfstests runs and merges cleanly.

Summary:

- Remove KM_SLEEP/KM_NOSLEEP.

- Ensure that memory buffers for IO are properly sector-aligned to
avoid problems that the block layer doesn't check.

- Make the bmap scrubber more efficient in its record checking.

- Don't crash xfs_db when superblock inode geometry is corrupt.

- Fix btree key helper functions.

- Remove unneeded error returns for things that can't fail.

- Fix buffer logging bugs in repair.

- Clean up iterator return values.

- Speed up directory entry creation.

- Enable allocation of xattr value memory buffer during lookup.

- Fix readahead racing with truncate/punch hole.

- Other minor cleanups.

- Fix one AGI/AGF deadlock with RENAME_WHITEOUT.

- More BUG -> WARN whackamole.

- Fix various problems with the log failing to advance under certain
circumstances, which results in stalls during mount."

* tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (45 commits)
xfs: push the grant head when the log head moves forward
xfs: push iclog state cleaning into xlog_state_clean_log
xfs: factor iclog state processing out of xlog_state_do_callback()
xfs: factor callbacks out of xlog_state_do_callback()
xfs: factor debug code out of xlog_state_do_callback()
xfs: prevent CIL push holdoff in log recovery
xfs: fix missed wakeup on l_flush_wait
xfs: push the AIL in xlog_grant_head_wake
xfs: Use WARN_ON_ONCE for bailout mount-operation
xfs: Fix deadlock between AGI and AGF with RENAME_WHITEOUT
xfs: define a flags field for the AG geometry ioctl structure
xfs: add a xfs_valid_startblock helper
xfs: remove the unused XFS_ALLOC_USERDATA flag
xfs: cleanup xfs_fsb_to_db
xfs: fix the dax supported check in xfs_ioctl_setattr_dax_invalidate
xfs: Fix stale data exposure when readahead races with hole punch
fs: Export generic_fadvise()
mm: Handle MADV_WILLNEED through vfs_fadvise()
xfs: allocate xattr buffer on demand
xfs: consolidate attribute value copying
...

+1338 -1112
+59 -20
fs/xfs/kmem.c
··· 3 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 4 * All Rights Reserved. 5 5 */ 6 - #include <linux/sched/mm.h> 6 + #include "xfs.h" 7 7 #include <linux/backing-dev.h> 8 - #include "kmem.h" 9 8 #include "xfs_message.h" 9 + #include "xfs_trace.h" 10 10 11 11 void * 12 12 kmem_alloc(size_t size, xfs_km_flags_t flags) ··· 15 15 gfp_t lflags = kmem_flags_convert(flags); 16 16 void *ptr; 17 17 18 + trace_kmem_alloc(size, flags, _RET_IP_); 19 + 18 20 do { 19 21 ptr = kmalloc(size, lflags); 20 - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 22 + if (ptr || (flags & KM_MAYFAIL)) 21 23 return ptr; 22 24 if (!(++retries % 100)) 23 25 xfs_err(NULL, ··· 30 28 } while (1); 31 29 } 32 30 33 - void * 34 - kmem_alloc_large(size_t size, xfs_km_flags_t flags) 31 + 32 + /* 33 + * __vmalloc() will allocate data pages and auxillary structures (e.g. 34 + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence 35 + * we need to tell memory reclaim that we are in such a context via 36 + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here 37 + * and potentially deadlocking. 38 + */ 39 + static void * 40 + __kmem_vmalloc(size_t size, xfs_km_flags_t flags) 35 41 { 36 42 unsigned nofs_flag = 0; 37 43 void *ptr; 38 - gfp_t lflags; 44 + gfp_t lflags = kmem_flags_convert(flags); 39 45 40 - ptr = kmem_alloc(size, flags | KM_MAYFAIL); 41 - if (ptr) 42 - return ptr; 43 - 44 - /* 45 - * __vmalloc() will allocate data pages and auxillary structures (e.g. 46 - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context 47 - * here. Hence we need to tell memory reclaim that we are in such a 48 - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering 49 - * the filesystem here and potentially deadlocking. 
50 - */ 51 46 if (flags & KM_NOFS) 52 47 nofs_flag = memalloc_nofs_save(); 53 48 54 - lflags = kmem_flags_convert(flags); 55 49 ptr = __vmalloc(size, lflags, PAGE_KERNEL); 56 50 57 51 if (flags & KM_NOFS) 58 52 memalloc_nofs_restore(nofs_flag); 59 53 60 54 return ptr; 55 + } 56 + 57 + /* 58 + * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned 59 + * to the @align_mask. We only guarantee alignment up to page size, we'll clamp 60 + * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE 61 + * aligned region. 62 + */ 63 + void * 64 + kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) 65 + { 66 + void *ptr; 67 + 68 + trace_kmem_alloc_io(size, flags, _RET_IP_); 69 + 70 + if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) 71 + align_mask = PAGE_SIZE - 1; 72 + 73 + ptr = kmem_alloc(size, flags | KM_MAYFAIL); 74 + if (ptr) { 75 + if (!((uintptr_t)ptr & align_mask)) 76 + return ptr; 77 + kfree(ptr); 78 + } 79 + return __kmem_vmalloc(size, flags); 80 + } 81 + 82 + void * 83 + kmem_alloc_large(size_t size, xfs_km_flags_t flags) 84 + { 85 + void *ptr; 86 + 87 + trace_kmem_alloc_large(size, flags, _RET_IP_); 88 + 89 + ptr = kmem_alloc(size, flags | KM_MAYFAIL); 90 + if (ptr) 91 + return ptr; 92 + return __kmem_vmalloc(size, flags); 61 93 } 62 94 63 95 void * ··· 101 65 gfp_t lflags = kmem_flags_convert(flags); 102 66 void *ptr; 103 67 68 + trace_kmem_realloc(newsize, flags, _RET_IP_); 69 + 104 70 do { 105 71 ptr = krealloc(old, newsize, lflags); 106 - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 72 + if (ptr || (flags & KM_MAYFAIL)) 107 73 return ptr; 108 74 if (!(++retries % 100)) 109 75 xfs_err(NULL, ··· 123 85 gfp_t lflags = kmem_flags_convert(flags); 124 86 void *ptr; 125 87 88 + trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_); 126 89 do { 127 90 ptr = kmem_cache_alloc(zone, lflags); 128 - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 91 + if (ptr || (flags & KM_MAYFAIL)) 129 92 return ptr; 130 93 if 
(!(++retries % 100)) 131 94 xfs_err(NULL,
+5 -10
fs/xfs/kmem.h
··· 16 16 */ 17 17 18 18 typedef unsigned __bitwise xfs_km_flags_t; 19 - #define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) 20 - #define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) 21 19 #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) 22 20 #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) 23 21 #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) ··· 30 32 { 31 33 gfp_t lflags; 32 34 33 - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); 35 + BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); 34 36 35 - if (flags & KM_NOSLEEP) { 36 - lflags = GFP_ATOMIC | __GFP_NOWARN; 37 - } else { 38 - lflags = GFP_KERNEL | __GFP_NOWARN; 39 - if (flags & KM_NOFS) 40 - lflags &= ~__GFP_FS; 41 - } 37 + lflags = GFP_KERNEL | __GFP_NOWARN; 38 + if (flags & KM_NOFS) 39 + lflags &= ~__GFP_FS; 42 40 43 41 /* 44 42 * Default page/slab allocator behavior is to retry for ever ··· 53 59 } 54 60 55 61 extern void *kmem_alloc(size_t, xfs_km_flags_t); 62 + extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); 56 63 extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); 57 64 extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); 58 65 static inline void kmem_free(const void *ptr)
+1 -1
fs/xfs/libxfs/xfs_alloc.c
··· 2205 2205 ASSERT(xfs_bmap_free_item_zone != NULL); 2206 2206 ASSERT(oinfo != NULL); 2207 2207 2208 - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 2208 + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); 2209 2209 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2210 2210 new->xefi_blockcount = 1; 2211 2211 new->xefi_oinfo = *oinfo;
+3 -4
fs/xfs/libxfs/xfs_alloc.h
··· 81 81 /* 82 82 * Defines for datatype 83 83 */ 84 - #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ 85 - #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ 86 - #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ 87 - #define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ 84 + #define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ 85 + #define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ 86 + #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ 88 87 89 88 static inline bool 90 89 xfs_alloc_is_userdata(int datatype)
+55 -24
fs/xfs/libxfs/xfs_attr.c
··· 97 97 * Overall external interface routines. 98 98 *========================================================================*/ 99 99 100 - /* Retrieve an extended attribute and its value. Must have ilock. */ 100 + /* 101 + * Retrieve an extended attribute and its value. Must have ilock. 102 + * Returns 0 on successful retrieval, otherwise an error. 103 + */ 101 104 int 102 105 xfs_attr_get_ilocked( 103 106 struct xfs_inode *ip, ··· 118 115 return xfs_attr_node_get(args); 119 116 } 120 117 121 - /* Retrieve an extended attribute by name, and its value. */ 118 + /* 119 + * Retrieve an extended attribute by name, and its value if requested. 120 + * 121 + * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, 122 + * just an indication whether the attribute exists and the size of the value if 123 + * it exists. The size is returned in @valuelenp, 124 + * 125 + * If the attribute is found, but exceeds the size limit set by the caller in 126 + * @valuelenp, return -ERANGE with the size of the attribute that was found in 127 + * @valuelenp. 128 + * 129 + * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after 130 + * existence of the attribute has been determined. On success, return that 131 + * buffer to the caller and leave them to free it. On failure, free any 132 + * allocated buffer and ensure the buffer pointer returned to the caller is 133 + * null. 
134 + */ 122 135 int 123 136 xfs_attr_get( 124 137 struct xfs_inode *ip, 125 138 const unsigned char *name, 126 - unsigned char *value, 139 + unsigned char **value, 127 140 int *valuelenp, 128 141 int flags) 129 142 { 130 143 struct xfs_da_args args; 131 144 uint lock_mode; 132 145 int error; 146 + 147 + ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); 133 148 134 149 XFS_STATS_INC(ip->i_mount, xs_attr_get); 135 150 ··· 158 137 if (error) 159 138 return error; 160 139 161 - args.value = value; 162 - args.valuelen = *valuelenp; 163 140 /* Entirely possible to look up a name which doesn't exist */ 164 141 args.op_flags = XFS_DA_OP_OKNOENT; 142 + if (flags & ATTR_ALLOC) 143 + args.op_flags |= XFS_DA_OP_ALLOCVAL; 144 + else 145 + args.value = *value; 146 + args.valuelen = *valuelenp; 165 147 166 148 lock_mode = xfs_ilock_attr_map_shared(ip); 167 149 error = xfs_attr_get_ilocked(ip, &args); 168 150 xfs_iunlock(ip, lock_mode); 169 - 170 151 *valuelenp = args.valuelen; 171 - return error == -EEXIST ? 0 : error; 152 + 153 + /* on error, we have to clean up allocated value buffers */ 154 + if (error) { 155 + if (flags & ATTR_ALLOC) { 156 + kmem_free(args.value); 157 + *value = NULL; 158 + } 159 + return error; 160 + } 161 + *value = args.value; 162 + return 0; 172 163 } 173 164 174 165 /* ··· 801 768 * 802 769 * This leaf block cannot have a "remote" value, we only call this routine 803 770 * if bmap_one_block() says there is only one block (ie: no remote blks). 771 + * 772 + * Returns 0 on successful retrieval, otherwise an error. 
804 773 */ 805 774 STATIC int 806 775 xfs_attr_leaf_get(xfs_da_args_t *args) ··· 824 789 } 825 790 error = xfs_attr3_leaf_getvalue(bp, args); 826 791 xfs_trans_brelse(args->trans, bp); 827 - if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { 828 - error = xfs_attr_rmtval_get(args); 829 - } 830 792 return error; 831 793 } 832 794 ··· 1300 1268 } 1301 1269 1302 1270 /* 1303 - * Look up a filename in a node attribute list. 1271 + * Retrieve the attribute data from a node attribute list. 1304 1272 * 1305 1273 * This routine gets called for any attribute fork that has more than one 1306 1274 * block, ie: both true Btree attr lists and for single-leaf-blocks with 1307 1275 * "remote" values taking up more blocks. 1276 + * 1277 + * Returns 0 on successful retrieval, otherwise an error. 1308 1278 */ 1309 1279 STATIC int 1310 1280 xfs_attr_node_get(xfs_da_args_t *args) ··· 1328 1294 error = xfs_da3_node_lookup_int(state, &retval); 1329 1295 if (error) { 1330 1296 retval = error; 1331 - } else if (retval == -EEXIST) { 1332 - blk = &state->path.blk[ state->path.active-1 ]; 1333 - ASSERT(blk->bp != NULL); 1334 - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 1335 - 1336 - /* 1337 - * Get the value, local or "remote" 1338 - */ 1339 - retval = xfs_attr3_leaf_getvalue(blk->bp, args); 1340 - if (!retval && (args->rmtblkno > 0) 1341 - && !(args->flags & ATTR_KERNOVAL)) { 1342 - retval = xfs_attr_rmtval_get(args); 1343 - } 1297 + goto out_release; 1344 1298 } 1299 + if (retval != -EEXIST) 1300 + goto out_release; 1301 + 1302 + /* 1303 + * Get the value, local or "remote" 1304 + */ 1305 + blk = &state->path.blk[state->path.active - 1]; 1306 + retval = xfs_attr3_leaf_getvalue(blk->bp, args); 1345 1307 1346 1308 /* 1347 1309 * If not in a transaction, we have to release all the buffers. 
1348 1310 */ 1311 + out_release: 1349 1312 for (i = 0; i < state->path.active; i++) { 1350 1313 xfs_trans_brelse(args->trans, state->path.blk[i].bp); 1351 1314 state->path.blk[i].bp = NULL;
+4 -2
fs/xfs/libxfs/xfs_attr.h
··· 37 37 #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ 38 38 39 39 #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ 40 + #define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ 40 41 41 42 #define XFS_ATTR_FLAGS \ 42 43 { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ ··· 48 47 { ATTR_REPLACE, "REPLACE" }, \ 49 48 { ATTR_KERNOTIME, "KERNOTIME" }, \ 50 49 { ATTR_KERNOVAL, "KERNOVAL" }, \ 51 - { ATTR_INCOMPLETE, "INCOMPLETE" } 50 + { ATTR_INCOMPLETE, "INCOMPLETE" }, \ 51 + { ATTR_ALLOC, "ALLOC" } 52 52 53 53 /* 54 54 * The maximum size (into the kernel or returned from the kernel) of an ··· 145 143 int xfs_inode_hasattr(struct xfs_inode *ip); 146 144 int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); 147 145 int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, 148 - unsigned char *value, int *valuelenp, int flags); 146 + unsigned char **value, int *valuelenp, int flags); 149 147 int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, 150 148 unsigned char *value, int valuelen, int flags); 151 149 int xfs_attr_set_args(struct xfs_da_args *args);
+76 -52
fs/xfs/libxfs/xfs_attr_leaf.c
··· 393 393 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); 394 394 } 395 395 396 + static int 397 + xfs_attr_copy_value( 398 + struct xfs_da_args *args, 399 + unsigned char *value, 400 + int valuelen) 401 + { 402 + /* 403 + * No copy if all we have to do is get the length 404 + */ 405 + if (args->flags & ATTR_KERNOVAL) { 406 + args->valuelen = valuelen; 407 + return 0; 408 + } 409 + 410 + /* 411 + * No copy if the length of the existing buffer is too small 412 + */ 413 + if (args->valuelen < valuelen) { 414 + args->valuelen = valuelen; 415 + return -ERANGE; 416 + } 417 + 418 + if (args->op_flags & XFS_DA_OP_ALLOCVAL) { 419 + args->value = kmem_alloc_large(valuelen, 0); 420 + if (!args->value) 421 + return -ENOMEM; 422 + } 423 + args->valuelen = valuelen; 424 + 425 + /* remote block xattr requires IO for copy-in */ 426 + if (args->rmtblkno) 427 + return xfs_attr_rmtval_get(args); 428 + 429 + /* 430 + * This is to prevent a GCC warning because the remote xattr case 431 + * doesn't have a value to pass in. In that case, we never reach here, 432 + * but GCC can't work that out and so throws a "passing NULL to 433 + * memcpy" warning. 434 + */ 435 + if (!value) 436 + return -EINVAL; 437 + memcpy(args->value, value, valuelen); 438 + return 0; 439 + } 396 440 397 441 /*======================================================================== 398 442 * External routines when attribute fork size < XFS_LITINO(mp). ··· 764 720 } 765 721 766 722 /* 767 - * Look up a name in a shortform attribute list structure. 723 + * Retreive the attribute value and length. 724 + * 725 + * If ATTR_KERNOVAL is specified, only the length needs to be returned. 726 + * Unlike a lookup, we only return an error if the attribute does not 727 + * exist or we can't retrieve the value. 
768 728 */ 769 - /*ARGSUSED*/ 770 729 int 771 - xfs_attr_shortform_getvalue(xfs_da_args_t *args) 730 + xfs_attr_shortform_getvalue( 731 + struct xfs_da_args *args) 772 732 { 773 - xfs_attr_shortform_t *sf; 774 - xfs_attr_sf_entry_t *sfe; 775 - int i; 733 + struct xfs_attr_shortform *sf; 734 + struct xfs_attr_sf_entry *sfe; 735 + int i; 776 736 777 737 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); 778 738 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; ··· 789 741 continue; 790 742 if (!xfs_attr_namesp_match(args->flags, sfe->flags)) 791 743 continue; 792 - if (args->flags & ATTR_KERNOVAL) { 793 - args->valuelen = sfe->valuelen; 794 - return -EEXIST; 795 - } 796 - if (args->valuelen < sfe->valuelen) { 797 - args->valuelen = sfe->valuelen; 798 - return -ERANGE; 799 - } 800 - args->valuelen = sfe->valuelen; 801 - memcpy(args->value, &sfe->nameval[args->namelen], 802 - args->valuelen); 803 - return -EEXIST; 744 + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], 745 + sfe->valuelen); 804 746 } 805 747 return -ENOATTR; 806 748 } ··· 820 782 ifp = dp->i_afp; 821 783 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 822 784 size = be16_to_cpu(sf->hdr.totsize); 823 - tmpbuffer = kmem_alloc(size, KM_SLEEP); 785 + tmpbuffer = kmem_alloc(size, 0); 824 786 ASSERT(tmpbuffer != NULL); 825 787 memcpy(tmpbuffer, ifp->if_u1.if_data, size); 826 788 sf = (xfs_attr_shortform_t *)tmpbuffer; ··· 1023 985 1024 986 trace_xfs_attr_leaf_to_sf(args); 1025 987 1026 - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 988 + tmpbuffer = kmem_alloc(args->geo->blksize, 0); 1027 989 if (!tmpbuffer) 1028 990 return -ENOMEM; 1029 991 ··· 1486 1448 1487 1449 trace_xfs_attr_leaf_compact(args); 1488 1450 1489 - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 1451 + tmpbuffer = kmem_alloc(args->geo->blksize, 0); 1490 1452 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 1491 1453 memset(bp->b_addr, 0, args->geo->blksize); 1492 1454 leaf_src = 
(xfs_attr_leafblock_t *)tmpbuffer; ··· 2205 2167 struct xfs_attr_leafblock *tmp_leaf; 2206 2168 struct xfs_attr3_icleaf_hdr tmphdr; 2207 2169 2208 - tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); 2170 + tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); 2209 2171 2210 2172 /* 2211 2173 * Copy the header into the temp leaf so that all the stuff ··· 2388 2350 /* 2389 2351 * Get the value associated with an attribute name from a leaf attribute 2390 2352 * list structure. 2353 + * 2354 + * If ATTR_KERNOVAL is specified, only the length needs to be returned. 2355 + * Unlike a lookup, we only return an error if the attribute does not 2356 + * exist or we can't retrieve the value. 2391 2357 */ 2392 2358 int 2393 2359 xfs_attr3_leaf_getvalue( ··· 2403 2361 struct xfs_attr_leaf_entry *entry; 2404 2362 struct xfs_attr_leaf_name_local *name_loc; 2405 2363 struct xfs_attr_leaf_name_remote *name_rmt; 2406 - int valuelen; 2407 2364 2408 2365 leaf = bp->b_addr; 2409 2366 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ··· 2414 2373 name_loc = xfs_attr3_leaf_name_local(leaf, args->index); 2415 2374 ASSERT(name_loc->namelen == args->namelen); 2416 2375 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); 2417 - valuelen = be16_to_cpu(name_loc->valuelen); 2418 - if (args->flags & ATTR_KERNOVAL) { 2419 - args->valuelen = valuelen; 2420 - return 0; 2421 - } 2422 - if (args->valuelen < valuelen) { 2423 - args->valuelen = valuelen; 2424 - return -ERANGE; 2425 - } 2426 - args->valuelen = valuelen; 2427 - memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2428 - } else { 2429 - name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); 2430 - ASSERT(name_rmt->namelen == args->namelen); 2431 - ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); 2432 - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); 2433 - args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2434 - args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, 
2435 - args->rmtvaluelen); 2436 - if (args->flags & ATTR_KERNOVAL) { 2437 - args->valuelen = args->rmtvaluelen; 2438 - return 0; 2439 - } 2440 - if (args->valuelen < args->rmtvaluelen) { 2441 - args->valuelen = args->rmtvaluelen; 2442 - return -ERANGE; 2443 - } 2444 - args->valuelen = args->rmtvaluelen; 2376 + return xfs_attr_copy_value(args, 2377 + &name_loc->nameval[args->namelen], 2378 + be16_to_cpu(name_loc->valuelen)); 2445 2379 } 2446 - return 0; 2380 + 2381 + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); 2382 + ASSERT(name_rmt->namelen == args->namelen); 2383 + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); 2384 + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); 2385 + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2386 + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, 2387 + args->rmtvaluelen); 2388 + return xfs_attr_copy_value(args, NULL, args->rmtvaluelen); 2447 2389 } 2448 2390 2449 2391 /*========================================================================
+2
fs/xfs/libxfs/xfs_attr_remote.c
··· 358 358 /* 359 359 * Read the value associated with an attribute from the out-of-line buffer 360 360 * that we stored it in. 361 + * 362 + * Returns 0 on successful retrieval, otherwise an error. 361 363 */ 362 364 int 363 365 xfs_attr_rmtval_get(
+30 -55
fs/xfs/libxfs/xfs_bmap.c
··· 553 553 #endif 554 554 ASSERT(xfs_bmap_free_item_zone != NULL); 555 555 556 - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 556 + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); 557 557 new->xefi_startblock = bno; 558 558 new->xefi_blockcount = (xfs_extlen_t)len; 559 559 if (oinfo) ··· 1099 1099 if (error) 1100 1100 goto trans_cancel; 1101 1101 ASSERT(ip->i_afp == NULL); 1102 - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 1102 + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); 1103 1103 ip->i_afp->if_flags = XFS_IFEXTENTS; 1104 1104 logflags = 0; 1105 1105 switch (ip->i_d.di_format) { ··· 1985 1985 } 1986 1986 1987 1987 /* add reverse mapping unless caller opted out */ 1988 - if (!(bma->flags & XFS_BMAPI_NORMAP)) { 1989 - error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); 1990 - if (error) 1991 - goto done; 1992 - } 1988 + if (!(bma->flags & XFS_BMAPI_NORMAP)) 1989 + xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); 1993 1990 1994 1991 /* convert to a btree if necessary */ 1995 1992 if (xfs_bmap_needs_btree(bma->ip, whichfork)) { ··· 2468 2471 } 2469 2472 2470 2473 /* update reverse mappings */ 2471 - error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); 2472 - if (error) 2473 - goto done; 2474 + xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); 2474 2475 2475 2476 /* convert to a btree if necessary */ 2476 2477 if (xfs_bmap_needs_btree(ip, whichfork)) { ··· 2827 2832 } 2828 2833 2829 2834 /* add reverse mapping unless caller opted out */ 2830 - if (!(flags & XFS_BMAPI_NORMAP)) { 2831 - error = xfs_rmap_map_extent(tp, ip, whichfork, new); 2832 - if (error) 2833 - goto done; 2834 - } 2835 + if (!(flags & XFS_BMAPI_NORMAP)) 2836 + xfs_rmap_map_extent(tp, ip, whichfork, new); 2835 2837 2836 2838 /* convert to a btree if necessary */ 2837 2839 if (xfs_bmap_needs_btree(ip, whichfork)) { ··· 4042 4050 */ 4043 4051 if (!(bma->flags & XFS_BMAPI_METADATA)) { 4044 4052 bma->datatype = XFS_ALLOC_NOBUSY; 4045 - if 
(whichfork == XFS_DATA_FORK) { 4046 - if (bma->offset == 0) 4047 - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; 4048 - else 4049 - bma->datatype |= XFS_ALLOC_USERDATA; 4050 - } 4053 + if (whichfork == XFS_DATA_FORK && bma->offset == 0) 4054 + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; 4051 4055 if (bma->flags & XFS_BMAPI_ZERO) 4052 4056 bma->datatype |= XFS_ALLOC_USERDATA_ZERO; 4053 4057 } ··· 4389 4401 * If this is a CoW allocation, record the data in 4390 4402 * the refcount btree for orphan recovery. 4391 4403 */ 4392 - if (whichfork == XFS_COW_FORK) { 4393 - error = xfs_refcount_alloc_cow_extent(tp, 4394 - bma.blkno, bma.length); 4395 - if (error) 4396 - goto error0; 4397 - } 4404 + if (whichfork == XFS_COW_FORK) 4405 + xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4406 + bma.length); 4398 4407 } 4399 4408 4400 4409 /* Deal with the allocated space we found. */ ··· 4515 4530 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) 4516 4531 goto out_finish; 4517 4532 error = -EFSCORRUPTED; 4518 - if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) 4533 + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) 4519 4534 goto out_finish; 4520 4535 4521 4536 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); ··· 4525 4540 *imap = bma.got; 4526 4541 *seq = READ_ONCE(ifp->if_seq); 4527 4542 4528 - if (whichfork == XFS_COW_FORK) { 4529 - error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4530 - bma.length); 4531 - if (error) 4532 - goto out_finish; 4533 - } 4543 + if (whichfork == XFS_COW_FORK) 4544 + xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); 4534 4545 4535 4546 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, 4536 4547 whichfork); ··· 5130 5149 } 5131 5150 5132 5151 /* remove reverse mapping */ 5133 - error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); 5134 - if (error) 5135 - goto done; 5152 + xfs_rmap_unmap_extent(tp, ip, whichfork, del); 5136 5153 5137 5154 /* 5138 5155 * If we need to, 
add to list of extents to delete. 5139 5156 */ 5140 5157 if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { 5141 5158 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { 5142 - error = xfs_refcount_decrease_extent(tp, del); 5143 - if (error) 5144 - goto done; 5159 + xfs_refcount_decrease_extent(tp, del); 5145 5160 } else { 5146 5161 __xfs_bmap_add_free(tp, del->br_startblock, 5147 5162 del->br_blockcount, NULL, ··· 5628 5651 &new); 5629 5652 5630 5653 /* update reverse mapping. rmap functions merge the rmaps for us */ 5631 - error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); 5632 - if (error) 5633 - return error; 5654 + xfs_rmap_unmap_extent(tp, ip, whichfork, got); 5634 5655 memcpy(&new, got, sizeof(new)); 5635 5656 new.br_startoff = left->br_startoff + left->br_blockcount; 5636 - return xfs_rmap_map_extent(tp, ip, whichfork, &new); 5657 + xfs_rmap_map_extent(tp, ip, whichfork, &new); 5658 + return 0; 5637 5659 } 5638 5660 5639 5661 static int ··· 5671 5695 got); 5672 5696 5673 5697 /* update reverse mapping */ 5674 - error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); 5675 - if (error) 5676 - return error; 5677 - return xfs_rmap_map_extent(tp, ip, whichfork, got); 5698 + xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); 5699 + xfs_rmap_map_extent(tp, ip, whichfork, got); 5700 + return 0; 5678 5701 } 5679 5702 5680 5703 int ··· 6069 6094 bmap->br_blockcount, 6070 6095 bmap->br_state); 6071 6096 6072 - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); 6097 + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); 6073 6098 INIT_LIST_HEAD(&bi->bi_list); 6074 6099 bi->bi_type = type; 6075 6100 bi->bi_owner = ip; ··· 6081 6106 } 6082 6107 6083 6108 /* Map an extent into a file. 
*/ 6084 - int 6109 + void 6085 6110 xfs_bmap_map_extent( 6086 6111 struct xfs_trans *tp, 6087 6112 struct xfs_inode *ip, 6088 6113 struct xfs_bmbt_irec *PREV) 6089 6114 { 6090 6115 if (!xfs_bmap_is_update_needed(PREV)) 6091 - return 0; 6116 + return; 6092 6117 6093 - return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); 6118 + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); 6094 6119 } 6095 6120 6096 6121 /* Unmap an extent out of a file. */ 6097 - int 6122 + void 6098 6123 xfs_bmap_unmap_extent( 6099 6124 struct xfs_trans *tp, 6100 6125 struct xfs_inode *ip, 6101 6126 struct xfs_bmbt_irec *PREV) 6102 6127 { 6103 6128 if (!xfs_bmap_is_update_needed(PREV)) 6104 - return 0; 6129 + return; 6105 6130 6106 - return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); 6131 + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); 6107 6132 } 6108 6133 6109 6134 /*
+9 -2
fs/xfs/libxfs/xfs_bmap.h
··· 171 171 !isnullstartblock(irec->br_startblock); 172 172 } 173 173 174 + /* 175 + * Check the mapping for obviously garbage allocations that could trash the 176 + * filesystem immediately. 177 + */ 178 + #define xfs_valid_startblock(ip, startblock) \ 179 + ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) 180 + 174 181 void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, 175 182 xfs_filblks_t len); 176 183 int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); ··· 261 254 enum xfs_bmap_intent_type type, int whichfork, 262 255 xfs_fileoff_t startoff, xfs_fsblock_t startblock, 263 256 xfs_filblks_t *blockcount, xfs_exntst_t state); 264 - int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 257 + void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 265 258 struct xfs_bmbt_irec *imap); 266 - int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 259 + void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 267 260 struct xfs_bmbt_irec *imap); 268 261 269 262 static inline int xfs_bmap_fork_to_state(int whichfork)
+14 -2
fs/xfs/libxfs/xfs_bmap_btree.c
··· 400 400 union xfs_btree_key *k1, 401 401 union xfs_btree_key *k2) 402 402 { 403 - return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - 404 - be64_to_cpu(k2->bmbt.br_startoff); 403 + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); 404 + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); 405 + 406 + /* 407 + * Note: This routine previously cast a and b to int64 and subtracted 408 + * them to generate a result. This led to problems if b was the 409 + * "maximum" key value (all ones) being signed incorrectly, hence this 410 + * somewhat less efficient version. 411 + */ 412 + if (a > b) 413 + return 1; 414 + if (b > a) 415 + return -1; 416 + return 0; 405 417 } 406 418 407 419 static xfs_failaddr_t
+5 -9
fs/xfs/libxfs/xfs_btree.c
··· 4466 4466 * btree block 4467 4467 * 4468 4468 * @bp: buffer containing the btree block 4469 - * @max_recs: pointer to the m_*_mxr max records field in the xfs mount 4470 - * @pag_max_level: pointer to the per-ag max level field 4471 4469 */ 4472 4470 xfs_failaddr_t 4473 4471 xfs_btree_sblock_v5hdr_verify( ··· 4598 4600 4599 4601 /* Callback */ 4600 4602 error = fn(cur, recp, priv); 4601 - if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) 4603 + if (error) 4602 4604 break; 4603 4605 4604 4606 advloop: ··· 4700 4702 */ 4701 4703 if (ldiff >= 0 && hdiff >= 0) { 4702 4704 error = fn(cur, recp, priv); 4703 - if (error < 0 || 4704 - error == XFS_BTREE_QUERY_RANGE_ABORT) 4705 + if (error) 4705 4706 break; 4706 4707 } else if (hdiff < 0) { 4707 4708 /* Record is larger than high key; pop. */ ··· 4771 4774 * Query a btree for all records overlapping a given interval of keys. The 4772 4775 * supplied function will be called with each record found; return one of the 4773 4776 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4774 - * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a 4775 - * negative error code. 4777 + * code. This function returns -ECANCELED, zero, or a negative error code. 4776 4778 */ 4777 4779 int 4778 4780 xfs_btree_query_range( ··· 4887 4891 union xfs_btree_rec *rec, 4888 4892 void *priv) 4889 4893 { 4890 - return XFS_BTREE_QUERY_RANGE_ABORT; 4894 + return -ECANCELED; 4891 4895 } 4892 4896 4893 4897 /* Is there a record covering a given range of keys? */ ··· 4902 4906 4903 4907 error = xfs_btree_query_range(cur, low, high, 4904 4908 &xfs_btree_has_record_helper, NULL); 4905 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 4909 + if (error == -ECANCELED) { 4906 4910 *exists = true; 4907 4911 return 0; 4908 4912 }
+7 -3
fs/xfs/libxfs/xfs_btree.h
··· 464 464 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); 465 465 unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); 466 466 467 - /* return codes */ 468 - #define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ 469 - #define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ 467 + /* 468 + * Return codes for the query range iterator function are 0 to continue 469 + * iterating, and non-zero to stop iterating. Any non-zero value will be 470 + * passed up to the _query_range caller. The special value -ECANCELED can be 471 + * used to stop iteration, because _query_range never generates that error 472 + * code on its own. 473 + */ 470 474 typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, 471 475 union xfs_btree_rec *rec, void *priv); 472 476
+3 -3
fs/xfs/libxfs/xfs_da_btree.c
··· 2098 2098 * If we didn't get it and the block might work if fragmented, 2099 2099 * try without the CONTIG flag. Loop until we get it all. 2100 2100 */ 2101 - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); 2101 + mapp = kmem_alloc(sizeof(*mapp) * count, 0); 2102 2102 for (b = *bno, mapi = 0; b < *bno + count; ) { 2103 2103 nmap = min(XFS_BMAP_MAX_NMAP, count); 2104 2104 c = (int)(*bno + count - b); ··· 2480 2480 2481 2481 if (nirecs > 1) { 2482 2482 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), 2483 - KM_SLEEP | KM_NOFS); 2483 + KM_NOFS); 2484 2484 if (!map) 2485 2485 return -ENOMEM; 2486 2486 *mapp = map; ··· 2539 2539 */ 2540 2540 if (nfsb != 1) 2541 2541 irecs = kmem_zalloc(sizeof(irec) * nfsb, 2542 - KM_SLEEP | KM_NOFS); 2542 + KM_NOFS); 2543 2543 2544 2544 nirecs = nfsb; 2545 2545 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+3 -1
fs/xfs/libxfs/xfs_da_btree.h
··· 81 81 #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ 82 82 #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ 83 83 #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ 84 + #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ 84 85 85 86 #define XFS_DA_OP_FLAGS \ 86 87 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ 87 88 { XFS_DA_OP_RENAME, "RENAME" }, \ 88 89 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ 89 90 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ 90 - { XFS_DA_OP_CILOOKUP, "CILOOKUP" } 91 + { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ 92 + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } 91 93 92 94 /* 93 95 * Storage for holding state during Btree searches and split/join ops.
+1 -1
fs/xfs/libxfs/xfs_defer.c
··· 517 517 } 518 518 if (!dfp) { 519 519 dfp = kmem_alloc(sizeof(struct xfs_defer_pending), 520 - KM_SLEEP | KM_NOFS); 520 + KM_NOFS); 521 521 dfp->dfp_type = type; 522 522 dfp->dfp_intent = NULL; 523 523 dfp->dfp_done = NULL;
+7 -7
fs/xfs/libxfs/xfs_dir2.c
··· 110 110 111 111 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; 112 112 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 113 - KM_SLEEP | KM_MAYFAIL); 113 + KM_MAYFAIL); 114 114 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 115 - KM_SLEEP | KM_MAYFAIL); 115 + KM_MAYFAIL); 116 116 if (!mp->m_dir_geo || !mp->m_attr_geo) { 117 117 kmem_free(mp->m_dir_geo); 118 118 kmem_free(mp->m_attr_geo); ··· 217 217 if (error) 218 218 return error; 219 219 220 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 220 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 221 221 if (!args) 222 222 return -ENOMEM; 223 223 ··· 254 254 XFS_STATS_INC(dp->i_mount, xs_dir_create); 255 255 } 256 256 257 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 257 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 258 258 if (!args) 259 259 return -ENOMEM; 260 260 ··· 353 353 * lockdep Doing this avoids having to add a bunch of lockdep class 354 354 * annotations into the reclaim path for the ilock. 355 355 */ 356 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 356 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 357 357 args->geo = dp->i_mount->m_dir_geo; 358 358 args->name = name->name; 359 359 args->namelen = name->len; ··· 422 422 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 423 423 XFS_STATS_INC(dp->i_mount, xs_dir_remove); 424 424 425 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 425 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 426 426 if (!args) 427 427 return -ENOMEM; 428 428 ··· 483 483 if (rval) 484 484 return rval; 485 485 486 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 486 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 487 487 if (!args) 488 488 return -ENOMEM; 489 489
+1 -1
fs/xfs/libxfs/xfs_dir2_block.c
··· 1092 1092 * Copy the directory into a temporary buffer. 1093 1093 * Then pitch the incore inode data so we can make extents. 1094 1094 */ 1095 - sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); 1095 + sfp = kmem_alloc(ifp->if_bytes, 0); 1096 1096 memcpy(sfp, oldsfp, ifp->if_bytes); 1097 1097 1098 1098 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+347 -379
fs/xfs/libxfs/xfs_dir2_node.c
··· 32 32 static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, 33 33 int index, xfs_da_state_blk_t *dblk, 34 34 int *rval); 35 - static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 36 - xfs_da_state_blk_t *fblk); 37 35 38 36 /* 39 37 * Check internal consistency of a leafn block. ··· 1609 1611 } 1610 1612 1611 1613 /* 1614 + * Add a new data block to the directory at the free space index that the caller 1615 + * has specified. 1616 + */ 1617 + static int 1618 + xfs_dir2_node_add_datablk( 1619 + struct xfs_da_args *args, 1620 + struct xfs_da_state_blk *fblk, 1621 + xfs_dir2_db_t *dbno, 1622 + struct xfs_buf **dbpp, 1623 + struct xfs_buf **fbpp, 1624 + int *findex) 1625 + { 1626 + struct xfs_inode *dp = args->dp; 1627 + struct xfs_trans *tp = args->trans; 1628 + struct xfs_mount *mp = dp->i_mount; 1629 + struct xfs_dir3_icfree_hdr freehdr; 1630 + struct xfs_dir2_data_free *bf; 1631 + struct xfs_dir2_data_hdr *hdr; 1632 + struct xfs_dir2_free *free = NULL; 1633 + xfs_dir2_db_t fbno; 1634 + struct xfs_buf *fbp; 1635 + struct xfs_buf *dbp; 1636 + __be16 *bests = NULL; 1637 + int error; 1638 + 1639 + /* Not allowed to allocate, return failure. */ 1640 + if (args->total == 0) 1641 + return -ENOSPC; 1642 + 1643 + /* Allocate and initialize the new data block. */ 1644 + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno); 1645 + if (error) 1646 + return error; 1647 + error = xfs_dir3_data_init(args, *dbno, &dbp); 1648 + if (error) 1649 + return error; 1650 + 1651 + /* 1652 + * Get the freespace block corresponding to the data block 1653 + * that was just allocated. 1654 + */ 1655 + fbno = dp->d_ops->db_to_fdb(args->geo, *dbno); 1656 + error = xfs_dir2_free_try_read(tp, dp, 1657 + xfs_dir2_db_to_da(args->geo, fbno), &fbp); 1658 + if (error) 1659 + return error; 1660 + 1661 + /* 1662 + * If there wasn't a freespace block, the read will 1663 + * return a NULL fbp. Allocate and initialize a new one. 
1664 + */ 1665 + if (!fbp) { 1666 + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno); 1667 + if (error) 1668 + return error; 1669 + 1670 + if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) { 1671 + xfs_alert(mp, 1672 + "%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld", 1673 + __func__, (unsigned long long)dp->i_ino, 1674 + (long long)dp->d_ops->db_to_fdb(args->geo, *dbno), 1675 + (long long)*dbno, (long long)fbno); 1676 + if (fblk) { 1677 + xfs_alert(mp, 1678 + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", 1679 + fblk, (unsigned long long)fblk->blkno, 1680 + fblk->index, fblk->magic); 1681 + } else { 1682 + xfs_alert(mp, " ... fblk is NULL"); 1683 + } 1684 + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 1685 + return -EFSCORRUPTED; 1686 + } 1687 + 1688 + /* Get a buffer for the new block. */ 1689 + error = xfs_dir3_free_get_buf(args, fbno, &fbp); 1690 + if (error) 1691 + return error; 1692 + free = fbp->b_addr; 1693 + bests = dp->d_ops->free_bests_p(free); 1694 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1695 + 1696 + /* Remember the first slot as our empty slot. */ 1697 + freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, 1698 + XFS_DIR2_FREE_OFFSET)) * 1699 + dp->d_ops->free_max_bests(args->geo); 1700 + } else { 1701 + free = fbp->b_addr; 1702 + bests = dp->d_ops->free_bests_p(free); 1703 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1704 + } 1705 + 1706 + /* Set the freespace block index from the data block number. */ 1707 + *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno); 1708 + 1709 + /* Extend the freespace table if the new data block is off the end. */ 1710 + if (*findex >= freehdr.nvalid) { 1711 + ASSERT(*findex < dp->d_ops->free_max_bests(args->geo)); 1712 + freehdr.nvalid = *findex + 1; 1713 + bests[*findex] = cpu_to_be16(NULLDATAOFF); 1714 + } 1715 + 1716 + /* 1717 + * If this entry was for an empty data block (this should always be 1718 + * true) then update the header. 
1719 + */ 1720 + if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) { 1721 + freehdr.nused++; 1722 + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); 1723 + xfs_dir2_free_log_header(args, fbp); 1724 + } 1725 + 1726 + /* Update the freespace value for the new block in the table. */ 1727 + hdr = dbp->b_addr; 1728 + bf = dp->d_ops->data_bestfree_p(hdr); 1729 + bests[*findex] = bf[0].length; 1730 + 1731 + *dbpp = dbp; 1732 + *fbpp = fbp; 1733 + return 0; 1734 + } 1735 + 1736 + static int 1737 + xfs_dir2_node_find_freeblk( 1738 + struct xfs_da_args *args, 1739 + struct xfs_da_state_blk *fblk, 1740 + xfs_dir2_db_t *dbnop, 1741 + struct xfs_buf **fbpp, 1742 + int *findexp, 1743 + int length) 1744 + { 1745 + struct xfs_dir3_icfree_hdr freehdr; 1746 + struct xfs_dir2_free *free = NULL; 1747 + struct xfs_inode *dp = args->dp; 1748 + struct xfs_trans *tp = args->trans; 1749 + struct xfs_buf *fbp = NULL; 1750 + xfs_dir2_db_t firstfbno; 1751 + xfs_dir2_db_t lastfbno; 1752 + xfs_dir2_db_t ifbno = -1; 1753 + xfs_dir2_db_t dbno = -1; 1754 + xfs_dir2_db_t fbno; 1755 + xfs_fileoff_t fo; 1756 + __be16 *bests = NULL; 1757 + int findex = 0; 1758 + int error; 1759 + 1760 + /* 1761 + * If we came in with a freespace block that means that lookup 1762 + * found an entry with our hash value. This is the freespace 1763 + * block for that data entry. 1764 + */ 1765 + if (fblk) { 1766 + fbp = fblk->bp; 1767 + free = fbp->b_addr; 1768 + findex = fblk->index; 1769 + if (findex >= 0) { 1770 + /* caller already found the freespace for us. */ 1771 + bests = dp->d_ops->free_bests_p(free); 1772 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1773 + 1774 + ASSERT(findex < freehdr.nvalid); 1775 + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 1776 + ASSERT(be16_to_cpu(bests[findex]) >= length); 1777 + dbno = freehdr.firstdb + findex; 1778 + goto found_block; 1779 + } 1780 + 1781 + /* 1782 + * The data block looked at didn't have enough room. 
1783 + * We'll start at the beginning of the freespace entries. 1784 + */ 1785 + ifbno = fblk->blkno; 1786 + xfs_trans_brelse(tp, fbp); 1787 + fbp = NULL; 1788 + fblk->bp = NULL; 1789 + } 1790 + 1791 + /* 1792 + * If we don't have a data block yet, we're going to scan the freespace 1793 + * data for a data block with enough free space in it. 1794 + */ 1795 + error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK); 1796 + if (error) 1797 + return error; 1798 + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); 1799 + firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET); 1800 + 1801 + for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) { 1802 + /* If it's ifbno we already looked at it. */ 1803 + if (fbno == ifbno) 1804 + continue; 1805 + 1806 + /* 1807 + * Read the block. There can be holes in the freespace blocks, 1808 + * so this might not succeed. This should be really rare, so 1809 + * there's no reason to avoid it. 1810 + */ 1811 + error = xfs_dir2_free_try_read(tp, dp, 1812 + xfs_dir2_db_to_da(args->geo, fbno), 1813 + &fbp); 1814 + if (error) 1815 + return error; 1816 + if (!fbp) 1817 + continue; 1818 + 1819 + free = fbp->b_addr; 1820 + bests = dp->d_ops->free_bests_p(free); 1821 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1822 + 1823 + /* Scan the free entry array for a large enough free space. */ 1824 + for (findex = freehdr.nvalid - 1; findex >= 0; findex--) { 1825 + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && 1826 + be16_to_cpu(bests[findex]) >= length) { 1827 + dbno = freehdr.firstdb + findex; 1828 + goto found_block; 1829 + } 1830 + } 1831 + 1832 + /* Didn't find free space, go on to next free block */ 1833 + xfs_trans_brelse(tp, fbp); 1834 + } 1835 + 1836 + found_block: 1837 + *dbnop = dbno; 1838 + *fbpp = fbp; 1839 + *findexp = findex; 1840 + return 0; 1841 + } 1842 + 1843 + 1844 + /* 1845 + * Add the data entry for a node-format directory name addition. 1846 + * The leaf entry is added in xfs_dir2_leafn_add. 
1847 + * We may enter with a freespace block that the lookup found. 1848 + */ 1849 + static int 1850 + xfs_dir2_node_addname_int( 1851 + struct xfs_da_args *args, /* operation arguments */ 1852 + struct xfs_da_state_blk *fblk) /* optional freespace block */ 1853 + { 1854 + struct xfs_dir2_data_unused *dup; /* data unused entry pointer */ 1855 + struct xfs_dir2_data_entry *dep; /* data entry pointer */ 1856 + struct xfs_dir2_data_hdr *hdr; /* data block header */ 1857 + struct xfs_dir2_data_free *bf; 1858 + struct xfs_dir2_free *free = NULL; /* freespace block structure */ 1859 + struct xfs_trans *tp = args->trans; 1860 + struct xfs_inode *dp = args->dp; 1861 + struct xfs_buf *dbp; /* data block buffer */ 1862 + struct xfs_buf *fbp; /* freespace buffer */ 1863 + xfs_dir2_data_aoff_t aoff; 1864 + xfs_dir2_db_t dbno; /* data block number */ 1865 + int error; /* error return value */ 1866 + int findex; /* freespace entry index */ 1867 + int length; /* length of the new entry */ 1868 + int logfree = 0; /* need to log free entry */ 1869 + int needlog = 0; /* need to log data header */ 1870 + int needscan = 0; /* need to rescan data frees */ 1871 + __be16 *tagp; /* data entry tag pointer */ 1872 + __be16 *bests; 1873 + 1874 + length = dp->d_ops->data_entsize(args->namelen); 1875 + error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex, 1876 + length); 1877 + if (error) 1878 + return error; 1879 + 1880 + /* 1881 + * Now we know if we must allocate blocks, so if we are checking whether 1882 + * we can insert without allocation then we can return now. 1883 + */ 1884 + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { 1885 + if (dbno == -1) 1886 + return -ENOSPC; 1887 + return 0; 1888 + } 1889 + 1890 + /* 1891 + * If we don't have a data block, we need to allocate one and make 1892 + * the freespace entries refer to it. 
1893 + */ 1894 + if (dbno == -1) { 1895 + /* we're going to have to log the free block index later */ 1896 + logfree = 1; 1897 + error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp, 1898 + &findex); 1899 + } else { 1900 + /* Read the data block in. */ 1901 + error = xfs_dir3_data_read(tp, dp, 1902 + xfs_dir2_db_to_da(args->geo, dbno), 1903 + -1, &dbp); 1904 + } 1905 + if (error) 1906 + return error; 1907 + 1908 + /* setup for data block up now */ 1909 + hdr = dbp->b_addr; 1910 + bf = dp->d_ops->data_bestfree_p(hdr); 1911 + ASSERT(be16_to_cpu(bf[0].length) >= length); 1912 + 1913 + /* Point to the existing unused space. */ 1914 + dup = (xfs_dir2_data_unused_t *) 1915 + ((char *)hdr + be16_to_cpu(bf[0].offset)); 1916 + 1917 + /* Mark the first part of the unused space, inuse for us. */ 1918 + aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 1919 + error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 1920 + &needlog, &needscan); 1921 + if (error) { 1922 + xfs_trans_brelse(tp, dbp); 1923 + return error; 1924 + } 1925 + 1926 + /* Fill in the new entry and log it. */ 1927 + dep = (xfs_dir2_data_entry_t *)dup; 1928 + dep->inumber = cpu_to_be64(args->inumber); 1929 + dep->namelen = args->namelen; 1930 + memcpy(dep->name, args->name, dep->namelen); 1931 + dp->d_ops->data_put_ftype(dep, args->filetype); 1932 + tagp = dp->d_ops->data_entry_tag_p(dep); 1933 + *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1934 + xfs_dir2_data_log_entry(args, dbp, dep); 1935 + 1936 + /* Rescan the freespace and log the data block if needed. */ 1937 + if (needscan) 1938 + xfs_dir2_data_freescan(dp, hdr, &needlog); 1939 + if (needlog) 1940 + xfs_dir2_data_log_header(args, dbp); 1941 + 1942 + /* If the freespace block entry is now wrong, update it. 
*/ 1943 + free = fbp->b_addr; 1944 + bests = dp->d_ops->free_bests_p(free); 1945 + if (bests[findex] != bf[0].length) { 1946 + bests[findex] = bf[0].length; 1947 + logfree = 1; 1948 + } 1949 + 1950 + /* Log the freespace entry if needed. */ 1951 + if (logfree) 1952 + xfs_dir2_free_log_bests(args, fbp, findex, findex); 1953 + 1954 + /* Return the data block and offset in args. */ 1955 + args->blkno = (xfs_dablk_t)dbno; 1956 + args->index = be16_to_cpu(*tagp); 1957 + return 0; 1958 + } 1959 + 1960 + /* 1612 1961 * Top-level node form directory addname routine. 1613 1962 */ 1614 1963 int /* error */ ··· 2022 1677 done: 2023 1678 xfs_da_state_free(state); 2024 1679 return rval; 2025 - } 2026 - 2027 - /* 2028 - * Add the data entry for a node-format directory name addition. 2029 - * The leaf entry is added in xfs_dir2_leafn_add. 2030 - * We may enter with a freespace block that the lookup found. 2031 - */ 2032 - static int /* error */ 2033 - xfs_dir2_node_addname_int( 2034 - xfs_da_args_t *args, /* operation arguments */ 2035 - xfs_da_state_blk_t *fblk) /* optional freespace block */ 2036 - { 2037 - xfs_dir2_data_hdr_t *hdr; /* data block header */ 2038 - xfs_dir2_db_t dbno; /* data block number */ 2039 - struct xfs_buf *dbp; /* data block buffer */ 2040 - xfs_dir2_data_entry_t *dep; /* data entry pointer */ 2041 - xfs_inode_t *dp; /* incore directory inode */ 2042 - xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ 2043 - int error; /* error return value */ 2044 - xfs_dir2_db_t fbno; /* freespace block number */ 2045 - struct xfs_buf *fbp; /* freespace buffer */ 2046 - int findex; /* freespace entry index */ 2047 - xfs_dir2_free_t *free=NULL; /* freespace block structure */ 2048 - xfs_dir2_db_t ifbno; /* initial freespace block no */ 2049 - xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ 2050 - int length; /* length of the new entry */ 2051 - int logfree; /* need to log free entry */ 2052 - xfs_mount_t *mp; /* filesystem mount point */ 2053 - int 
needlog; /* need to log data header */ 2054 - int needscan; /* need to rescan data frees */ 2055 - __be16 *tagp; /* data entry tag pointer */ 2056 - xfs_trans_t *tp; /* transaction pointer */ 2057 - __be16 *bests; 2058 - struct xfs_dir3_icfree_hdr freehdr; 2059 - struct xfs_dir2_data_free *bf; 2060 - xfs_dir2_data_aoff_t aoff; 2061 - 2062 - dp = args->dp; 2063 - mp = dp->i_mount; 2064 - tp = args->trans; 2065 - length = dp->d_ops->data_entsize(args->namelen); 2066 - /* 2067 - * If we came in with a freespace block that means that lookup 2068 - * found an entry with our hash value. This is the freespace 2069 - * block for that data entry. 2070 - */ 2071 - if (fblk) { 2072 - fbp = fblk->bp; 2073 - /* 2074 - * Remember initial freespace block number. 2075 - */ 2076 - ifbno = fblk->blkno; 2077 - free = fbp->b_addr; 2078 - findex = fblk->index; 2079 - bests = dp->d_ops->free_bests_p(free); 2080 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2081 - 2082 - /* 2083 - * This means the free entry showed that the data block had 2084 - * space for our entry, so we remembered it. 2085 - * Use that data block. 2086 - */ 2087 - if (findex >= 0) { 2088 - ASSERT(findex < freehdr.nvalid); 2089 - ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 2090 - ASSERT(be16_to_cpu(bests[findex]) >= length); 2091 - dbno = freehdr.firstdb + findex; 2092 - } else { 2093 - /* 2094 - * The data block looked at didn't have enough room. 2095 - * We'll start at the beginning of the freespace entries. 2096 - */ 2097 - dbno = -1; 2098 - findex = 0; 2099 - } 2100 - } else { 2101 - /* 2102 - * Didn't come in with a freespace block, so no data block. 2103 - */ 2104 - ifbno = dbno = -1; 2105 - fbp = NULL; 2106 - findex = 0; 2107 - } 2108 - 2109 - /* 2110 - * If we don't have a data block yet, we're going to scan the 2111 - * freespace blocks looking for one. Figure out what the 2112 - * highest freespace block number is. 
2113 - */ 2114 - if (dbno == -1) { 2115 - xfs_fileoff_t fo; /* freespace block number */ 2116 - 2117 - if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) 2118 - return error; 2119 - lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); 2120 - fbno = ifbno; 2121 - } 2122 - /* 2123 - * While we haven't identified a data block, search the freeblock 2124 - * data for a good data block. If we find a null freeblock entry, 2125 - * indicating a hole in the data blocks, remember that. 2126 - */ 2127 - while (dbno == -1) { 2128 - /* 2129 - * If we don't have a freeblock in hand, get the next one. 2130 - */ 2131 - if (fbp == NULL) { 2132 - /* 2133 - * Happens the first time through unless lookup gave 2134 - * us a freespace block to start with. 2135 - */ 2136 - if (++fbno == 0) 2137 - fbno = xfs_dir2_byte_to_db(args->geo, 2138 - XFS_DIR2_FREE_OFFSET); 2139 - /* 2140 - * If it's ifbno we already looked at it. 2141 - */ 2142 - if (fbno == ifbno) 2143 - fbno++; 2144 - /* 2145 - * If it's off the end we're done. 2146 - */ 2147 - if (fbno >= lastfbno) 2148 - break; 2149 - /* 2150 - * Read the block. There can be holes in the 2151 - * freespace blocks, so this might not succeed. 2152 - * This should be really rare, so there's no reason 2153 - * to avoid it. 2154 - */ 2155 - error = xfs_dir2_free_try_read(tp, dp, 2156 - xfs_dir2_db_to_da(args->geo, fbno), 2157 - &fbp); 2158 - if (error) 2159 - return error; 2160 - if (!fbp) 2161 - continue; 2162 - free = fbp->b_addr; 2163 - findex = 0; 2164 - } 2165 - /* 2166 - * Look at the current free entry. Is it good enough? 2167 - * 2168 - * The bests initialisation should be where the bufer is read in 2169 - * the above branch. But gcc is too stupid to realise that bests 2170 - * and the freehdr are actually initialised if they are placed 2171 - * there, so we have to do it here to avoid warnings. Blech. 
2172 - */ 2173 - bests = dp->d_ops->free_bests_p(free); 2174 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2175 - if (be16_to_cpu(bests[findex]) != NULLDATAOFF && 2176 - be16_to_cpu(bests[findex]) >= length) 2177 - dbno = freehdr.firstdb + findex; 2178 - else { 2179 - /* 2180 - * Are we done with the freeblock? 2181 - */ 2182 - if (++findex == freehdr.nvalid) { 2183 - /* 2184 - * Drop the block. 2185 - */ 2186 - xfs_trans_brelse(tp, fbp); 2187 - fbp = NULL; 2188 - if (fblk && fblk->bp) 2189 - fblk->bp = NULL; 2190 - } 2191 - } 2192 - } 2193 - /* 2194 - * If we don't have a data block, we need to allocate one and make 2195 - * the freespace entries refer to it. 2196 - */ 2197 - if (unlikely(dbno == -1)) { 2198 - /* 2199 - * Not allowed to allocate, return failure. 2200 - */ 2201 - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) 2202 - return -ENOSPC; 2203 - 2204 - /* 2205 - * Allocate and initialize the new data block. 2206 - */ 2207 - if (unlikely((error = xfs_dir2_grow_inode(args, 2208 - XFS_DIR2_DATA_SPACE, 2209 - &dbno)) || 2210 - (error = xfs_dir3_data_init(args, dbno, &dbp)))) 2211 - return error; 2212 - 2213 - /* 2214 - * If (somehow) we have a freespace block, get rid of it. 2215 - */ 2216 - if (fbp) 2217 - xfs_trans_brelse(tp, fbp); 2218 - if (fblk && fblk->bp) 2219 - fblk->bp = NULL; 2220 - 2221 - /* 2222 - * Get the freespace block corresponding to the data block 2223 - * that was just allocated. 2224 - */ 2225 - fbno = dp->d_ops->db_to_fdb(args->geo, dbno); 2226 - error = xfs_dir2_free_try_read(tp, dp, 2227 - xfs_dir2_db_to_da(args->geo, fbno), 2228 - &fbp); 2229 - if (error) 2230 - return error; 2231 - 2232 - /* 2233 - * If there wasn't a freespace block, the read will 2234 - * return a NULL fbp. Allocate and initialize a new one. 
2235 - */ 2236 - if (!fbp) { 2237 - error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, 2238 - &fbno); 2239 - if (error) 2240 - return error; 2241 - 2242 - if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { 2243 - xfs_alert(mp, 2244 - "%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", 2245 - __func__, (unsigned long long)dp->i_ino, 2246 - (long long)dp->d_ops->db_to_fdb( 2247 - args->geo, dbno), 2248 - (long long)dbno, (long long)fbno, 2249 - (unsigned long long)ifbno, lastfbno); 2250 - if (fblk) { 2251 - xfs_alert(mp, 2252 - " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", 2253 - fblk, 2254 - (unsigned long long)fblk->blkno, 2255 - fblk->index, 2256 - fblk->magic); 2257 - } else { 2258 - xfs_alert(mp, " ... fblk is NULL"); 2259 - } 2260 - XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 2261 - XFS_ERRLEVEL_LOW, mp); 2262 - return -EFSCORRUPTED; 2263 - } 2264 - 2265 - /* 2266 - * Get a buffer for the new block. 2267 - */ 2268 - error = xfs_dir3_free_get_buf(args, fbno, &fbp); 2269 - if (error) 2270 - return error; 2271 - free = fbp->b_addr; 2272 - bests = dp->d_ops->free_bests_p(free); 2273 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2274 - 2275 - /* 2276 - * Remember the first slot as our empty slot. 2277 - */ 2278 - freehdr.firstdb = 2279 - (fbno - xfs_dir2_byte_to_db(args->geo, 2280 - XFS_DIR2_FREE_OFFSET)) * 2281 - dp->d_ops->free_max_bests(args->geo); 2282 - } else { 2283 - free = fbp->b_addr; 2284 - bests = dp->d_ops->free_bests_p(free); 2285 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2286 - } 2287 - 2288 - /* 2289 - * Set the freespace block index from the data block number. 2290 - */ 2291 - findex = dp->d_ops->db_to_fdindex(args->geo, dbno); 2292 - /* 2293 - * If it's after the end of the current entries in the 2294 - * freespace block, extend that table. 
2295 - */ 2296 - if (findex >= freehdr.nvalid) { 2297 - ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); 2298 - freehdr.nvalid = findex + 1; 2299 - /* 2300 - * Tag new entry so nused will go up. 2301 - */ 2302 - bests[findex] = cpu_to_be16(NULLDATAOFF); 2303 - } 2304 - /* 2305 - * If this entry was for an empty data block 2306 - * (this should always be true) then update the header. 2307 - */ 2308 - if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { 2309 - freehdr.nused++; 2310 - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); 2311 - xfs_dir2_free_log_header(args, fbp); 2312 - } 2313 - /* 2314 - * Update the real value in the table. 2315 - * We haven't allocated the data entry yet so this will 2316 - * change again. 2317 - */ 2318 - hdr = dbp->b_addr; 2319 - bf = dp->d_ops->data_bestfree_p(hdr); 2320 - bests[findex] = bf[0].length; 2321 - logfree = 1; 2322 - } 2323 - /* 2324 - * We had a data block so we don't have to make a new one. 2325 - */ 2326 - else { 2327 - /* 2328 - * If just checking, we succeeded. 2329 - */ 2330 - if (args->op_flags & XFS_DA_OP_JUSTCHECK) 2331 - return 0; 2332 - 2333 - /* 2334 - * Read the data block in. 2335 - */ 2336 - error = xfs_dir3_data_read(tp, dp, 2337 - xfs_dir2_db_to_da(args->geo, dbno), 2338 - -1, &dbp); 2339 - if (error) 2340 - return error; 2341 - hdr = dbp->b_addr; 2342 - bf = dp->d_ops->data_bestfree_p(hdr); 2343 - logfree = 0; 2344 - } 2345 - ASSERT(be16_to_cpu(bf[0].length) >= length); 2346 - /* 2347 - * Point to the existing unused space. 2348 - */ 2349 - dup = (xfs_dir2_data_unused_t *) 2350 - ((char *)hdr + be16_to_cpu(bf[0].offset)); 2351 - needscan = needlog = 0; 2352 - /* 2353 - * Mark the first part of the unused space, inuse for us. 
2354 - */ 2355 - aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 2356 - error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 2357 - &needlog, &needscan); 2358 - if (error) { 2359 - xfs_trans_brelse(tp, dbp); 2360 - return error; 2361 - } 2362 - /* 2363 - * Fill in the new entry and log it. 2364 - */ 2365 - dep = (xfs_dir2_data_entry_t *)dup; 2366 - dep->inumber = cpu_to_be64(args->inumber); 2367 - dep->namelen = args->namelen; 2368 - memcpy(dep->name, args->name, dep->namelen); 2369 - dp->d_ops->data_put_ftype(dep, args->filetype); 2370 - tagp = dp->d_ops->data_entry_tag_p(dep); 2371 - *tagp = cpu_to_be16((char *)dep - (char *)hdr); 2372 - xfs_dir2_data_log_entry(args, dbp, dep); 2373 - /* 2374 - * Rescan the block for bestfree if needed. 2375 - */ 2376 - if (needscan) 2377 - xfs_dir2_data_freescan(dp, hdr, &needlog); 2378 - /* 2379 - * Log the data block header if needed. 2380 - */ 2381 - if (needlog) 2382 - xfs_dir2_data_log_header(args, dbp); 2383 - /* 2384 - * If the freespace entry is now wrong, update it. 2385 - */ 2386 - bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ 2387 - if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { 2388 - bests[findex] = bf[0].length; 2389 - logfree = 1; 2390 - } 2391 - /* 2392 - * Log the freespace entry if needed. 2393 - */ 2394 - if (logfree) 2395 - xfs_dir2_free_log_bests(args, fbp, findex, findex); 2396 - /* 2397 - * Return the data block and offset in args, then drop the data block. 2398 - */ 2399 - args->blkno = (xfs_dablk_t)dbno; 2400 - args->index = be16_to_cpu(*tagp); 2401 - return 0; 2402 1680 } 2403 1681 2404 1682 /*
+4 -4
fs/xfs/libxfs/xfs_dir2_sf.c
··· 164 164 * can free the block and copy the formatted data into the inode literal 165 165 * area. 166 166 */ 167 - dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); 167 + dst = kmem_alloc(mp->m_sb.sb_inodesize, 0); 168 168 hdr = bp->b_addr; 169 169 170 170 /* ··· 436 436 437 437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 438 438 old_isize = (int)dp->i_d.di_size; 439 - buf = kmem_alloc(old_isize, KM_SLEEP); 439 + buf = kmem_alloc(old_isize, 0); 440 440 oldsfp = (xfs_dir2_sf_hdr_t *)buf; 441 441 memcpy(oldsfp, sfp, old_isize); 442 442 /* ··· 1096 1096 * Don't want xfs_idata_realloc copying the data here. 1097 1097 */ 1098 1098 oldsize = dp->i_df.if_bytes; 1099 - buf = kmem_alloc(oldsize, KM_SLEEP); 1099 + buf = kmem_alloc(oldsize, 0); 1100 1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1101 1101 ASSERT(oldsfp->i8count == 1); 1102 1102 memcpy(buf, oldsfp, oldsize); ··· 1169 1169 * Don't want xfs_idata_realloc copying the data here. 1170 1170 */ 1171 1171 oldsize = dp->i_df.if_bytes; 1172 - buf = kmem_alloc(oldsize, KM_SLEEP); 1172 + buf = kmem_alloc(oldsize, 0); 1173 1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1174 1174 ASSERT(oldsfp->i8count == 0); 1175 1175 memcpy(buf, oldsfp, oldsize);
+1 -1
fs/xfs/libxfs/xfs_fs.h
··· 287 287 uint32_t ag_ifree; /* o: inodes free */ 288 288 uint32_t ag_sick; /* o: sick things in ag */ 289 289 uint32_t ag_checked; /* o: checked metadata in ag */ 290 - uint32_t ag_reserved32; /* o: zero */ 290 + uint32_t ag_flags; /* i/o: flags for this ag */ 291 291 uint64_t ag_reserved[12];/* o: zero */ 292 292 }; 293 293 #define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
+7 -2
fs/xfs/libxfs/xfs_ialloc.c
··· 2787 2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, 2788 2788 inodes); 2789 2789 2790 - /* Set the maximum inode count for this filesystem. */ 2791 - if (sbp->sb_imax_pct) { 2790 + /* 2791 + * Set the maximum inode count for this filesystem, being careful not 2792 + * to use obviously garbage sb_inopblog/sb_inopblock values. Regular 2793 + * users should never get here due to failing sb verification, but 2794 + * certain users (xfs_db) need to be usable even with corrupt metadata. 2795 + */ 2796 + if (sbp->sb_imax_pct && igeo->ialloc_blks) { 2792 2797 /* 2793 2798 * Make sure the maximum inode count is a multiple 2794 2799 * of the units we allocate inodes in.
+4 -4
fs/xfs/libxfs/xfs_iext_tree.c
··· 616 616 * sequence counter is seen before the modifications to the extent tree itself 617 617 * take effect. 618 618 */ 619 - static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 619 + static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) 620 620 { 621 621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); 622 622 } ··· 633 633 struct xfs_iext_leaf *new = NULL; 634 634 int nr_entries, i; 635 635 636 - xfs_iext_inc_seq(ifp, state); 636 + xfs_iext_inc_seq(ifp); 637 637 638 638 if (ifp->if_height == 0) 639 639 xfs_iext_alloc_root(ifp, cur); ··· 875 875 ASSERT(ifp->if_u1.if_root != NULL); 876 876 ASSERT(xfs_iext_valid(ifp, cur)); 877 877 878 - xfs_iext_inc_seq(ifp, state); 878 + xfs_iext_inc_seq(ifp); 879 879 880 880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; 881 881 for (i = cur->pos; i < nr_entries; i++) ··· 983 983 { 984 984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); 985 985 986 - xfs_iext_inc_seq(ifp, state); 986 + xfs_iext_inc_seq(ifp); 987 987 988 988 if (cur->pos == 0) { 989 989 struct xfs_bmbt_irec old;
+8 -8
fs/xfs/libxfs/xfs_inode_fork.c
··· 94 94 return 0; 95 95 96 96 ASSERT(ip->i_afp == NULL); 97 - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 97 + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); 98 98 99 99 switch (dip->di_aformat) { 100 100 case XFS_DINODE_FMT_LOCAL: ··· 147 147 148 148 if (size) { 149 149 real_size = roundup(mem_size, 4); 150 - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 150 + ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); 151 151 memcpy(ifp->if_u1.if_data, data, size); 152 152 if (zero_terminate) 153 153 ifp->if_u1.if_data[size] = '\0'; ··· 302 302 } 303 303 304 304 ifp->if_broot_bytes = size; 305 - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 305 + ifp->if_broot = kmem_alloc(size, KM_NOFS); 306 306 ASSERT(ifp->if_broot != NULL); 307 307 /* 308 308 * Copy and convert from the on-disk structure ··· 367 367 */ 368 368 if (ifp->if_broot_bytes == 0) { 369 369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); 370 - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 370 + ifp->if_broot = kmem_alloc(new_size, KM_NOFS); 371 371 ifp->if_broot_bytes = (int)new_size; 372 372 return; 373 373 } ··· 382 382 new_max = cur_max + rec_diff; 383 383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); 384 384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 385 - KM_SLEEP | KM_NOFS); 385 + KM_NOFS); 386 386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 387 387 ifp->if_broot_bytes); 388 388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ··· 408 408 else 409 409 new_size = 0; 410 410 if (new_size > 0) { 411 - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 411 + new_broot = kmem_alloc(new_size, KM_NOFS); 412 412 /* 413 413 * First copy over the btree block header. 414 414 */ ··· 492 492 * We enforce that here. 
493 493 */ 494 494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, 495 - roundup(new_size, 4), KM_SLEEP | KM_NOFS); 495 + roundup(new_size, 4), KM_NOFS); 496 496 ifp->if_bytes = new_size; 497 497 } 498 498 ··· 683 683 return; 684 684 685 685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 686 - KM_SLEEP | KM_NOFS); 686 + KM_NOFS); 687 687 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 688 688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 689 689 ip->i_cnextents = 0;
+20 -30
fs/xfs/libxfs/xfs_refcount.c
··· 1174 1174 /* 1175 1175 * Record a refcount intent for later processing. 1176 1176 */ 1177 - static int 1177 + static void 1178 1178 __xfs_refcount_add( 1179 1179 struct xfs_trans *tp, 1180 1180 enum xfs_refcount_intent_type type, ··· 1189 1189 blockcount); 1190 1190 1191 1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent), 1192 - KM_SLEEP | KM_NOFS); 1192 + KM_NOFS); 1193 1193 INIT_LIST_HEAD(&ri->ri_list); 1194 1194 ri->ri_type = type; 1195 1195 ri->ri_startblock = startblock; 1196 1196 ri->ri_blockcount = blockcount; 1197 1197 1198 1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); 1199 - return 0; 1200 1199 } 1201 1200 1202 1201 /* 1203 1202 * Increase the reference count of the blocks backing a file's extent. 1204 1203 */ 1205 - int 1204 + void 1206 1205 xfs_refcount_increase_extent( 1207 1206 struct xfs_trans *tp, 1208 1207 struct xfs_bmbt_irec *PREV) 1209 1208 { 1210 1209 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1211 - return 0; 1210 + return; 1212 1211 1213 - return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, 1214 - PREV->br_startblock, PREV->br_blockcount); 1212 + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, 1213 + PREV->br_blockcount); 1215 1214 } 1216 1215 1217 1216 /* 1218 1217 * Decrease the reference count of the blocks backing a file's extent. 1219 1218 */ 1220 - int 1219 + void 1221 1220 xfs_refcount_decrease_extent( 1222 1221 struct xfs_trans *tp, 1223 1222 struct xfs_bmbt_irec *PREV) 1224 1223 { 1225 1224 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1226 - return 0; 1225 + return; 1227 1226 1228 - return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, 1229 - PREV->br_startblock, PREV->br_blockcount); 1227 + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, 1228 + PREV->br_blockcount); 1230 1229 } 1231 1230 1232 1231 /* ··· 1540 1541 } 1541 1542 1542 1543 /* Record a CoW staging extent in the refcount btree. 
*/ 1543 - int 1544 + void 1544 1545 xfs_refcount_alloc_cow_extent( 1545 1546 struct xfs_trans *tp, 1546 1547 xfs_fsblock_t fsb, 1547 1548 xfs_extlen_t len) 1548 1549 { 1549 1550 struct xfs_mount *mp = tp->t_mountp; 1550 - int error; 1551 1551 1552 1552 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1553 - return 0; 1553 + return; 1554 1554 1555 - error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1556 - if (error) 1557 - return error; 1555 + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1558 1556 1559 1557 /* Add rmap entry */ 1560 - return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1558 + xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1561 1559 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1562 1560 } 1563 1561 1564 1562 /* Forget a CoW staging event in the refcount btree. */ 1565 - int 1563 + void 1566 1564 xfs_refcount_free_cow_extent( 1567 1565 struct xfs_trans *tp, 1568 1566 xfs_fsblock_t fsb, 1569 1567 xfs_extlen_t len) 1570 1568 { 1571 1569 struct xfs_mount *mp = tp->t_mountp; 1572 - int error; 1573 1570 1574 1571 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1575 - return 0; 1572 + return; 1576 1573 1577 1574 /* Remove rmap entry */ 1578 - error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1575 + xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1579 1576 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1580 - if (error) 1581 - return error; 1582 - 1583 - return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); 1577 + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); 1584 1578 } 1585 1579 1586 1580 struct xfs_refcount_recovery { ··· 1594 1602 if (be32_to_cpu(rec->refc.rc_refcount) != 1) 1595 1603 return -EFSCORRUPTED; 1596 1604 1597 - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); 1605 + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); 1598 1606 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 1599 1607 list_add_tail(&rr->rr_list, debris); 1600 1608 ··· 1671 1679 /* 
Free the orphan record */ 1672 1680 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; 1673 1681 fsb = XFS_AGB_TO_FSB(mp, agno, agbno); 1674 - error = xfs_refcount_free_cow_extent(tp, fsb, 1682 + xfs_refcount_free_cow_extent(tp, fsb, 1675 1683 rr->rr_rrec.rc_blockcount); 1676 - if (error) 1677 - goto out_trans; 1678 1684 1679 1685 /* Free the block. */ 1680 1686 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+6 -6
fs/xfs/libxfs/xfs_refcount.h
··· 29 29 xfs_extlen_t ri_blockcount; 30 30 }; 31 31 32 - extern int xfs_refcount_increase_extent(struct xfs_trans *tp, 32 + void xfs_refcount_increase_extent(struct xfs_trans *tp, 33 33 struct xfs_bmbt_irec *irec); 34 - extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, 34 + void xfs_refcount_decrease_extent(struct xfs_trans *tp, 35 35 struct xfs_bmbt_irec *irec); 36 36 37 37 extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, ··· 45 45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 46 46 xfs_extlen_t *flen, bool find_end_of_shared); 47 47 48 - extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, 49 - xfs_fsblock_t fsb, xfs_extlen_t len); 50 - extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, 51 - xfs_fsblock_t fsb, xfs_extlen_t len); 48 + void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, 49 + xfs_extlen_t len); 50 + void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, 51 + xfs_extlen_t len); 52 52 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 53 53 xfs_agnumber_t agno); 54 54
+30 -29
fs/xfs/libxfs/xfs_rmap.c
··· 168 168 union xfs_btree_rec *rec, 169 169 struct xfs_rmap_irec *irec) 170 170 { 171 - irec->rm_flags = 0; 172 171 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); 173 172 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); 174 173 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); ··· 253 254 rec->rm_flags); 254 255 255 256 if (rec->rm_owner != info->high.rm_owner) 256 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 257 + return 0; 257 258 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 258 259 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 259 260 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) 260 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 261 + return 0; 261 262 262 263 *info->irec = *rec; 263 264 *info->stat = 1; 264 - return XFS_BTREE_QUERY_RANGE_ABORT; 265 + return -ECANCELED; 265 266 } 266 267 267 268 /* ··· 304 305 305 306 error = xfs_rmap_query_range(cur, &info.high, &info.high, 306 307 xfs_rmap_find_left_neighbor_helper, &info); 307 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) 308 + if (error == -ECANCELED) 308 309 error = 0; 309 310 if (*stat) 310 311 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, ··· 329 330 rec->rm_flags); 330 331 331 332 if (rec->rm_owner != info->high.rm_owner) 332 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 333 + return 0; 333 334 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 334 335 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 335 336 (rec->rm_offset > info->high.rm_offset || 336 337 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) 337 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 338 + return 0; 338 339 339 340 *info->irec = *rec; 340 341 *info->stat = 1; 341 - return XFS_BTREE_QUERY_RANGE_ABORT; 342 + return -ECANCELED; 342 343 } 343 344 344 345 /* ··· 376 377 cur->bc_private.a.agno, bno, 0, owner, offset, flags); 377 378 error = xfs_rmap_query_range(cur, &info.high, &info.high, 378 379 xfs_rmap_lookup_le_range_helper, &info); 379 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) 380 + if (error == 
-ECANCELED) 380 381 error = 0; 381 382 if (*stat) 382 383 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, ··· 2267 2268 * Record a rmap intent; the list is kept sorted first by AG and then by 2268 2269 * increasing age. 2269 2270 */ 2270 - static int 2271 + static void 2271 2272 __xfs_rmap_add( 2272 2273 struct xfs_trans *tp, 2273 2274 enum xfs_rmap_intent_type type, ··· 2286 2287 bmap->br_blockcount, 2287 2288 bmap->br_state); 2288 2289 2289 - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); 2290 + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); 2290 2291 INIT_LIST_HEAD(&ri->ri_list); 2291 2292 ri->ri_type = type; 2292 2293 ri->ri_owner = owner; ··· 2294 2295 ri->ri_bmap = *bmap; 2295 2296 2296 2297 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); 2297 - return 0; 2298 2298 } 2299 2299 2300 2300 /* Map an extent into a file. */ 2301 - int 2301 + void 2302 2302 xfs_rmap_map_extent( 2303 2303 struct xfs_trans *tp, 2304 2304 struct xfs_inode *ip, ··· 2305 2307 struct xfs_bmbt_irec *PREV) 2306 2308 { 2307 2309 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2308 - return 0; 2310 + return; 2309 2311 2310 - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2312 + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2311 2313 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, 2312 2314 whichfork, PREV); 2313 2315 } 2314 2316 2315 2317 /* Unmap an extent out of a file. */ 2316 - int 2318 + void 2317 2319 xfs_rmap_unmap_extent( 2318 2320 struct xfs_trans *tp, 2319 2321 struct xfs_inode *ip, ··· 2321 2323 struct xfs_bmbt_irec *PREV) 2322 2324 { 2323 2325 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2324 - return 0; 2326 + return; 2325 2327 2326 - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2328 + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 
2327 2329 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, 2328 2330 whichfork, PREV); 2329 2331 } ··· 2334 2336 * Note that tp can be NULL here as no transaction is used for COW fork 2335 2337 * unwritten conversion. 2336 2338 */ 2337 - int 2339 + void 2338 2340 xfs_rmap_convert_extent( 2339 2341 struct xfs_mount *mp, 2340 2342 struct xfs_trans *tp, ··· 2343 2345 struct xfs_bmbt_irec *PREV) 2344 2346 { 2345 2347 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2346 - return 0; 2348 + return; 2347 2349 2348 - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2350 + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2349 2351 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, 2350 2352 whichfork, PREV); 2351 2353 } 2352 2354 2353 2355 /* Schedule the creation of an rmap for non-file data. */ 2354 - int 2356 + void 2355 2357 xfs_rmap_alloc_extent( 2356 2358 struct xfs_trans *tp, 2357 2359 xfs_agnumber_t agno, ··· 2362 2364 struct xfs_bmbt_irec bmap; 2363 2365 2364 2366 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2365 - return 0; 2367 + return; 2366 2368 2367 2369 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2368 2370 bmap.br_blockcount = len; 2369 2371 bmap.br_startoff = 0; 2370 2372 bmap.br_state = XFS_EXT_NORM; 2371 2373 2372 - return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2374 + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2373 2375 } 2374 2376 2375 2377 /* Schedule the deletion of an rmap for non-file data. 
*/ 2376 - int 2378 + void 2377 2379 xfs_rmap_free_extent( 2378 2380 struct xfs_trans *tp, 2379 2381 xfs_agnumber_t agno, ··· 2384 2386 struct xfs_bmbt_irec bmap; 2385 2387 2386 2388 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2387 - return 0; 2389 + return; 2388 2390 2389 2391 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2390 2392 bmap.br_blockcount = len; 2391 2393 bmap.br_startoff = 0; 2392 2394 bmap.br_state = XFS_EXT_NORM; 2393 2395 2394 - return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2396 + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2395 2397 } 2396 2398 2397 2399 /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ ··· 2509 2511 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) 2510 2512 return 0; 2511 2513 rks->has_rmap = true; 2512 - return XFS_BTREE_QUERY_RANGE_ABORT; 2514 + return -ECANCELED; 2513 2515 } 2514 2516 2515 2517 /* ··· 2538 2540 2539 2541 error = xfs_rmap_query_range(cur, &low, &high, 2540 2542 xfs_rmap_has_other_keys_helper, &rks); 2543 + if (error < 0) 2544 + return error; 2545 + 2541 2546 *has_rmap = rks.has_rmap; 2542 - return error; 2547 + return 0; 2543 2548 } 2544 2549 2545 2550 const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
+6 -5
fs/xfs/libxfs/xfs_rmap.h
··· 68 68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) 69 69 return -EFSCORRUPTED; 70 70 irec->rm_offset = XFS_RMAP_OFF(offset); 71 + irec->rm_flags = 0; 71 72 if (offset & XFS_RMAP_OFF_ATTR_FORK) 72 73 irec->rm_flags |= XFS_RMAP_ATTR_FORK; 73 74 if (offset & XFS_RMAP_OFF_BMBT_BLOCK) ··· 162 161 }; 163 162 164 163 /* functions for updating the rmapbt based on bmbt map/unmap operations */ 165 - int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 164 + void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 166 165 int whichfork, struct xfs_bmbt_irec *imap); 167 - int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 166 + void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 168 167 int whichfork, struct xfs_bmbt_irec *imap); 169 - int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 168 + void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 170 169 struct xfs_inode *ip, int whichfork, 171 170 struct xfs_bmbt_irec *imap); 172 - int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 171 + void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 173 172 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 174 - int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 173 + void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 175 174 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 176 175 177 176 void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
-6
fs/xfs/libxfs/xfs_shared.h
··· 177 177 unsigned int agino_log; /* #bits for agino in inum */ 178 178 }; 179 179 180 - /* Keep iterating the data structure. */ 181 - #define XFS_ITER_CONTINUE (0) 182 - 183 - /* Stop iterating the data structure. */ 184 - #define XFS_ITER_ABORT (1) 185 - 186 180 #endif /* __XFS_SHARED_H__ */
+8
fs/xfs/libxfs/xfs_types.h
··· 169 169 xfs_exntst_t br_state; /* extent state */ 170 170 } xfs_bmbt_irec_t; 171 171 172 + /* per-AG block reservation types */ 173 + enum xfs_ag_resv_type { 174 + XFS_AG_RESV_NONE = 0, 175 + XFS_AG_RESV_AGFL, 176 + XFS_AG_RESV_METADATA, 177 + XFS_AG_RESV_RMAPBT, 178 + }; 179 + 172 180 /* 173 181 * Type verifier functions 174 182 */
+2 -2
fs/xfs/scrub/agheader.c
··· 639 639 xchk_agfl_block_xref(sc, agbno); 640 640 641 641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 642 - return XFS_ITER_ABORT; 642 + return -ECANCELED; 643 643 644 644 return 0; 645 645 } ··· 730 730 /* Check the blocks in the AGFL. */ 731 731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), 732 732 sc->sa.agfl_bp, xchk_agfl_block, &sai); 733 - if (error == XFS_ITER_ABORT) { 733 + if (error == -ECANCELED) { 734 734 error = 0; 735 735 goto out_free; 736 736 }
+2 -4
fs/xfs/scrub/attr.c
··· 80 80 * without the inode lock held, which means we can sleep. 81 81 */ 82 82 if (sc->flags & XCHK_TRY_HARDER) { 83 - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); 83 + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); 84 84 if (error) 85 85 return error; 86 86 } ··· 163 163 args.valuelen = valuelen; 164 164 165 165 error = xfs_attr_get_ilocked(context->dp, &args); 166 - if (error == -EEXIST) 167 - error = 0; 168 166 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, 169 167 &error)) 170 168 goto fail_xref; ··· 171 173 args.blkno); 172 174 fail_xref: 173 175 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 174 - context->seen_enough = XFS_ITER_ABORT; 176 + context->seen_enough = 1; 175 177 return; 176 178 } 177 179
+48 -33
fs/xfs/scrub/bmap.c
··· 75 75 xfs_fileoff_t lastoff; 76 76 bool is_rt; 77 77 bool is_shared; 78 + bool was_loaded; 78 79 int whichfork; 79 80 }; 80 81 ··· 214 213 215 214 /* Cross-reference a single rtdev extent record. */ 216 215 STATIC void 217 - xchk_bmap_rt_extent_xref( 218 - struct xchk_bmap_info *info, 216 + xchk_bmap_rt_iextent_xref( 219 217 struct xfs_inode *ip, 220 - struct xfs_btree_cur *cur, 218 + struct xchk_bmap_info *info, 221 219 struct xfs_bmbt_irec *irec) 222 220 { 223 - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 224 - return; 225 - 226 221 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 227 222 irec->br_blockcount); 228 223 } 229 224 230 225 /* Cross-reference a single datadev extent record. */ 231 226 STATIC void 232 - xchk_bmap_extent_xref( 233 - struct xchk_bmap_info *info, 227 + xchk_bmap_iextent_xref( 234 228 struct xfs_inode *ip, 235 - struct xfs_btree_cur *cur, 229 + struct xchk_bmap_info *info, 236 230 struct xfs_bmbt_irec *irec) 237 231 { 238 232 struct xfs_mount *mp = info->sc->mp; ··· 235 239 xfs_agblock_t agbno; 236 240 xfs_extlen_t len; 237 241 int error; 238 - 239 - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 240 - return; 241 242 242 243 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); 243 244 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); ··· 293 300 294 301 /* Scrub a single extent record. */ 295 302 STATIC int 296 - xchk_bmap_extent( 303 + xchk_bmap_iextent( 297 304 struct xfs_inode *ip, 298 - struct xfs_btree_cur *cur, 299 305 struct xchk_bmap_info *info, 300 306 struct xfs_bmbt_irec *irec) 301 307 { 302 308 struct xfs_mount *mp = info->sc->mp; 303 - struct xfs_buf *bp = NULL; 304 309 xfs_filblks_t end; 305 310 int error = 0; 306 - 307 - if (cur) 308 - xfs_btree_get_block(cur, 0, &bp); 309 311 310 312 /* 311 313 * Check for out-of-order extents. 
This record could have come ··· 352 364 xchk_fblock_set_corrupt(info->sc, info->whichfork, 353 365 irec->br_startoff); 354 366 367 + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 368 + return 0; 369 + 355 370 if (info->is_rt) 356 - xchk_bmap_rt_extent_xref(info, ip, cur, irec); 371 + xchk_bmap_rt_iextent_xref(ip, info, irec); 357 372 else 358 - xchk_bmap_extent_xref(info, ip, cur, irec); 373 + xchk_bmap_iextent_xref(ip, info, irec); 359 374 360 375 info->lastoff = irec->br_startoff + irec->br_blockcount; 361 376 return error; ··· 371 380 union xfs_btree_rec *rec) 372 381 { 373 382 struct xfs_bmbt_irec irec; 383 + struct xfs_bmbt_irec iext_irec; 384 + struct xfs_iext_cursor icur; 374 385 struct xchk_bmap_info *info = bs->private; 375 386 struct xfs_inode *ip = bs->cur->bc_private.b.ip; 376 387 struct xfs_buf *bp = NULL; 377 388 struct xfs_btree_block *block; 389 + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); 378 390 uint64_t owner; 379 391 int i; 380 392 ··· 396 402 } 397 403 } 398 404 399 - /* Set up the in-core record and scrub it. */ 405 + /* 406 + * Check that the incore extent tree contains an extent that matches 407 + * this one exactly. We validate those cached bmaps later, so we don't 408 + * need to check them here. If the incore extent tree was just loaded 409 + * from disk by the scrubber, we assume that its contents match what's 410 + * on disk (we still hold the ILOCK) and skip the equivalence check. 
411 + */ 412 + if (!info->was_loaded) 413 + return 0; 414 + 400 415 xfs_bmbt_disk_get_all(&rec->bmbt, &irec); 401 - return xchk_bmap_extent(ip, bs->cur, info, &irec); 416 + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, 417 + &iext_irec) || 418 + irec.br_startoff != iext_irec.br_startoff || 419 + irec.br_startblock != iext_irec.br_startblock || 420 + irec.br_blockcount != iext_irec.br_blockcount || 421 + irec.br_state != iext_irec.br_state) 422 + xchk_fblock_set_corrupt(bs->sc, info->whichfork, 423 + irec.br_startoff); 424 + return 0; 402 425 } 403 426 404 427 /* Scan the btree records. */ ··· 426 415 struct xchk_bmap_info *info) 427 416 { 428 417 struct xfs_owner_info oinfo; 418 + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); 429 419 struct xfs_mount *mp = sc->mp; 430 420 struct xfs_inode *ip = sc->ip; 431 421 struct xfs_btree_cur *cur; 432 422 int error; 433 423 424 + /* Load the incore bmap cache if it's not loaded. */ 425 + info->was_loaded = ifp->if_flags & XFS_IFEXTENTS; 426 + if (!info->was_loaded) { 427 + error = xfs_iread_extents(sc->tp, ip, whichfork); 428 + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 429 + goto out; 430 + } 431 + 432 + /* Check the btree structure. 
*/ 434 433 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); 435 434 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); 436 435 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); 437 436 xfs_btree_del_cursor(cur, error); 437 + out: 438 438 return error; 439 439 } 440 440 ··· 522 500 523 501 out: 524 502 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 525 - return XFS_BTREE_QUERY_RANGE_ABORT; 503 + return -ECANCELED; 526 504 return 0; 527 505 } 528 506 ··· 551 529 sbcri.sc = sc; 552 530 sbcri.whichfork = whichfork; 553 531 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 554 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) 532 + if (error == -ECANCELED) 555 533 error = 0; 556 534 557 535 xfs_btree_del_cursor(cur, error); ··· 693 671 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 694 672 goto out; 695 673 696 - /* Now try to scrub the in-memory extent list. */ 697 - if (!(ifp->if_flags & XFS_IFEXTENTS)) { 698 - error = xfs_iread_extents(sc->tp, ip, whichfork); 699 - if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 700 - goto out; 701 - } 702 - 703 674 /* Find the offset of the last extent in the mapping. */ 704 675 error = xfs_bmap_last_offset(ip, &endoff, whichfork); 705 676 if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) ··· 704 689 for_each_xfs_iext(ifp, &icur, &irec) { 705 690 if (xchk_should_terminate(sc, &error) || 706 691 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 707 - break; 692 + goto out; 708 693 if (isnullstartblock(irec.br_startblock)) 709 694 continue; 710 695 if (irec.br_startoff >= endoff) { ··· 712 697 irec.br_startoff); 713 698 goto out; 714 699 } 715 - error = xchk_bmap_extent(ip, NULL, &info, &irec); 700 + error = xchk_bmap_iextent(ip, &info, &irec); 716 701 if (error) 717 702 goto out; 718 703 }
+1 -1
fs/xfs/scrub/fscounters.c
··· 125 125 struct xchk_fscounters *fsc; 126 126 int error; 127 127 128 - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 128 + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); 129 129 if (!sc->buf) 130 130 return -ENOMEM; 131 131 fsc = sc->buf;
+3 -3
fs/xfs/scrub/repair.c
··· 351 351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 352 352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); 353 353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); 354 - xfs_trans_log_buf(tp, bp, 0, bp->b_length); 354 + xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); 355 355 bp->b_ops = ops; 356 356 *bpp = bp; 357 357 ··· 664 664 { 665 665 xfs_agblock_t *agbno = priv; 666 666 667 - return (*agbno == bno) ? XFS_ITER_ABORT : 0; 667 + return (*agbno == bno) ? -ECANCELED : 0; 668 668 } 669 669 670 670 /* Does this block match the btree information passed in? */ ··· 694 694 if (owner == XFS_RMAP_OWN_AG) { 695 695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, 696 696 xrep_findroot_agfl_walk, &agbno); 697 - if (error == XFS_ITER_ABORT) 697 + if (error == -ECANCELED) 698 698 return 0; 699 699 if (error) 700 700 return error;
+1 -1
fs/xfs/scrub/symlink.c
··· 22 22 struct xfs_inode *ip) 23 23 { 24 24 /* Allocate the buffer without the inode lock held. */ 25 - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); 25 + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); 26 26 if (!sc->buf) 27 27 return -ENOMEM; 28 28
+5 -9
fs/xfs/xfs_acl.c
··· 112 112 { 113 113 struct xfs_inode *ip = XFS_I(inode); 114 114 struct posix_acl *acl = NULL; 115 - struct xfs_acl *xfs_acl; 115 + struct xfs_acl *xfs_acl = NULL; 116 116 unsigned char *ea_name; 117 117 int error; 118 118 int len; ··· 135 135 * go out to the disk. 136 136 */ 137 137 len = XFS_ACL_MAX_SIZE(ip->i_mount); 138 - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 139 - if (!xfs_acl) 140 - return ERR_PTR(-ENOMEM); 141 - 142 - error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, 143 - &len, ATTR_ROOT); 138 + error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, 139 + ATTR_ALLOC | ATTR_ROOT); 144 140 if (error) { 145 141 /* 146 142 * If the attribute doesn't exist make sure we have a negative ··· 147 151 } else { 148 152 acl = xfs_acl_from_disk(xfs_acl, len, 149 153 XFS_ACL_MAX_ENTRIES(ip->i_mount)); 154 + kmem_free(xfs_acl); 150 155 } 151 - kmem_free(xfs_acl); 152 156 return acl; 153 157 } 154 158 ··· 176 180 struct xfs_acl *xfs_acl; 177 181 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 178 182 179 - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 183 + xfs_acl = kmem_zalloc_large(len, 0); 180 184 if (!xfs_acl) 181 185 return -ENOMEM; 182 186
+1 -1
fs/xfs/xfs_attr_inactive.c
··· 147 147 * Allocate storage for a list of all the "remote" value extents. 148 148 */ 149 149 size = count * sizeof(xfs_attr_inactive_list_t); 150 - list = kmem_alloc(size, KM_SLEEP); 150 + list = kmem_alloc(size, 0); 151 151 152 152 /* 153 153 * Identify each of the "remote" value extents.
+1 -1
fs/xfs/xfs_attr_list.c
··· 109 109 * It didn't all fit, so we have to sort everything on hashval. 110 110 */ 111 111 sbsize = sf->hdr.count * sizeof(*sbuf); 112 - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); 112 + sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); 113 113 114 114 /* 115 115 * Scan the attribute list for the rest of the entries, storing
+3 -5
fs/xfs/xfs_bmap_item.c
··· 141 141 { 142 142 struct xfs_bui_log_item *buip; 143 143 144 - buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); 144 + buip = kmem_zone_zalloc(xfs_bui_zone, 0); 145 145 146 146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); 147 147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; ··· 218 218 { 219 219 struct xfs_bud_log_item *budp; 220 220 221 - budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); 221 + budp = kmem_zone_zalloc(xfs_bud_zone, 0); 222 222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, 223 223 &xfs_bud_item_ops); 224 224 budp->bud_buip = buip; ··· 542 542 irec.br_blockcount = count; 543 543 irec.br_startoff = bmap->me_startoff; 544 544 irec.br_state = state; 545 - error = xfs_bmap_unmap_extent(tp, ip, &irec); 546 - if (error) 547 - goto err_inode; 545 + xfs_bmap_unmap_extent(tp, ip, &irec); 548 546 } 549 547 550 548 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+7 -15
fs/xfs/xfs_bmap_util.c
··· 39 39 xfs_daddr_t 40 40 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 41 41 { 42 - return (XFS_IS_REALTIME_INODE(ip) ? \ 43 - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ 44 - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); 42 + if (XFS_IS_REALTIME_INODE(ip)) 43 + return XFS_FSB_TO_BB(ip->i_mount, fsb); 44 + return XFS_FSB_TO_DADDR(ip->i_mount, fsb); 45 45 } 46 46 47 47 /* ··· 1532 1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); 1533 1533 1534 1534 /* Remove the mapping from the donor file. */ 1535 - error = xfs_bmap_unmap_extent(tp, tip, &uirec); 1536 - if (error) 1537 - goto out; 1535 + xfs_bmap_unmap_extent(tp, tip, &uirec); 1538 1536 1539 1537 /* Remove the mapping from the source file. */ 1540 - error = xfs_bmap_unmap_extent(tp, ip, &irec); 1541 - if (error) 1542 - goto out; 1538 + xfs_bmap_unmap_extent(tp, ip, &irec); 1543 1539 1544 1540 /* Map the donor file's blocks into the source file. */ 1545 - error = xfs_bmap_map_extent(tp, ip, &uirec); 1546 - if (error) 1547 - goto out; 1541 + xfs_bmap_map_extent(tp, ip, &uirec); 1548 1542 1549 1543 /* Map the source file's blocks into the donor file. */ 1550 - error = xfs_bmap_map_extent(tp, tip, &irec); 1551 - if (error) 1552 - goto out; 1544 + xfs_bmap_map_extent(tp, tip, &irec); 1553 1545 1554 1546 error = xfs_defer_finish(tpp); 1555 1547 tp = *tpp;
+4 -3
fs/xfs/xfs_buf.c
··· 353 353 */ 354 354 size = BBTOB(bp->b_length); 355 355 if (size < PAGE_SIZE) { 356 - bp->b_addr = kmem_alloc(size, KM_NOFS); 356 + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); 357 + bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); 357 358 if (!bp->b_addr) { 358 359 /* low memory - use alloc_page loop instead */ 359 360 goto use_alloc_page; ··· 369 368 } 370 369 bp->b_offset = offset_in_page(bp->b_addr); 371 370 bp->b_pages = bp->b_page_array; 372 - bp->b_pages[0] = virt_to_page(bp->b_addr); 371 + bp->b_pages[0] = kmem_to_page(bp->b_addr); 373 372 bp->b_page_count = 1; 374 373 bp->b_flags |= _XBF_KMEM; 375 374 return 0; ··· 1742 1741 { 1743 1742 xfs_buftarg_t *btp; 1744 1743 1745 - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1744 + btp = kmem_zalloc(sizeof(*btp), KM_NOFS); 1746 1745 1747 1746 btp->bt_mount = mp; 1748 1747 btp->bt_dev = bdev->bd_dev;
+6
fs/xfs/xfs_buf.h
··· 350 350 #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 351 351 #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 352 352 353 + static inline int 354 + xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) 355 + { 356 + return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); 357 + } 358 + 353 359 int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 354 360 bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); 355 361 bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+2 -2
fs/xfs/xfs_buf_item.c
··· 702 702 } 703 703 704 704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 705 - KM_SLEEP); 705 + 0); 706 706 if (!bip->bli_formats) 707 707 return -ENOMEM; 708 708 return 0; ··· 747 747 return 0; 748 748 } 749 749 750 - bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); 750 + bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); 751 751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); 752 752 bip->bli_buf = bp; 753 753
+2 -2
fs/xfs/xfs_dquot.c
··· 440 440 { 441 441 struct xfs_dquot *dqp; 442 442 443 - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 443 + dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); 444 444 445 445 dqp->dq_flags = type; 446 446 dqp->q_core.d_id = cpu_to_be32(id); ··· 1239 1239 /* 1240 1240 * Iterate every dquot of a particular type. The caller must ensure that the 1241 1241 * particular quota type is active. iter_fn can return negative error codes, 1242 - * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 1242 + * or -ECANCELED to indicate that it wants to stop iterating. 1243 1243 */ 1244 1244 int 1245 1245 xfs_qm_dqiterate(
+1 -1
fs/xfs/xfs_dquot_item.c
··· 347 347 { 348 348 struct xfs_qoff_logitem *qf; 349 349 350 - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); 350 + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); 351 351 352 352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 353 353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
+1 -1
fs/xfs/xfs_error.c
··· 213 213 struct xfs_mount *mp) 214 214 { 215 215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, 216 - KM_SLEEP | KM_MAYFAIL); 216 + KM_MAYFAIL); 217 217 if (!mp->m_errortag) 218 218 return -ENOMEM; 219 219
+1 -1
fs/xfs/xfs_extent_busy.c
··· 33 33 struct rb_node **rbp; 34 34 struct rb_node *parent = NULL; 35 35 36 - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); 36 + new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); 37 37 new->agno = agno; 38 38 new->bno = bno; 39 39 new->length = len;
+4 -4
fs/xfs/xfs_extfree_item.c
··· 163 163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { 164 164 size = (uint)(sizeof(xfs_efi_log_item_t) + 165 165 ((nextents - 1) * sizeof(xfs_extent_t))); 166 - efip = kmem_zalloc(size, KM_SLEEP); 166 + efip = kmem_zalloc(size, 0); 167 167 } else { 168 - efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); 168 + efip = kmem_zone_zalloc(xfs_efi_zone, 0); 169 169 } 170 170 171 171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); ··· 333 333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { 334 334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + 335 335 (nextents - 1) * sizeof(struct xfs_extent), 336 - KM_SLEEP); 336 + 0); 337 337 } else { 338 - efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); 338 + efdp = kmem_zone_zalloc(xfs_efd_zone, 0); 339 339 } 340 340 341 341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+26
fs/xfs/xfs_file.c
··· 28 28 #include <linux/falloc.h> 29 29 #include <linux/backing-dev.h> 30 30 #include <linux/mman.h> 31 + #include <linux/fadvise.h> 31 32 32 33 static const struct vm_operations_struct xfs_file_vm_ops; 33 34 ··· 934 933 return error; 935 934 } 936 935 936 + STATIC int 937 + xfs_file_fadvise( 938 + struct file *file, 939 + loff_t start, 940 + loff_t end, 941 + int advice) 942 + { 943 + struct xfs_inode *ip = XFS_I(file_inode(file)); 944 + int ret; 945 + int lockflags = 0; 946 + 947 + /* 948 + * Operations creating pages in page cache need protection from hole 949 + * punching and similar ops 950 + */ 951 + if (advice == POSIX_FADV_WILLNEED) { 952 + lockflags = XFS_IOLOCK_SHARED; 953 + xfs_ilock(ip, lockflags); 954 + } 955 + ret = generic_fadvise(file, start, end, advice); 956 + if (lockflags) 957 + xfs_iunlock(ip, lockflags); 958 + return ret; 959 + } 937 960 938 961 STATIC loff_t 939 962 xfs_file_remap_range( ··· 1257 1232 .fsync = xfs_file_fsync, 1258 1233 .get_unmapped_area = thp_get_unmapped_area, 1259 1234 .fallocate = xfs_file_fallocate, 1235 + .fadvise = xfs_file_fadvise, 1260 1236 .remap_file_range = xfs_file_remap_range, 1261 1237 }; 1262 1238
+6 -6
fs/xfs/xfs_fsmap.c
··· 250 250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 251 251 if (info->next_daddr < rec_daddr) 252 252 info->next_daddr = rec_daddr; 253 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 253 + return 0; 254 254 } 255 255 256 256 /* Are we just counting mappings? */ ··· 259 259 info->head->fmh_entries++; 260 260 261 261 if (info->last) 262 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 262 + return 0; 263 263 264 264 info->head->fmh_entries++; 265 265 266 266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 267 267 if (info->next_daddr < rec_daddr) 268 268 info->next_daddr = rec_daddr; 269 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 269 + return 0; 270 270 } 271 271 272 272 /* ··· 276 276 */ 277 277 if (rec_daddr > info->next_daddr) { 278 278 if (info->head->fmh_entries >= info->head->fmh_count) 279 - return XFS_BTREE_QUERY_RANGE_ABORT; 279 + return -ECANCELED; 280 280 281 281 fmr.fmr_device = info->dev; 282 282 fmr.fmr_physical = info->next_daddr; ··· 295 295 296 296 /* Fill out the extent we found */ 297 297 if (info->head->fmh_entries >= info->head->fmh_count) 298 - return XFS_BTREE_QUERY_RANGE_ABORT; 298 + return -ECANCELED; 299 299 300 300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); 301 301 ··· 328 328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 329 329 if (info->next_daddr < rec_daddr) 330 330 info->next_daddr = rec_daddr; 331 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 331 + return 0; 332 332 } 333 333 334 334 /* Transform a rmapbt irec into a fsmap */
+1 -1
fs/xfs/xfs_icache.c
··· 40 40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 41 41 * code up to do this anyway. 42 42 */ 43 - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 43 + ip = kmem_zone_alloc(xfs_inode_zone, 0); 44 44 if (!ip) 45 45 return NULL; 46 46 if (inode_init_always(mp->m_super, VFS_I(ip))) {
+1 -1
fs/xfs/xfs_icreate_item.c
··· 89 89 { 90 90 struct xfs_icreate_item *icp; 91 91 92 - icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); 92 + icp = kmem_zone_zalloc(xfs_icreate_zone, 0); 93 93 94 94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, 95 95 &xfs_icreate_item_ops);
+43 -42
fs/xfs/xfs_inode.c
··· 2018 2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 2019 2019 return 0; 2020 2020 2021 - iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); 2021 + iu = kmem_zalloc(sizeof(*iu), KM_NOFS); 2022 2022 iu->iu_agino = prev_agino; 2023 2023 iu->iu_next_unlinked = this_agino; 2024 2024 ··· 3282 3282 spaceres); 3283 3283 3284 3284 /* 3285 - * Set up the target. 3285 + * Check for expected errors before we dirty the transaction 3286 + * so we can return an error without a transaction abort. 3286 3287 */ 3287 3288 if (target_ip == NULL) { 3288 3289 /* ··· 3295 3294 if (error) 3296 3295 goto out_trans_cancel; 3297 3296 } 3297 + } else { 3298 + /* 3299 + * If target exists and it's a directory, check that whether 3300 + * it can be destroyed. 3301 + */ 3302 + if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3303 + (!xfs_dir_isempty(target_ip) || 3304 + (VFS_I(target_ip)->i_nlink > 2))) { 3305 + error = -EEXIST; 3306 + goto out_trans_cancel; 3307 + } 3308 + } 3309 + 3310 + /* 3311 + * Directory entry creation below may acquire the AGF. Remove 3312 + * the whiteout from the unlinked list first to preserve correct 3313 + * AGI/AGF locking order. This dirties the transaction so failures 3314 + * after this point will abort and log recovery will clean up the 3315 + * mess. 3316 + * 3317 + * For whiteouts, we need to bump the link count on the whiteout 3318 + * inode. After this point, we have a real link, clear the tmpfile 3319 + * state flag from the inode so it doesn't accidentally get misused 3320 + * in future. 3321 + */ 3322 + if (wip) { 3323 + ASSERT(VFS_I(wip)->i_nlink == 0); 3324 + error = xfs_iunlink_remove(tp, wip); 3325 + if (error) 3326 + goto out_trans_cancel; 3327 + 3328 + xfs_bumplink(tp, wip); 3329 + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); 3330 + VFS_I(wip)->i_state &= ~I_LINKABLE; 3331 + } 3332 + 3333 + /* 3334 + * Set up the target. 
3335 + */ 3336 + if (target_ip == NULL) { 3298 3337 /* 3299 3338 * If target does not exist and the rename crosses 3300 3339 * directories, adjust the target directory link count ··· 3352 3311 xfs_bumplink(tp, target_dp); 3353 3312 } 3354 3313 } else { /* target_ip != NULL */ 3355 - /* 3356 - * If target exists and it's a directory, check that both 3357 - * target and source are directories and that target can be 3358 - * destroyed, or that neither is a directory. 3359 - */ 3360 - if (S_ISDIR(VFS_I(target_ip)->i_mode)) { 3361 - /* 3362 - * Make sure target dir is empty. 3363 - */ 3364 - if (!(xfs_dir_isempty(target_ip)) || 3365 - (VFS_I(target_ip)->i_nlink > 2)) { 3366 - error = -EEXIST; 3367 - goto out_trans_cancel; 3368 - } 3369 - } 3370 - 3371 3314 /* 3372 3315 * Link the source inode under the target name. 3373 3316 * If the source inode is a directory and we are moving ··· 3441 3416 spaceres); 3442 3417 if (error) 3443 3418 goto out_trans_cancel; 3444 - 3445 - /* 3446 - * For whiteouts, we need to bump the link count on the whiteout inode. 3447 - * This means that failures all the way up to this point leave the inode 3448 - * on the unlinked list and so cleanup is a simple matter of dropping 3449 - * the remaining reference to it. If we fail here after bumping the link 3450 - * count, we're shutting down the filesystem so we'll never see the 3451 - * intermediate state on disk. 3452 - */ 3453 - if (wip) { 3454 - ASSERT(VFS_I(wip)->i_nlink == 0); 3455 - xfs_bumplink(tp, wip); 3456 - error = xfs_iunlink_remove(tp, wip); 3457 - if (error) 3458 - goto out_trans_cancel; 3459 - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); 3460 - 3461 - /* 3462 - * Now we have a real link, clear the "I'm a tmpfile" state 3463 - * flag from the inode so it doesn't accidentally get misused in 3464 - * future. 
3465 - */ 3466 - VFS_I(wip)->i_state &= ~I_LINKABLE; 3467 - } 3468 3419 3469 3420 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3470 3421 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+1 -1
fs/xfs/xfs_inode_item.c
··· 651 651 struct xfs_inode_log_item *iip; 652 652 653 653 ASSERT(ip->i_itemp == NULL); 654 - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 654 + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); 655 655 656 656 iip->ili_inode = ip; 657 657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
+14 -11
fs/xfs/xfs_ioctl.c
··· 396 396 if (IS_ERR(dentry)) 397 397 return PTR_ERR(dentry); 398 398 399 - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 399 + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); 400 400 if (!kbuf) 401 401 goto out_dput; 402 402 ··· 434 434 435 435 if (*len > XFS_XATTR_SIZE_MAX) 436 436 return -EINVAL; 437 - kbuf = kmem_zalloc_large(*len, KM_SLEEP); 437 + kbuf = kmem_zalloc_large(*len, 0); 438 438 if (!kbuf) 439 439 return -ENOMEM; 440 440 441 - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 441 + error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); 442 442 if (error) 443 443 goto out_kfree; 444 444 ··· 831 831 /* 832 832 * Check the incoming bulk request @hdr from userspace and initialize the 833 833 * internal @breq bulk request appropriately. Returns 0 if the bulk request 834 - * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual 834 + * should proceed; -ECANCELED if there's nothing to do; or the usual 835 835 * negative error code. 836 836 */ 837 837 static int ··· 889 889 890 890 /* Asking for an inode past the end of the AG? We're done! */ 891 891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) 892 - return XFS_ITER_ABORT; 892 + return -ECANCELED; 893 893 } else if (hdr->agno) 894 894 return -EINVAL; 895 895 896 896 /* Asking for an inode past the end of the FS? We're done! 
*/ 897 897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) 898 - return XFS_ITER_ABORT; 898 + return -ECANCELED; 899 899 900 900 return 0; 901 901 } ··· 936 936 return -EFAULT; 937 937 938 938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); 939 - if (error == XFS_ITER_ABORT) 939 + if (error == -ECANCELED) 940 940 goto out_teardown; 941 941 if (error < 0) 942 942 return error; ··· 986 986 return -EFAULT; 987 987 988 988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); 989 - if (error == XFS_ITER_ABORT) 989 + if (error == -ECANCELED) 990 990 goto out_teardown; 991 991 if (error < 0) 992 992 return error; ··· 1038 1038 1039 1039 if (copy_from_user(&ageo, arg, sizeof(ageo))) 1040 1040 return -EFAULT; 1041 + if (ageo.ag_flags) 1042 + return -EINVAL; 1043 + if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) 1044 + return -EINVAL; 1041 1045 1042 1046 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); 1043 1047 if (error) ··· 1313 1309 if (fa->fsx_xflags & FS_XFLAG_DAX) { 1314 1310 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 1315 1311 return -EINVAL; 1316 - if (S_ISREG(inode->i_mode) && 1317 - !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), 1312 + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), 1318 1313 sb->s_blocksize)) 1319 1314 return -EINVAL; 1320 1315 } ··· 1884 1881 info.mp = ip->i_mount; 1885 1882 info.data = arg; 1886 1883 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); 1887 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 1884 + if (error == -ECANCELED) { 1888 1885 error = 0; 1889 1886 aborted = true; 1890 1887 } else if (error)
+1 -1
fs/xfs/xfs_ioctl32.c
··· 381 381 return PTR_ERR(dentry); 382 382 383 383 error = -ENOMEM; 384 - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 384 + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); 385 385 if (!kbuf) 386 386 goto out_dput; 387 387
+3 -3
fs/xfs/xfs_iomap.c
··· 58 58 { 59 59 struct xfs_mount *mp = ip->i_mount; 60 60 61 - if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) 61 + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) 62 62 return xfs_alert_fsblock_zero(ip, imap); 63 63 64 64 if (imap->br_startblock == HOLESTARTBLOCK) { ··· 297 297 goto out_unlock; 298 298 } 299 299 300 - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 300 + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) 301 301 error = xfs_alert_fsblock_zero(ip, imap); 302 302 303 303 out_unlock: ··· 814 814 if (error) 815 815 return error; 816 816 817 - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 817 + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) 818 818 return xfs_alert_fsblock_zero(ip, &imap); 819 819 820 820 if ((numblks_fsb = imap.br_blockcount) == 0) {
+5 -5
fs/xfs/xfs_itable.c
··· 137 137 xfs_irele(ip); 138 138 139 139 error = bc->formatter(bc->breq, buf); 140 - if (error == XFS_IBULK_ABORT) 140 + if (error == -ECANCELED) 141 141 goto out_advance; 142 142 if (error) 143 143 goto out; ··· 169 169 ASSERT(breq->icount == 1); 170 170 171 171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 172 - KM_SLEEP | KM_MAYFAIL); 172 + KM_MAYFAIL); 173 173 if (!bc.buf) 174 174 return -ENOMEM; 175 175 ··· 181 181 * If we reported one inode to userspace then we abort because we hit 182 182 * the end of the buffer. Don't leak that back to userspace. 183 183 */ 184 - if (error == XFS_IWALK_ABORT) 184 + if (error == -ECANCELED) 185 185 error = 0; 186 186 187 187 return error; ··· 243 243 return 0; 244 244 245 245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 246 - KM_SLEEP | KM_MAYFAIL); 246 + KM_MAYFAIL); 247 247 if (!bc.buf) 248 248 return -ENOMEM; 249 249 ··· 342 342 int error; 343 343 344 344 error = ic->formatter(ic->breq, &inogrp); 345 - if (error && error != XFS_IBULK_ABORT) 345 + if (error && error != -ECANCELED) 346 346 return error; 347 347 348 348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
+9 -4
fs/xfs/xfs_itable.h
··· 18 18 /* Only iterate within the same AG as startino */ 19 19 #define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) 20 20 21 - /* Return value that means we want to abort the walk. */ 22 - #define XFS_IBULK_ABORT (XFS_IWALK_ABORT) 23 - 24 21 /* 25 22 * Advance the user buffer pointer by one record of the given size. If the 26 23 * buffer is now full, return the appropriate error code. ··· 31 34 32 35 breq->ubuffer = b + bytes; 33 36 breq->ocount++; 34 - return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; 37 + return breq->ocount == breq->icount ? -ECANCELED : 0; 35 38 } 36 39 37 40 /* 38 41 * Return stat information in bulk (by-inode) for the filesystem. 42 + */ 43 + 44 + /* 45 + * Return codes for the formatter function are 0 to continue iterating, and 46 + * non-zero to stop iterating. Any non-zero value will be passed up to the 47 + * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop 48 + * iteration, as neither bulkstat nor inumbers will ever generate that error 49 + * code on their own. 39 50 */ 40 51 41 52 typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
+2 -2
fs/xfs/xfs_iwalk.c
··· 31 31 * inode it finds, it calls a walk function with the relevant inode number and 32 32 * a pointer to caller-provided data. The walk function can return the usual 33 33 * negative error code to stop the iteration; 0 to continue the iteration; or 34 - * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the 34 + * -ECANCELED to stop the iteration. This return value is returned to the 35 35 * caller. 36 36 * 37 37 * Internally, we allow the walk function to do anything, which means that we ··· 616 616 if (xfs_pwork_ctl_want_abort(&pctl)) 617 617 break; 618 618 619 - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 619 + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); 620 620 iwag->mp = mp; 621 621 iwag->iwalk_fn = iwalk_fn; 622 622 iwag->data = data;
+8 -5
fs/xfs/xfs_iwalk.h
··· 6 6 #ifndef __XFS_IWALK_H__ 7 7 #define __XFS_IWALK_H__ 8 8 9 + /* 10 + * Return codes for the inode/inobt walk function are 0 to continue iterating, 11 + * and non-zero to stop iterating. Any non-zero value will be passed up to the 12 + * iwalk or inobt_walk caller. The special value -ECANCELED can be used to 13 + * stop iteration, as neither iwalk nor inobt_walk will ever generate that 14 + * error code on their own. 15 + */ 16 + 9 17 /* Walk all inodes in the filesystem starting from @startino. */ 10 18 typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, 11 19 xfs_ino_t ino, void *data); 12 - /* Return values for xfs_iwalk_fn. */ 13 - #define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) 14 - #define XFS_IWALK_ABORT (XFS_ITER_ABORT) 15 20 16 21 int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 17 22 unsigned int flags, xfs_iwalk_fn iwalk_fn, ··· 35 30 xfs_agnumber_t agno, 36 31 const struct xfs_inobt_rec_incore *irec, 37 32 void *data); 38 - /* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */ 39 - #define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT) 40 33 41 34 int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, 42 35 xfs_ino_t startino, unsigned int flags,
+276 -190
fs/xfs/xfs_log.c
··· 214 214 { 215 215 struct xlog_ticket *tic; 216 216 int need_bytes; 217 + bool woken_task = false; 217 218 218 219 list_for_each_entry(tic, &head->waiters, t_queue) { 220 + 221 + /* 222 + * There is a chance that the size of the CIL checkpoints in 223 + * progress at the last AIL push target calculation resulted in 224 + * limiting the target to the log head (l_last_sync_lsn) at the 225 + * time. This may not reflect where the log head is now as the 226 + * CIL checkpoints may have completed. 227 + * 228 + * Hence when we are woken here, it may be that the head of the 229 + * log that has moved rather than the tail. As the tail didn't 230 + * move, there still won't be space available for the 231 + * reservation we require. However, if the AIL has already 232 + * pushed to the target defined by the old log head location, we 233 + * will hang here waiting for something else to update the AIL 234 + * push target. 235 + * 236 + * Therefore, if there isn't space to wake the first waiter on 237 + * the grant head, we need to push the AIL again to ensure the 238 + * target reflects both the current log tail and log head 239 + * position before we wait for the tail to move again. 240 + */ 241 + 219 242 need_bytes = xlog_ticket_reservation(log, head, tic); 220 - if (*free_bytes < need_bytes) 243 + if (*free_bytes < need_bytes) { 244 + if (!woken_task) 245 + xlog_grant_push_ail(log, need_bytes); 221 246 return false; 247 + } 222 248 223 249 *free_bytes -= need_bytes; 224 250 trace_xfs_log_grant_wake_up(log, tic); 225 251 wake_up_process(tic->t_task); 252 + woken_task = true; 226 253 } 227 254 228 255 return true; ··· 455 428 XFS_STATS_INC(mp, xs_try_logspace); 456 429 457 430 ASSERT(*ticp == NULL); 458 - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 459 - KM_SLEEP); 431 + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); 460 432 *ticp = tic; 461 433 462 434 xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt ··· 1430 1404 */ 1431 1405 ASSERT(log->l_iclog_size >= 4096); 1432 1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1407 + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); 1433 1408 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1434 1409 sizeof(struct bio_vec); 1435 1410 ··· 1442 1415 iclog->ic_prev = prev_iclog; 1443 1416 prev_iclog = iclog; 1444 1417 1445 - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1446 - KM_MAYFAIL); 1418 + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, 1419 + KM_MAYFAIL); 1447 1420 if (!iclog->ic_data) 1448 1421 goto out_free_iclog; 1449 1422 #ifdef DEBUG ··· 2523 2496 ***************************************************************************** 2524 2497 */ 2525 2498 2526 - /* Clean iclogs starting from the head. This ordering must be 2527 - * maintained, so an iclog doesn't become ACTIVE beyond one that 2528 - * is SYNCING. This is also required to maintain the notion that we use 2529 - * a ordered wait queue to hold off would be writers to the log when every 2530 - * iclog is trying to sync to disk. 2499 + /* 2500 + * An iclog has just finished IO completion processing, so we need to update 2501 + * the iclog state and propagate that up into the overall log state. Hence we 2502 + * prepare the iclog for cleaning, and then clean all the pending dirty iclogs 2503 + * starting from the head, and then wake up any threads that are waiting for the 2504 + * iclog to be marked clean. 2531 2505 * 2532 - * State Change: DIRTY -> ACTIVE 2506 + * The ordering of marking iclogs ACTIVE must be maintained, so an iclog 2507 + * doesn't become ACTIVE beyond one that is SYNCING. This is also required to 2508 + * maintain the notion that we use a ordered wait queue to hold off would be 2509 + * writers to the log when every iclog is trying to sync to disk. 2510 + * 2511 + * Caller must hold the icloglock before calling us. 
2512 + * 2513 + * State Change: !IOERROR -> DIRTY -> ACTIVE 2533 2514 */ 2534 2515 STATIC void 2535 - xlog_state_clean_log( 2536 - struct xlog *log) 2516 + xlog_state_clean_iclog( 2517 + struct xlog *log, 2518 + struct xlog_in_core *dirty_iclog) 2537 2519 { 2538 - xlog_in_core_t *iclog; 2539 - int changed = 0; 2520 + struct xlog_in_core *iclog; 2521 + int changed = 0; 2540 2522 2523 + /* Prepare the completed iclog. */ 2524 + if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR)) 2525 + dirty_iclog->ic_state = XLOG_STATE_DIRTY; 2526 + 2527 + /* Walk all the iclogs to update the ordered active state. */ 2541 2528 iclog = log->l_iclog; 2542 2529 do { 2543 2530 if (iclog->ic_state == XLOG_STATE_DIRTY) { ··· 2589 2548 iclog = iclog->ic_next; 2590 2549 } while (iclog != log->l_iclog); 2591 2550 2592 - /* log is locked when we are called */ 2551 + 2552 + /* 2553 + * Wake up threads waiting in xfs_log_force() for the dirty iclog 2554 + * to be cleaned. 2555 + */ 2556 + wake_up_all(&dirty_iclog->ic_force_wait); 2557 + 2593 2558 /* 2594 2559 * Change state for the dummy log recording. 2595 2560 * We usually go to NEED. But we go to NEED2 if the changed indicates ··· 2629 2582 ASSERT(0); 2630 2583 } 2631 2584 } 2632 - } /* xlog_state_clean_log */ 2585 + } 2633 2586 2634 2587 STATIC xfs_lsn_t 2635 2588 xlog_get_lowest_lsn( ··· 2650 2603 return lowest_lsn; 2651 2604 } 2652 2605 2606 + /* 2607 + * Completion of a iclog IO does not imply that a transaction has completed, as 2608 + * transactions can be large enough to span many iclogs. We cannot change the 2609 + * tail of the log half way through a transaction as this may be the only 2610 + * transaction in the log and moving the tail to point to the middle of it 2611 + * will prevent recovery from finding the start of the transaction. Hence we 2612 + * should only update the last_sync_lsn if this iclog contains transaction 2613 + * completion callbacks on it. 
2614 + * 2615 + * We have to do this before we drop the icloglock to ensure we are the only one 2616 + * that can update it. 2617 + * 2618 + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick 2619 + * the reservation grant head pushing. This is due to the fact that the push 2620 + * target is bound by the current last_sync_lsn value. Hence if we have a large 2621 + * amount of log space bound up in this committing transaction then the 2622 + * last_sync_lsn value may be the limiting factor preventing tail pushing from 2623 + * freeing space in the log. Hence once we've updated the last_sync_lsn we 2624 + * should push the AIL to ensure the push target (and hence the grant head) is 2625 + * no longer bound by the old log head location and can move forwards and make 2626 + * progress again. 2627 + */ 2628 + static void 2629 + xlog_state_set_callback( 2630 + struct xlog *log, 2631 + struct xlog_in_core *iclog, 2632 + xfs_lsn_t header_lsn) 2633 + { 2634 + iclog->ic_state = XLOG_STATE_CALLBACK; 2635 + 2636 + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2637 + header_lsn) <= 0); 2638 + 2639 + if (list_empty_careful(&iclog->ic_callbacks)) 2640 + return; 2641 + 2642 + atomic64_set(&log->l_last_sync_lsn, header_lsn); 2643 + xlog_grant_push_ail(log, 0); 2644 + } 2645 + 2646 + /* 2647 + * Return true if we need to stop processing, false to continue to the next 2648 + * iclog. The caller will need to run callbacks if the iclog is returned in the 2649 + * XLOG_STATE_CALLBACK state. 
2650 + */ 2651 + static bool 2652 + xlog_state_iodone_process_iclog( 2653 + struct xlog *log, 2654 + struct xlog_in_core *iclog, 2655 + struct xlog_in_core *completed_iclog, 2656 + bool *ioerror) 2657 + { 2658 + xfs_lsn_t lowest_lsn; 2659 + xfs_lsn_t header_lsn; 2660 + 2661 + /* Skip all iclogs in the ACTIVE & DIRTY states */ 2662 + if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) 2663 + return false; 2664 + 2665 + /* 2666 + * Between marking a filesystem SHUTDOWN and stopping the log, we do 2667 + * flush all iclogs to disk (if there wasn't a log I/O error). So, we do 2668 + * want things to go smoothly in case of just a SHUTDOWN w/o a 2669 + * LOG_IO_ERROR. 2670 + */ 2671 + if (iclog->ic_state & XLOG_STATE_IOERROR) { 2672 + *ioerror = true; 2673 + return false; 2674 + } 2675 + 2676 + /* 2677 + * Can only perform callbacks in order. Since this iclog is not in the 2678 + * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean 2679 + * up. If we set our iclog to DO_CALLBACK, we will not process it when 2680 + * we retry since a previous iclog is in the CALLBACK and the state 2681 + * cannot change since we are holding the l_icloglock. 2682 + */ 2683 + if (!(iclog->ic_state & 2684 + (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { 2685 + if (completed_iclog && 2686 + (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) { 2687 + completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK; 2688 + } 2689 + return true; 2690 + } 2691 + 2692 + /* 2693 + * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC 2694 + * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught 2695 + * by the above if and are going to clean (i.e. we aren't doing their 2696 + * callbacks) see the above if. 2697 + * 2698 + * We will do one more check here to see if we have chased our tail 2699 + * around. If this is not the lowest lsn iclog, then we will leave it 2700 + * for another completion to process. 
2701 + */ 2702 + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2703 + lowest_lsn = xlog_get_lowest_lsn(log); 2704 + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) 2705 + return false; 2706 + 2707 + xlog_state_set_callback(log, iclog, header_lsn); 2708 + return false; 2709 + 2710 + } 2711 + 2712 + /* 2713 + * Keep processing entries in the iclog callback list until we come around and 2714 + * it is empty. We need to atomically see that the list is empty and change the 2715 + * state to DIRTY so that we don't miss any more callbacks being added. 2716 + * 2717 + * This function is called with the icloglock held and returns with it held. We 2718 + * drop it while running callbacks, however, as holding it over thousands of 2719 + * callbacks is unnecessary and causes excessive contention if we do. 2720 + */ 2721 + static void 2722 + xlog_state_do_iclog_callbacks( 2723 + struct xlog *log, 2724 + struct xlog_in_core *iclog, 2725 + bool aborted) 2726 + { 2727 + spin_unlock(&log->l_icloglock); 2728 + spin_lock(&iclog->ic_callback_lock); 2729 + while (!list_empty(&iclog->ic_callbacks)) { 2730 + LIST_HEAD(tmp); 2731 + 2732 + list_splice_init(&iclog->ic_callbacks, &tmp); 2733 + 2734 + spin_unlock(&iclog->ic_callback_lock); 2735 + xlog_cil_process_committed(&tmp, aborted); 2736 + spin_lock(&iclog->ic_callback_lock); 2737 + } 2738 + 2739 + /* 2740 + * Pick up the icloglock while still holding the callback lock so we 2741 + * serialise against anyone trying to add more callbacks to this iclog 2742 + * now we've finished processing. 2743 + */ 2744 + spin_lock(&log->l_icloglock); 2745 + spin_unlock(&iclog->ic_callback_lock); 2746 + } 2747 + 2748 + #ifdef DEBUG 2749 + /* 2750 + * Make one last gasp attempt to see if iclogs are being left in limbo. 
If the 2751 + * above loop finds an iclog earlier than the current iclog and in one of the 2752 + * syncing states, the current iclog is put into DO_CALLBACK and the callbacks 2753 + * are deferred to the completion of the earlier iclog. Walk the iclogs in order 2754 + * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in 2755 + * one of the syncing states. 2756 + * 2757 + * Note that SYNCING|IOERROR is a valid state so we cannot just check for 2758 + * ic_state == SYNCING. 2759 + */ 2760 + static void 2761 + xlog_state_callback_check_state( 2762 + struct xlog *log) 2763 + { 2764 + struct xlog_in_core *first_iclog = log->l_iclog; 2765 + struct xlog_in_core *iclog = first_iclog; 2766 + 2767 + do { 2768 + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2769 + /* 2770 + * Terminate the loop if iclogs are found in states 2771 + * which will cause other threads to clean up iclogs. 2772 + * 2773 + * SYNCING - i/o completion will go through logs 2774 + * DONE_SYNC - interrupt thread should be waiting for 2775 + * l_icloglock 2776 + * IOERROR - give up hope all ye who enter here 2777 + */ 2778 + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2779 + iclog->ic_state & XLOG_STATE_SYNCING || 2780 + iclog->ic_state == XLOG_STATE_DONE_SYNC || 2781 + iclog->ic_state == XLOG_STATE_IOERROR ) 2782 + break; 2783 + iclog = iclog->ic_next; 2784 + } while (first_iclog != iclog); 2785 + } 2786 + #else 2787 + #define xlog_state_callback_check_state(l) ((void)0) 2788 + #endif 2789 + 2653 2790 STATIC void 2654 2791 xlog_state_do_callback( 2655 2792 struct xlog *log, 2656 2793 bool aborted, 2657 2794 struct xlog_in_core *ciclog) 2658 2795 { 2659 - xlog_in_core_t *iclog; 2660 - xlog_in_core_t *first_iclog; /* used to know when we've 2661 - * processed all iclogs once */ 2662 - int flushcnt = 0; 2663 - xfs_lsn_t lowest_lsn; 2664 - int ioerrors; /* counter: iclogs with errors */ 2665 - int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2666 - int 
funcdidcallbacks; /* flag: function did callbacks */ 2667 - int repeats; /* for issuing console warnings if 2668 - * looping too many times */ 2669 - int wake = 0; 2796 + struct xlog_in_core *iclog; 2797 + struct xlog_in_core *first_iclog; 2798 + bool did_callbacks = false; 2799 + bool cycled_icloglock; 2800 + bool ioerror; 2801 + int flushcnt = 0; 2802 + int repeats = 0; 2670 2803 2671 2804 spin_lock(&log->l_icloglock); 2672 - first_iclog = iclog = log->l_iclog; 2673 - ioerrors = 0; 2674 - funcdidcallbacks = 0; 2675 - repeats = 0; 2676 - 2677 2805 do { 2678 2806 /* 2679 2807 * Scan all iclogs starting with the one pointed to by the ··· 2860 2638 */ 2861 2639 first_iclog = log->l_iclog; 2862 2640 iclog = log->l_iclog; 2863 - loopdidcallbacks = 0; 2641 + cycled_icloglock = false; 2642 + ioerror = false; 2864 2643 repeats++; 2865 2644 2866 2645 do { 2646 + if (xlog_state_iodone_process_iclog(log, iclog, 2647 + ciclog, &ioerror)) 2648 + break; 2867 2649 2868 - /* skip all iclogs in the ACTIVE & DIRTY states */ 2869 - if (iclog->ic_state & 2870 - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { 2650 + if (!(iclog->ic_state & 2651 + (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) { 2871 2652 iclog = iclog->ic_next; 2872 2653 continue; 2873 2654 } 2874 2655 2875 2656 /* 2876 - * Between marking a filesystem SHUTDOWN and stopping 2877 - * the log, we do flush all iclogs to disk (if there 2878 - * wasn't a log I/O error). So, we do want things to 2879 - * go smoothly in case of just a SHUTDOWN w/o a 2880 - * LOG_IO_ERROR. 2657 + * Running callbacks will drop the icloglock which means 2658 + * we'll have to run at least one more complete loop. 2881 2659 */ 2882 - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { 2883 - /* 2884 - * Can only perform callbacks in order. Since 2885 - * this iclog is not in the DONE_SYNC/ 2886 - * DO_CALLBACK state, we skip the rest and 2887 - * just try to clean up. 
If we set our iclog 2888 - * to DO_CALLBACK, we will not process it when 2889 - * we retry since a previous iclog is in the 2890 - * CALLBACK and the state cannot change since 2891 - * we are holding the l_icloglock. 2892 - */ 2893 - if (!(iclog->ic_state & 2894 - (XLOG_STATE_DONE_SYNC | 2895 - XLOG_STATE_DO_CALLBACK))) { 2896 - if (ciclog && (ciclog->ic_state == 2897 - XLOG_STATE_DONE_SYNC)) { 2898 - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; 2899 - } 2900 - break; 2901 - } 2902 - /* 2903 - * We now have an iclog that is in either the 2904 - * DO_CALLBACK or DONE_SYNC states. The other 2905 - * states (WANT_SYNC, SYNCING, or CALLBACK were 2906 - * caught by the above if and are going to 2907 - * clean (i.e. we aren't doing their callbacks) 2908 - * see the above if. 2909 - */ 2660 + cycled_icloglock = true; 2661 + xlog_state_do_iclog_callbacks(log, iclog, aborted); 2910 2662 2911 - /* 2912 - * We will do one more check here to see if we 2913 - * have chased our tail around. 2914 - */ 2915 - 2916 - lowest_lsn = xlog_get_lowest_lsn(log); 2917 - if (lowest_lsn && 2918 - XFS_LSN_CMP(lowest_lsn, 2919 - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2920 - iclog = iclog->ic_next; 2921 - continue; /* Leave this iclog for 2922 - * another thread */ 2923 - } 2924 - 2925 - iclog->ic_state = XLOG_STATE_CALLBACK; 2926 - 2927 - 2928 - /* 2929 - * Completion of a iclog IO does not imply that 2930 - * a transaction has completed, as transactions 2931 - * can be large enough to span many iclogs. We 2932 - * cannot change the tail of the log half way 2933 - * through a transaction as this may be the only 2934 - * transaction in the log and moving th etail to 2935 - * point to the middle of it will prevent 2936 - * recovery from finding the start of the 2937 - * transaction. Hence we should only update the 2938 - * last_sync_lsn if this iclog contains 2939 - * transaction completion callbacks on it. 
2940 - * 2941 - * We have to do this before we drop the 2942 - * icloglock to ensure we are the only one that 2943 - * can update it. 2944 - */ 2945 - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2946 - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2947 - if (!list_empty_careful(&iclog->ic_callbacks)) 2948 - atomic64_set(&log->l_last_sync_lsn, 2949 - be64_to_cpu(iclog->ic_header.h_lsn)); 2950 - 2951 - } else 2952 - ioerrors++; 2953 - 2954 - spin_unlock(&log->l_icloglock); 2955 - 2956 - /* 2957 - * Keep processing entries in the callback list until 2958 - * we come around and it is empty. We need to 2959 - * atomically see that the list is empty and change the 2960 - * state to DIRTY so that we don't miss any more 2961 - * callbacks being added. 2962 - */ 2963 - spin_lock(&iclog->ic_callback_lock); 2964 - while (!list_empty(&iclog->ic_callbacks)) { 2965 - LIST_HEAD(tmp); 2966 - 2967 - list_splice_init(&iclog->ic_callbacks, &tmp); 2968 - 2969 - spin_unlock(&iclog->ic_callback_lock); 2970 - xlog_cil_process_committed(&tmp, aborted); 2971 - spin_lock(&iclog->ic_callback_lock); 2972 - } 2973 - 2974 - loopdidcallbacks++; 2975 - funcdidcallbacks++; 2976 - 2977 - spin_lock(&log->l_icloglock); 2978 - spin_unlock(&iclog->ic_callback_lock); 2979 - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) 2980 - iclog->ic_state = XLOG_STATE_DIRTY; 2981 - 2982 - /* 2983 - * Transition from DIRTY to ACTIVE if applicable. 2984 - * NOP if STATE_IOERROR. 
2985 - */ 2986 - xlog_state_clean_log(log); 2987 - 2988 - /* wake up threads waiting in xfs_log_force() */ 2989 - wake_up_all(&iclog->ic_force_wait); 2990 - 2663 + xlog_state_clean_iclog(log, iclog); 2991 2664 iclog = iclog->ic_next; 2992 2665 } while (first_iclog != iclog); 2666 + 2667 + did_callbacks |= cycled_icloglock; 2993 2668 2994 2669 if (repeats > 5000) { 2995 2670 flushcnt += repeats; ··· 2895 2776 "%s: possible infinite loop (%d iterations)", 2896 2777 __func__, flushcnt); 2897 2778 } 2898 - } while (!ioerrors && loopdidcallbacks); 2779 + } while (!ioerror && cycled_icloglock); 2899 2780 2900 - #ifdef DEBUG 2901 - /* 2902 - * Make one last gasp attempt to see if iclogs are being left in limbo. 2903 - * If the above loop finds an iclog earlier than the current iclog and 2904 - * in one of the syncing states, the current iclog is put into 2905 - * DO_CALLBACK and the callbacks are deferred to the completion of the 2906 - * earlier iclog. Walk the iclogs in order and make sure that no iclog 2907 - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing 2908 - * states. 2909 - * 2910 - * Note that SYNCING|IOABORT is a valid state so we cannot just check 2911 - * for ic_state == SYNCING. 2912 - */ 2913 - if (funcdidcallbacks) { 2914 - first_iclog = iclog = log->l_iclog; 2915 - do { 2916 - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2917 - /* 2918 - * Terminate the loop if iclogs are found in states 2919 - * which will cause other threads to clean up iclogs. 
2920 - * 2921 - * SYNCING - i/o completion will go through logs 2922 - * DONE_SYNC - interrupt thread should be waiting for 2923 - * l_icloglock 2924 - * IOERROR - give up hope all ye who enter here 2925 - */ 2926 - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2927 - iclog->ic_state & XLOG_STATE_SYNCING || 2928 - iclog->ic_state == XLOG_STATE_DONE_SYNC || 2929 - iclog->ic_state == XLOG_STATE_IOERROR ) 2930 - break; 2931 - iclog = iclog->ic_next; 2932 - } while (first_iclog != iclog); 2933 - } 2934 - #endif 2781 + if (did_callbacks) 2782 + xlog_state_callback_check_state(log); 2935 2783 2936 2784 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2937 - wake = 1; 2938 - spin_unlock(&log->l_icloglock); 2939 - 2940 - if (wake) 2941 2785 wake_up_all(&log->l_flush_wait); 2786 + 2787 + spin_unlock(&log->l_icloglock); 2942 2788 } 2943 2789 2944 2790 ··· 4003 3919 * item committed callback functions will do this again under lock to 4004 3920 * avoid races. 4005 3921 */ 3922 + spin_lock(&log->l_cilp->xc_push_lock); 4006 3923 wake_up_all(&log->l_cilp->xc_commit_wait); 3924 + spin_unlock(&log->l_cilp->xc_push_lock); 4007 3925 xlog_state_do_callback(log, true, NULL); 4008 3926 4009 3927 #ifdef XFSERRORDEBUG
+5 -5
fs/xfs/xfs_log_cil.c
··· 38 38 struct xlog_ticket *tic; 39 39 40 40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, 41 - KM_SLEEP|KM_NOFS); 41 + KM_NOFS); 42 42 43 43 /* 44 44 * set the current reservation to zero so we know to steal the basic ··· 186 186 */ 187 187 kmem_free(lip->li_lv_shadow); 188 188 189 - lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); 189 + lv = kmem_alloc_large(buf_size, KM_NOFS); 190 190 memset(lv, 0, xlog_cil_iovec_space(niovecs)); 191 191 192 192 lv->lv_item = lip; ··· 660 660 if (!cil) 661 661 return 0; 662 662 663 - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 663 + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); 664 664 new_ctx->ticket = xlog_cil_ticket_alloc(log); 665 665 666 666 down_write(&cil->xc_ctx_lock); ··· 1179 1179 struct xfs_cil *cil; 1180 1180 struct xfs_cil_ctx *ctx; 1181 1181 1182 - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 1182 + cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); 1183 1183 if (!cil) 1184 1184 return -ENOMEM; 1185 1185 1186 - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 1186 + ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); 1187 1187 if (!ctx) { 1188 1188 kmem_free(cil); 1189 1189 return -ENOMEM;
+32 -18
fs/xfs/xfs_log_recover.c
··· 97 97 struct xlog *log, 98 98 int nbblks) 99 99 { 100 + int align_mask = xfs_buftarg_dma_alignment(log->l_targ); 101 + 100 102 /* 101 103 * Pass log block 0 since we don't have an addr yet, buffer will be 102 104 * verified on read. ··· 127 125 if (nbblks > 1 && log->l_sectBBsize > 1) 128 126 nbblks += log->l_sectBBsize; 129 127 nbblks = round_up(nbblks, log->l_sectBBsize); 130 - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); 128 + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); 131 129 } 132 130 133 131 /* ··· 1962 1960 } 1963 1961 } 1964 1962 1965 - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1963 + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); 1966 1964 bcp->bc_blkno = buf_f->blf_blkno; 1967 1965 bcp->bc_len = buf_f->blf_len; 1968 1966 bcp->bc_refcount = 1; ··· 2932 2930 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 2933 2931 in_f = item->ri_buf[0].i_addr; 2934 2932 } else { 2935 - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); 2933 + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); 2936 2934 need_free = 1; 2937 2935 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2938 2936 if (error) ··· 4163 4161 { 4164 4162 xlog_recover_item_t *item; 4165 4163 4166 - item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 4164 + item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); 4167 4165 INIT_LIST_HEAD(&item->ri_list); 4168 4166 list_add_tail(&item->ri_list, head); 4169 4167 } ··· 4203 4201 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 4204 4202 old_len = item->ri_buf[item->ri_cnt-1].i_len; 4205 4203 4206 - ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 4204 + ptr = kmem_realloc(old_ptr, len + old_len, 0); 4207 4205 memcpy(&ptr[old_len], dp, len); 4208 4206 item->ri_buf[item->ri_cnt-1].i_len += len; 4209 4207 item->ri_buf[item->ri_cnt-1].i_addr = ptr; ··· 4263 4261 return 0; 4264 4262 } 4265 4263 4266 - ptr = kmem_alloc(len, KM_SLEEP); 4264 + ptr = 
kmem_alloc(len, 0); 4267 4265 memcpy(ptr, dp, len); 4268 4266 in_f = (struct xfs_inode_log_format *)ptr; 4269 4267 ··· 4291 4289 item->ri_total = in_f->ilf_size; 4292 4290 item->ri_buf = 4293 4291 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 4294 - KM_SLEEP); 4292 + 0); 4295 4293 } 4296 4294 ASSERT(item->ri_total > item->ri_cnt); 4297 4295 /* Description region is ri_buf[0] */ ··· 4425 4423 * This is a new transaction so allocate a new recovery container to 4426 4424 * hold the recovery ops that will follow. 4427 4425 */ 4428 - trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4426 + trans = kmem_zalloc(sizeof(struct xlog_recover), 0); 4429 4427 trans->r_log_tid = tid; 4430 4428 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4431 4429 INIT_LIST_HEAD(&trans->r_itemq); ··· 5024 5022 } 5025 5023 5026 5024 /* 5027 - * xlog_iunlink_recover 5025 + * Recover AGI unlinked lists 5028 5026 * 5029 - * This is called during recovery to process any inodes which 5030 - * we unlinked but not freed when the system crashed. These 5031 - * inodes will be on the lists in the AGI blocks. What we do 5032 - * here is scan all the AGIs and fully truncate and free any 5033 - * inodes found on the lists. Each inode is removed from the 5034 - * lists when it has been fully truncated and is freed. The 5035 - * freeing of the inode and its removal from the list must be 5036 - * atomic. 5027 + * This is called during recovery to process any inodes which we unlinked but 5028 + * not freed when the system crashed. These inodes will be on the lists in the 5029 + * AGI blocks. What we do here is scan all the AGIs and fully truncate and free 5030 + * any inodes found on the lists. Each inode is removed from the lists when it 5031 + * has been fully truncated and is freed. The freeing of the inode and its 5032 + * removal from the list must be atomic. 
5033 + * 5034 + * If everything we touch in the agi processing loop is already in memory, this 5035 + * loop can hold the cpu for a long time. It runs without lock contention, 5036 + * memory allocation contention, the need wait for IO, etc, and so will run 5037 + * until we either run out of inodes to process, run low on memory or we run out 5038 + * of log space. 5039 + * 5040 + * This behaviour is bad for latency on single CPU and non-preemptible kernels, 5041 + * and can prevent other filesytem work (such as CIL pushes) from running. This 5042 + * can lead to deadlocks if the recovery process runs out of log reservation 5043 + * space. Hence we need to yield the CPU when there is other kernel work 5044 + * scheduled on this CPU to ensure other scheduled work can run without undue 5045 + * latency. 5037 5046 */ 5038 5047 STATIC void 5039 5048 xlog_recover_process_iunlinks( ··· 5091 5078 while (agino != NULLAGINO) { 5092 5079 agino = xlog_recover_process_one_iunlink(mp, 5093 5080 agno, agino, bucket); 5081 + cond_resched(); 5094 5082 } 5095 5083 } 5096 5084 xfs_buf_rele(agibp); ··· 5541 5527 */ 5542 5528 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 5543 5529 sizeof(struct list_head), 5544 - KM_SLEEP); 5530 + 0); 5545 5531 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 5546 5532 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 5547 5533
+2 -2
fs/xfs/xfs_mount.c
··· 82 82 if (hole < 0) { 83 83 xfs_uuid_table = kmem_realloc(xfs_uuid_table, 84 84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 85 - KM_SLEEP); 85 + 0); 86 86 hole = xfs_uuid_table_size++; 87 87 } 88 88 xfs_uuid_table[hole] = *uuid; ··· 214 214 215 215 spin_lock(&mp->m_perag_lock); 216 216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 217 - BUG(); 217 + WARN_ON_ONCE(1); 218 218 spin_unlock(&mp->m_perag_lock); 219 219 radix_tree_preload_end(); 220 220 error = -EEXIST;
-7
fs/xfs/xfs_mount.h
··· 327 327 } 328 328 329 329 /* per-AG block reservation data structures*/ 330 - enum xfs_ag_resv_type { 331 - XFS_AG_RESV_NONE = 0, 332 - XFS_AG_RESV_AGFL, 333 - XFS_AG_RESV_METADATA, 334 - XFS_AG_RESV_RMAPBT, 335 - }; 336 - 337 330 struct xfs_ag_resv { 338 331 /* number of blocks originally reserved here */ 339 332 xfs_extlen_t ar_orig_reserved;
+2 -2
fs/xfs/xfs_mru_cache.c
··· 333 333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 334 334 return -EINVAL; 335 335 336 - if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 336 + if (!(mru = kmem_zalloc(sizeof(*mru), 0))) 337 337 return -ENOMEM; 338 338 339 339 /* An extra list is needed to avoid reaping up to a grp_time early. */ 340 340 mru->grp_count = grp_count + 1; 341 - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 341 + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); 342 342 343 343 if (!mru->lists) { 344 344 err = -ENOMEM;
+2 -2
fs/xfs/xfs_qm.c
··· 642 642 643 643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 644 644 645 - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 645 + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0); 646 646 647 647 error = list_lru_init(&qinf->qi_lru); 648 648 if (error) ··· 978 978 if (qip->i_d.di_nblocks == 0) 979 979 return 0; 980 980 981 - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 981 + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); 982 982 983 983 lblkno = 0; 984 984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+7 -9
fs/xfs/xfs_refcount_item.c
··· 144 144 ASSERT(nextents > 0); 145 145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 146 146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 147 - KM_SLEEP); 147 + 0); 148 148 else 149 - cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); 149 + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); 150 150 151 151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 152 152 cuip->cui_format.cui_nextents = nextents; ··· 223 223 { 224 224 struct xfs_cud_log_item *cudp; 225 225 226 - cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); 226 + cudp = kmem_zone_zalloc(xfs_cud_zone, 0); 227 227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 228 228 &xfs_cud_item_ops); 229 229 cudp->cud_cuip = cuip; ··· 555 555 irec.br_blockcount = new_len; 556 556 switch (type) { 557 557 case XFS_REFCOUNT_INCREASE: 558 - error = xfs_refcount_increase_extent(tp, &irec); 558 + xfs_refcount_increase_extent(tp, &irec); 559 559 break; 560 560 case XFS_REFCOUNT_DECREASE: 561 - error = xfs_refcount_decrease_extent(tp, &irec); 561 + xfs_refcount_decrease_extent(tp, &irec); 562 562 break; 563 563 case XFS_REFCOUNT_ALLOC_COW: 564 - error = xfs_refcount_alloc_cow_extent(tp, 564 + xfs_refcount_alloc_cow_extent(tp, 565 565 irec.br_startblock, 566 566 irec.br_blockcount); 567 567 break; 568 568 case XFS_REFCOUNT_FREE_COW: 569 - error = xfs_refcount_free_cow_extent(tp, 569 + xfs_refcount_free_cow_extent(tp, 570 570 irec.br_startblock, 571 571 irec.br_blockcount); 572 572 break; 573 573 default: 574 574 ASSERT(0); 575 575 } 576 - if (error) 577 - goto abort_error; 578 576 requeue_only = true; 579 577 } 580 578 }
+6 -17
fs/xfs/xfs_reflink.c
··· 495 495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 496 496 497 497 /* Free the CoW orphan record. */ 498 - error = xfs_refcount_free_cow_extent(*tpp, 499 - del.br_startblock, del.br_blockcount); 500 - if (error) 501 - break; 498 + xfs_refcount_free_cow_extent(*tpp, del.br_startblock, 499 + del.br_blockcount); 502 500 503 501 xfs_bmap_add_free(*tpp, del.br_startblock, 504 502 del.br_blockcount, NULL); ··· 673 675 trace_xfs_reflink_cow_remap(ip, &del); 674 676 675 677 /* Free the CoW orphan record. */ 676 - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 677 - del.br_blockcount); 678 - if (error) 679 - goto out_cancel; 678 + xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); 680 679 681 680 /* Map the new blocks into the data fork. */ 682 - error = xfs_bmap_map_extent(tp, ip, &del); 683 - if (error) 684 - goto out_cancel; 681 + xfs_bmap_map_extent(tp, ip, &del); 685 682 686 683 /* Charge this new data fork mapping to the on-disk quota. */ 687 684 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, ··· 1063 1070 uirec.br_blockcount, uirec.br_startblock); 1064 1071 1065 1072 /* Update the refcount tree */ 1066 - error = xfs_refcount_increase_extent(tp, &uirec); 1067 - if (error) 1068 - goto out_cancel; 1073 + xfs_refcount_increase_extent(tp, &uirec); 1069 1074 1070 1075 /* Map the new blocks into the data fork. */ 1071 - error = xfs_bmap_map_extent(tp, ip, &uirec); 1072 - if (error) 1073 - goto out_cancel; 1076 + xfs_bmap_map_extent(tp, ip, &uirec); 1074 1077 1075 1078 /* Update quota accounting. */ 1076 1079 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+3 -3
fs/xfs/xfs_rmap_item.c
··· 142 142 143 143 ASSERT(nextents > 0); 144 144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 145 - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); 145 + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); 146 146 else 147 - ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 147 + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); 148 148 149 149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 150 150 ruip->rui_format.rui_nextents = nextents; ··· 244 244 { 245 245 struct xfs_rud_log_item *rudp; 246 246 247 - rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); 247 + rudp = kmem_zone_zalloc(xfs_rud_zone, 0); 248 248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 249 249 &xfs_rud_item_ops); 250 250 rudp->rud_ruip = ruip;
+2 -2
fs/xfs/xfs_rtalloc.c
··· 865 865 * lower bound on the minimum level with any free extents. We can 866 866 * continue without the cache if it couldn't be allocated. 867 867 */ 868 - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); 868 + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); 869 869 if (!mp->m_rsum_cache) 870 870 xfs_warn(mp, "could not allocate realtime summary cache"); 871 871 } ··· 963 963 /* 964 964 * Allocate a new (fake) mount/sb. 965 965 */ 966 - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); 966 + nmp = kmem_alloc(sizeof(*nmp), 0); 967 967 /* 968 968 * Loop over the bitmap blocks. 969 969 * We will do everything one bitmap block at a time.
+2 -1
fs/xfs/xfs_super.c
··· 818 818 goto out_destroy_buf; 819 819 820 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 821 - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 822 + 0, mp->m_fsname); 822 823 if (!mp->m_cil_workqueue) 823 824 goto out_destroy_unwritten; 824 825
+34
fs/xfs/xfs_trace.h
··· 23 23 struct xlog_ticket; 24 24 struct xlog_recover; 25 25 struct xlog_recover_item; 26 + struct xlog_rec_header; 26 27 struct xfs_buf_log_format; 27 28 struct xfs_inode_log_format; 28 29 struct xfs_bmbt_irec; ··· 31 30 struct xfs_refcount_irec; 32 31 struct xfs_fsmap; 33 32 struct xfs_rmap_irec; 33 + struct xfs_icreate_log; 34 + struct xfs_owner_info; 35 + struct xfs_trans_res; 36 + struct xfs_inobt_rec_incore; 34 37 35 38 DECLARE_EVENT_CLASS(xfs_attr_list_class, 36 39 TP_PROTO(struct xfs_attr_list_context *ctx), ··· 3579 3574 MAJOR(__entry->dev), MINOR(__entry->dev), 3580 3575 __entry->nr_threads, __entry->pid) 3581 3576 ) 3577 + 3578 + DECLARE_EVENT_CLASS(xfs_kmem_class, 3579 + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), 3580 + TP_ARGS(size, flags, caller_ip), 3581 + TP_STRUCT__entry( 3582 + __field(ssize_t, size) 3583 + __field(int, flags) 3584 + __field(unsigned long, caller_ip) 3585 + ), 3586 + TP_fast_assign( 3587 + __entry->size = size; 3588 + __entry->flags = flags; 3589 + __entry->caller_ip = caller_ip; 3590 + ), 3591 + TP_printk("size %zd flags 0x%x caller %pS", 3592 + __entry->size, 3593 + __entry->flags, 3594 + (char *)__entry->caller_ip) 3595 + ) 3596 + 3597 + #define DEFINE_KMEM_EVENT(name) \ 3598 + DEFINE_EVENT(xfs_kmem_class, name, \ 3599 + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ 3600 + TP_ARGS(size, flags, caller_ip)) 3601 + DEFINE_KMEM_EVENT(kmem_alloc); 3602 + DEFINE_KMEM_EVENT(kmem_alloc_io); 3603 + DEFINE_KMEM_EVENT(kmem_alloc_large); 3604 + DEFINE_KMEM_EVENT(kmem_realloc); 3605 + DEFINE_KMEM_EVENT(kmem_zone_alloc); 3582 3606 3583 3607 #endif /* _TRACE_XFS_H */ 3584 3608
+2 -2
fs/xfs/xfs_trans.c
··· 90 90 91 91 trace_xfs_trans_dup(tp, _RET_IP_); 92 92 93 - ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 93 + ntp = kmem_zone_zalloc(xfs_trans_zone, 0); 94 94 95 95 /* 96 96 * Initialize the new transaction structure. ··· 263 263 * GFP_NOFS allocation context so that we avoid lockdep false positives 264 264 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 265 265 */ 266 - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 266 + tp = kmem_zone_zalloc(xfs_trans_zone, 0); 267 267 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 268 268 sb_start_intwrite(mp->m_super); 269 269
+1 -1
fs/xfs/xfs_trans_dquot.c
··· 863 863 xfs_trans_alloc_dqinfo( 864 864 xfs_trans_t *tp) 865 865 { 866 - tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); 866 + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0); 867 867 } 868 868 869 869 void
+1 -1
fs/xfs/xfs_xattr.c
··· 30 30 value = NULL; 31 31 } 32 32 33 - error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 33 + error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); 34 34 if (error) 35 35 return error; 36 36 return asize;
+2
include/linux/fs.h
··· 3543 3543 /* mm/fadvise.c */ 3544 3544 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3545 3545 int advice); 3546 + extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, 3547 + int advice); 3546 3548 3547 3549 #if defined(CONFIG_IO_URING) 3548 3550 extern struct sock *io_uring_get_socket(struct file *file);
+2 -2
mm/fadvise.c
··· 27 27 * deactivate the pages and clear PG_Referenced. 28 28 */ 29 29 30 - static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 31 - int advice) 30 + int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 32 31 { 33 32 struct inode *inode; 34 33 struct address_space *mapping; ··· 177 178 } 178 179 return 0; 179 180 } 181 + EXPORT_SYMBOL(generic_fadvise); 180 182 181 183 int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 182 184 {
+16 -6
mm/madvise.c
··· 14 14 #include <linux/userfaultfd_k.h> 15 15 #include <linux/hugetlb.h> 16 16 #include <linux/falloc.h> 17 + #include <linux/fadvise.h> 17 18 #include <linux/sched.h> 18 19 #include <linux/ksm.h> 19 20 #include <linux/fs.h> ··· 276 275 unsigned long start, unsigned long end) 277 276 { 278 277 struct file *file = vma->vm_file; 278 + loff_t offset; 279 279 280 280 *prev = vma; 281 281 #ifdef CONFIG_SWAP ··· 300 298 return 0; 301 299 } 302 300 303 - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 304 - if (end > vma->vm_end) 305 - end = vma->vm_end; 306 - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 307 - 308 - force_page_cache_readahead(file->f_mapping, file, start, end - start); 301 + /* 302 + * Filesystem's fadvise may need to take various locks. We need to 303 + * explicitly grab a reference because the vma (and hence the 304 + * vma's reference to the file) can go away as soon as we drop 305 + * mmap_sem. 306 + */ 307 + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 308 + get_file(file); 309 + up_read(&current->mm->mmap_sem); 310 + offset = (loff_t)(start - vma->vm_start) 311 + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 312 + vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 313 + fput(file); 314 + down_read(&current->mm->mmap_sem); 309 315 return 0; 310 316 } 311 317