Merge tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

+59 -20

fs/xfs/kmem.c

··· 3 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 4 * All Rights Reserved. 5 5 */ 6 - #include <linux/sched/mm.h> 6 + #include "xfs.h" 7 7 #include <linux/backing-dev.h> 8 - #include "kmem.h" 9 8 #include "xfs_message.h" 9 + #include "xfs_trace.h" 10 10 11 11 void * 12 12 kmem_alloc(size_t size, xfs_km_flags_t flags) ··· 15 15 gfp_t lflags = kmem_flags_convert(flags); 16 16 void *ptr; 17 17 18 + trace_kmem_alloc(size, flags, _RET_IP_); 19 + 18 20 do { 19 21 ptr = kmalloc(size, lflags); 20 - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 22 + if (ptr || (flags & KM_MAYFAIL)) 21 23 return ptr; 22 24 if (!(++retries % 100)) 23 25 xfs_err(NULL, ··· 30 28 } while (1); 31 29 } 32 30 33 - void * 34 - kmem_alloc_large(size_t size, xfs_km_flags_t flags) 31 + 32 + /* 33 + * __vmalloc() will allocate data pages and auxillary structures (e.g. 34 + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence 35 + * we need to tell memory reclaim that we are in such a context via 36 + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here 37 + * and potentially deadlocking. 38 + */ 39 + static void * 40 + __kmem_vmalloc(size_t size, xfs_km_flags_t flags) 35 41 { 36 42 unsigned nofs_flag = 0; 37 43 void *ptr; 38 - gfp_t lflags; 44 + gfp_t lflags = kmem_flags_convert(flags); 39 45 40 - ptr = kmem_alloc(size, flags | KM_MAYFAIL); 41 - if (ptr) 42 - return ptr; 43 - 44 - /* 45 - * __vmalloc() will allocate data pages and auxillary structures (e.g. 46 - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context 47 - * here. Hence we need to tell memory reclaim that we are in such a 48 - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering 49 - * the filesystem here and potentially deadlocking. 
50 - */ 51 46 if (flags & KM_NOFS) 52 47 nofs_flag = memalloc_nofs_save(); 53 48 54 - lflags = kmem_flags_convert(flags); 55 49 ptr = __vmalloc(size, lflags, PAGE_KERNEL); 56 50 57 51 if (flags & KM_NOFS) 58 52 memalloc_nofs_restore(nofs_flag); 59 53 60 54 return ptr; 55 + } 56 + 57 + /* 58 + * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned 59 + * to the @align_mask. We only guarantee alignment up to page size, we'll clamp 60 + * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE 61 + * aligned region. 62 + */ 63 + void * 64 + kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) 65 + { 66 + void *ptr; 67 + 68 + trace_kmem_alloc_io(size, flags, _RET_IP_); 69 + 70 + if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) 71 + align_mask = PAGE_SIZE - 1; 72 + 73 + ptr = kmem_alloc(size, flags | KM_MAYFAIL); 74 + if (ptr) { 75 + if (!((uintptr_t)ptr & align_mask)) 76 + return ptr; 77 + kfree(ptr); 78 + } 79 + return __kmem_vmalloc(size, flags); 80 + } 81 + 82 + void * 83 + kmem_alloc_large(size_t size, xfs_km_flags_t flags) 84 + { 85 + void *ptr; 86 + 87 + trace_kmem_alloc_large(size, flags, _RET_IP_); 88 + 89 + ptr = kmem_alloc(size, flags | KM_MAYFAIL); 90 + if (ptr) 91 + return ptr; 92 + return __kmem_vmalloc(size, flags); 61 93 } 62 94 63 95 void * ··· 101 65 gfp_t lflags = kmem_flags_convert(flags); 102 66 void *ptr; 103 67 68 + trace_kmem_realloc(newsize, flags, _RET_IP_); 69 + 104 70 do { 105 71 ptr = krealloc(old, newsize, lflags); 106 - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 72 + if (ptr || (flags & KM_MAYFAIL)) 107 73 return ptr; 108 74 if (!(++retries % 100)) 109 75 xfs_err(NULL, ··· 123 85 gfp_t lflags = kmem_flags_convert(flags); 124 86 void *ptr; 125 87 88 + trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_); 126 89 do { 127 90 ptr = kmem_cache_alloc(zone, lflags); 128 - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 91 + if (ptr || (flags & KM_MAYFAIL)) 129 92 return ptr; 130 93 if 
(!(++retries % 100)) 131 94 xfs_err(NULL,

+5 -10

fs/xfs/kmem.h

··· 16 16 */ 17 17 18 18 typedef unsigned __bitwise xfs_km_flags_t; 19 - #define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) 20 - #define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) 21 19 #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) 22 20 #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) 23 21 #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) ··· 30 32 { 31 33 gfp_t lflags; 32 34 33 - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); 35 + BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); 34 36 35 - if (flags & KM_NOSLEEP) { 36 - lflags = GFP_ATOMIC | __GFP_NOWARN; 37 - } else { 38 - lflags = GFP_KERNEL | __GFP_NOWARN; 39 - if (flags & KM_NOFS) 40 - lflags &= ~__GFP_FS; 41 - } 37 + lflags = GFP_KERNEL | __GFP_NOWARN; 38 + if (flags & KM_NOFS) 39 + lflags &= ~__GFP_FS; 42 40 43 41 /* 44 42 * Default page/slab allocator behavior is to retry for ever ··· 53 59 } 54 60 55 61 extern void *kmem_alloc(size_t, xfs_km_flags_t); 62 + extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); 56 63 extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); 57 64 extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); 58 65 static inline void kmem_free(const void *ptr)

+1 -1

fs/xfs/libxfs/xfs_alloc.c

··· 2205 2205 ASSERT(xfs_bmap_free_item_zone != NULL); 2206 2206 ASSERT(oinfo != NULL); 2207 2207 2208 - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 2208 + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); 2209 2209 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2210 2210 new->xefi_blockcount = 1; 2211 2211 new->xefi_oinfo = *oinfo;

+3 -4

fs/xfs/libxfs/xfs_alloc.h

··· 81 81 /* 82 82 * Defines for datatype 83 83 */ 84 - #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ 85 - #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ 86 - #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ 87 - #define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ 84 + #define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ 85 + #define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ 86 + #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ 88 87 89 88 static inline bool 90 89 xfs_alloc_is_userdata(int datatype)

+55 -24

fs/xfs/libxfs/xfs_attr.c

··· 97 97 * Overall external interface routines. 98 98 *========================================================================*/ 99 99 100 - /* Retrieve an extended attribute and its value. Must have ilock. */ 100 + /* 101 + * Retrieve an extended attribute and its value. Must have ilock. 102 + * Returns 0 on successful retrieval, otherwise an error. 103 + */ 101 104 int 102 105 xfs_attr_get_ilocked( 103 106 struct xfs_inode *ip, ··· 118 115 return xfs_attr_node_get(args); 119 116 } 120 117 121 - /* Retrieve an extended attribute by name, and its value. */ 118 + /* 119 + * Retrieve an extended attribute by name, and its value if requested. 120 + * 121 + * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, 122 + * just an indication whether the attribute exists and the size of the value if 123 + * it exists. The size is returned in @valuelenp, 124 + * 125 + * If the attribute is found, but exceeds the size limit set by the caller in 126 + * @valuelenp, return -ERANGE with the size of the attribute that was found in 127 + * @valuelenp. 128 + * 129 + * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after 130 + * existence of the attribute has been determined. On success, return that 131 + * buffer to the caller and leave them to free it. On failure, free any 132 + * allocated buffer and ensure the buffer pointer returned to the caller is 133 + * null. 
134 + */ 122 135 int 123 136 xfs_attr_get( 124 137 struct xfs_inode *ip, 125 138 const unsigned char *name, 126 - unsigned char *value, 139 + unsigned char **value, 127 140 int *valuelenp, 128 141 int flags) 129 142 { 130 143 struct xfs_da_args args; 131 144 uint lock_mode; 132 145 int error; 146 + 147 + ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); 133 148 134 149 XFS_STATS_INC(ip->i_mount, xs_attr_get); 135 150 ··· 158 137 if (error) 159 138 return error; 160 139 161 - args.value = value; 162 - args.valuelen = *valuelenp; 163 140 /* Entirely possible to look up a name which doesn't exist */ 164 141 args.op_flags = XFS_DA_OP_OKNOENT; 142 + if (flags & ATTR_ALLOC) 143 + args.op_flags |= XFS_DA_OP_ALLOCVAL; 144 + else 145 + args.value = *value; 146 + args.valuelen = *valuelenp; 165 147 166 148 lock_mode = xfs_ilock_attr_map_shared(ip); 167 149 error = xfs_attr_get_ilocked(ip, &args); 168 150 xfs_iunlock(ip, lock_mode); 169 - 170 151 *valuelenp = args.valuelen; 171 - return error == -EEXIST ? 0 : error; 152 + 153 + /* on error, we have to clean up allocated value buffers */ 154 + if (error) { 155 + if (flags & ATTR_ALLOC) { 156 + kmem_free(args.value); 157 + *value = NULL; 158 + } 159 + return error; 160 + } 161 + *value = args.value; 162 + return 0; 172 163 } 173 164 174 165 /* ··· 801 768 * 802 769 * This leaf block cannot have a "remote" value, we only call this routine 803 770 * if bmap_one_block() says there is only one block (ie: no remote blks). 771 + * 772 + * Returns 0 on successful retrieval, otherwise an error. 
804 773 */ 805 774 STATIC int 806 775 xfs_attr_leaf_get(xfs_da_args_t *args) ··· 824 789 } 825 790 error = xfs_attr3_leaf_getvalue(bp, args); 826 791 xfs_trans_brelse(args->trans, bp); 827 - if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { 828 - error = xfs_attr_rmtval_get(args); 829 - } 830 792 return error; 831 793 } 832 794 ··· 1300 1268 } 1301 1269 1302 1270 /* 1303 - * Look up a filename in a node attribute list. 1271 + * Retrieve the attribute data from a node attribute list. 1304 1272 * 1305 1273 * This routine gets called for any attribute fork that has more than one 1306 1274 * block, ie: both true Btree attr lists and for single-leaf-blocks with 1307 1275 * "remote" values taking up more blocks. 1276 + * 1277 + * Returns 0 on successful retrieval, otherwise an error. 1308 1278 */ 1309 1279 STATIC int 1310 1280 xfs_attr_node_get(xfs_da_args_t *args) ··· 1328 1294 error = xfs_da3_node_lookup_int(state, &retval); 1329 1295 if (error) { 1330 1296 retval = error; 1331 - } else if (retval == -EEXIST) { 1332 - blk = &state->path.blk[ state->path.active-1 ]; 1333 - ASSERT(blk->bp != NULL); 1334 - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 1335 - 1336 - /* 1337 - * Get the value, local or "remote" 1338 - */ 1339 - retval = xfs_attr3_leaf_getvalue(blk->bp, args); 1340 - if (!retval && (args->rmtblkno > 0) 1341 - && !(args->flags & ATTR_KERNOVAL)) { 1342 - retval = xfs_attr_rmtval_get(args); 1343 - } 1297 + goto out_release; 1344 1298 } 1299 + if (retval != -EEXIST) 1300 + goto out_release; 1301 + 1302 + /* 1303 + * Get the value, local or "remote" 1304 + */ 1305 + blk = &state->path.blk[state->path.active - 1]; 1306 + retval = xfs_attr3_leaf_getvalue(blk->bp, args); 1345 1307 1346 1308 /* 1347 1309 * If not in a transaction, we have to release all the buffers. 
1348 1310 */ 1311 + out_release: 1349 1312 for (i = 0; i < state->path.active; i++) { 1350 1313 xfs_trans_brelse(args->trans, state->path.blk[i].bp); 1351 1314 state->path.blk[i].bp = NULL;

+4 -2

fs/xfs/libxfs/xfs_attr.h

··· 37 37 #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ 38 38 39 39 #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ 40 + #define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ 40 41 41 42 #define XFS_ATTR_FLAGS \ 42 43 { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ ··· 48 47 { ATTR_REPLACE, "REPLACE" }, \ 49 48 { ATTR_KERNOTIME, "KERNOTIME" }, \ 50 49 { ATTR_KERNOVAL, "KERNOVAL" }, \ 51 - { ATTR_INCOMPLETE, "INCOMPLETE" } 50 + { ATTR_INCOMPLETE, "INCOMPLETE" }, \ 51 + { ATTR_ALLOC, "ALLOC" } 52 52 53 53 /* 54 54 * The maximum size (into the kernel or returned from the kernel) of an ··· 145 143 int xfs_inode_hasattr(struct xfs_inode *ip); 146 144 int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); 147 145 int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, 148 - unsigned char *value, int *valuelenp, int flags); 146 + unsigned char **value, int *valuelenp, int flags); 149 147 int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, 150 148 unsigned char *value, int valuelen, int flags); 151 149 int xfs_attr_set_args(struct xfs_da_args *args);

+76 -52

fs/xfs/libxfs/xfs_attr_leaf.c

··· 393 393 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); 394 394 } 395 395 396 + static int 397 + xfs_attr_copy_value( 398 + struct xfs_da_args *args, 399 + unsigned char *value, 400 + int valuelen) 401 + { 402 + /* 403 + * No copy if all we have to do is get the length 404 + */ 405 + if (args->flags & ATTR_KERNOVAL) { 406 + args->valuelen = valuelen; 407 + return 0; 408 + } 409 + 410 + /* 411 + * No copy if the length of the existing buffer is too small 412 + */ 413 + if (args->valuelen < valuelen) { 414 + args->valuelen = valuelen; 415 + return -ERANGE; 416 + } 417 + 418 + if (args->op_flags & XFS_DA_OP_ALLOCVAL) { 419 + args->value = kmem_alloc_large(valuelen, 0); 420 + if (!args->value) 421 + return -ENOMEM; 422 + } 423 + args->valuelen = valuelen; 424 + 425 + /* remote block xattr requires IO for copy-in */ 426 + if (args->rmtblkno) 427 + return xfs_attr_rmtval_get(args); 428 + 429 + /* 430 + * This is to prevent a GCC warning because the remote xattr case 431 + * doesn't have a value to pass in. In that case, we never reach here, 432 + * but GCC can't work that out and so throws a "passing NULL to 433 + * memcpy" warning. 434 + */ 435 + if (!value) 436 + return -EINVAL; 437 + memcpy(args->value, value, valuelen); 438 + return 0; 439 + } 396 440 397 441 /*======================================================================== 398 442 * External routines when attribute fork size < XFS_LITINO(mp). ··· 764 720 } 765 721 766 722 /* 767 - * Look up a name in a shortform attribute list structure. 723 + * Retrieve the attribute value and length. 724 + * 725 + * If ATTR_KERNOVAL is specified, only the length needs to be returned. 726 + * Unlike a lookup, we only return an error if the attribute does not 727 + * exist or we can't retrieve the value. 
768 728 */ 769 - /*ARGSUSED*/ 770 729 int 771 - xfs_attr_shortform_getvalue(xfs_da_args_t *args) 730 + xfs_attr_shortform_getvalue( 731 + struct xfs_da_args *args) 772 732 { 773 - xfs_attr_shortform_t *sf; 774 - xfs_attr_sf_entry_t *sfe; 775 - int i; 733 + struct xfs_attr_shortform *sf; 734 + struct xfs_attr_sf_entry *sfe; 735 + int i; 776 736 777 737 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); 778 738 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; ··· 789 741 continue; 790 742 if (!xfs_attr_namesp_match(args->flags, sfe->flags)) 791 743 continue; 792 - if (args->flags & ATTR_KERNOVAL) { 793 - args->valuelen = sfe->valuelen; 794 - return -EEXIST; 795 - } 796 - if (args->valuelen < sfe->valuelen) { 797 - args->valuelen = sfe->valuelen; 798 - return -ERANGE; 799 - } 800 - args->valuelen = sfe->valuelen; 801 - memcpy(args->value, &sfe->nameval[args->namelen], 802 - args->valuelen); 803 - return -EEXIST; 744 + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], 745 + sfe->valuelen); 804 746 } 805 747 return -ENOATTR; 806 748 } ··· 820 782 ifp = dp->i_afp; 821 783 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 822 784 size = be16_to_cpu(sf->hdr.totsize); 823 - tmpbuffer = kmem_alloc(size, KM_SLEEP); 785 + tmpbuffer = kmem_alloc(size, 0); 824 786 ASSERT(tmpbuffer != NULL); 825 787 memcpy(tmpbuffer, ifp->if_u1.if_data, size); 826 788 sf = (xfs_attr_shortform_t *)tmpbuffer; ··· 1023 985 1024 986 trace_xfs_attr_leaf_to_sf(args); 1025 987 1026 - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 988 + tmpbuffer = kmem_alloc(args->geo->blksize, 0); 1027 989 if (!tmpbuffer) 1028 990 return -ENOMEM; 1029 991 ··· 1486 1448 1487 1449 trace_xfs_attr_leaf_compact(args); 1488 1450 1489 - tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 1451 + tmpbuffer = kmem_alloc(args->geo->blksize, 0); 1490 1452 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 1491 1453 memset(bp->b_addr, 0, args->geo->blksize); 1492 1454 leaf_src = 
(xfs_attr_leafblock_t *)tmpbuffer; ··· 2205 2167 struct xfs_attr_leafblock *tmp_leaf; 2206 2168 struct xfs_attr3_icleaf_hdr tmphdr; 2207 2169 2208 - tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); 2170 + tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); 2209 2171 2210 2172 /* 2211 2173 * Copy the header into the temp leaf so that all the stuff ··· 2388 2350 /* 2389 2351 * Get the value associated with an attribute name from a leaf attribute 2390 2352 * list structure. 2353 + * 2354 + * If ATTR_KERNOVAL is specified, only the length needs to be returned. 2355 + * Unlike a lookup, we only return an error if the attribute does not 2356 + * exist or we can't retrieve the value. 2391 2357 */ 2392 2358 int 2393 2359 xfs_attr3_leaf_getvalue( ··· 2403 2361 struct xfs_attr_leaf_entry *entry; 2404 2362 struct xfs_attr_leaf_name_local *name_loc; 2405 2363 struct xfs_attr_leaf_name_remote *name_rmt; 2406 - int valuelen; 2407 2364 2408 2365 leaf = bp->b_addr; 2409 2366 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ··· 2414 2373 name_loc = xfs_attr3_leaf_name_local(leaf, args->index); 2415 2374 ASSERT(name_loc->namelen == args->namelen); 2416 2375 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); 2417 - valuelen = be16_to_cpu(name_loc->valuelen); 2418 - if (args->flags & ATTR_KERNOVAL) { 2419 - args->valuelen = valuelen; 2420 - return 0; 2421 - } 2422 - if (args->valuelen < valuelen) { 2423 - args->valuelen = valuelen; 2424 - return -ERANGE; 2425 - } 2426 - args->valuelen = valuelen; 2427 - memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2428 - } else { 2429 - name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); 2430 - ASSERT(name_rmt->namelen == args->namelen); 2431 - ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); 2432 - args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); 2433 - args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2434 - args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, 
2435 - args->rmtvaluelen); 2436 - if (args->flags & ATTR_KERNOVAL) { 2437 - args->valuelen = args->rmtvaluelen; 2438 - return 0; 2439 - } 2440 - if (args->valuelen < args->rmtvaluelen) { 2441 - args->valuelen = args->rmtvaluelen; 2442 - return -ERANGE; 2443 - } 2444 - args->valuelen = args->rmtvaluelen; 2376 + return xfs_attr_copy_value(args, 2377 + &name_loc->nameval[args->namelen], 2378 + be16_to_cpu(name_loc->valuelen)); 2445 2379 } 2446 - return 0; 2380 + 2381 + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); 2382 + ASSERT(name_rmt->namelen == args->namelen); 2383 + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); 2384 + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); 2385 + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2386 + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, 2387 + args->rmtvaluelen); 2388 + return xfs_attr_copy_value(args, NULL, args->rmtvaluelen); 2447 2389 } 2448 2390 2449 2391 /*========================================================================

+2

fs/xfs/libxfs/xfs_attr_remote.c

··· 358 358 /* 359 359 * Read the value associated with an attribute from the out-of-line buffer 360 360 * that we stored it in. 361 + * 362 + * Returns 0 on successful retrieval, otherwise an error. 361 363 */ 362 364 int 363 365 xfs_attr_rmtval_get(

+30 -55

fs/xfs/libxfs/xfs_bmap.c

··· 553 553 #endif 554 554 ASSERT(xfs_bmap_free_item_zone != NULL); 555 555 556 - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 556 + new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); 557 557 new->xefi_startblock = bno; 558 558 new->xefi_blockcount = (xfs_extlen_t)len; 559 559 if (oinfo) ··· 1099 1099 if (error) 1100 1100 goto trans_cancel; 1101 1101 ASSERT(ip->i_afp == NULL); 1102 - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 1102 + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); 1103 1103 ip->i_afp->if_flags = XFS_IFEXTENTS; 1104 1104 logflags = 0; 1105 1105 switch (ip->i_d.di_format) { ··· 1985 1985 } 1986 1986 1987 1987 /* add reverse mapping unless caller opted out */ 1988 - if (!(bma->flags & XFS_BMAPI_NORMAP)) { 1989 - error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); 1990 - if (error) 1991 - goto done; 1992 - } 1988 + if (!(bma->flags & XFS_BMAPI_NORMAP)) 1989 + xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); 1993 1990 1994 1991 /* convert to a btree if necessary */ 1995 1992 if (xfs_bmap_needs_btree(bma->ip, whichfork)) { ··· 2468 2471 } 2469 2472 2470 2473 /* update reverse mappings */ 2471 - error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); 2472 - if (error) 2473 - goto done; 2474 + xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); 2474 2475 2475 2476 /* convert to a btree if necessary */ 2476 2477 if (xfs_bmap_needs_btree(ip, whichfork)) { ··· 2827 2832 } 2828 2833 2829 2834 /* add reverse mapping unless caller opted out */ 2830 - if (!(flags & XFS_BMAPI_NORMAP)) { 2831 - error = xfs_rmap_map_extent(tp, ip, whichfork, new); 2832 - if (error) 2833 - goto done; 2834 - } 2835 + if (!(flags & XFS_BMAPI_NORMAP)) 2836 + xfs_rmap_map_extent(tp, ip, whichfork, new); 2835 2837 2836 2838 /* convert to a btree if necessary */ 2837 2839 if (xfs_bmap_needs_btree(ip, whichfork)) { ··· 4042 4050 */ 4043 4051 if (!(bma->flags & XFS_BMAPI_METADATA)) { 4044 4052 bma->datatype = XFS_ALLOC_NOBUSY; 4045 - if 
(whichfork == XFS_DATA_FORK) { 4046 - if (bma->offset == 0) 4047 - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; 4048 - else 4049 - bma->datatype |= XFS_ALLOC_USERDATA; 4050 - } 4053 + if (whichfork == XFS_DATA_FORK && bma->offset == 0) 4054 + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; 4051 4055 if (bma->flags & XFS_BMAPI_ZERO) 4052 4056 bma->datatype |= XFS_ALLOC_USERDATA_ZERO; 4053 4057 } ··· 4389 4401 * If this is a CoW allocation, record the data in 4390 4402 * the refcount btree for orphan recovery. 4391 4403 */ 4392 - if (whichfork == XFS_COW_FORK) { 4393 - error = xfs_refcount_alloc_cow_extent(tp, 4394 - bma.blkno, bma.length); 4395 - if (error) 4396 - goto error0; 4397 - } 4404 + if (whichfork == XFS_COW_FORK) 4405 + xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4406 + bma.length); 4398 4407 } 4399 4408 4400 4409 /* Deal with the allocated space we found. */ ··· 4515 4530 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) 4516 4531 goto out_finish; 4517 4532 error = -EFSCORRUPTED; 4518 - if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) 4533 + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) 4519 4534 goto out_finish; 4520 4535 4521 4536 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); ··· 4525 4540 *imap = bma.got; 4526 4541 *seq = READ_ONCE(ifp->if_seq); 4527 4542 4528 - if (whichfork == XFS_COW_FORK) { 4529 - error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4530 - bma.length); 4531 - if (error) 4532 - goto out_finish; 4533 - } 4543 + if (whichfork == XFS_COW_FORK) 4544 + xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); 4534 4545 4535 4546 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, 4536 4547 whichfork); ··· 5130 5149 } 5131 5150 5132 5151 /* remove reverse mapping */ 5133 - error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); 5134 - if (error) 5135 - goto done; 5152 + xfs_rmap_unmap_extent(tp, ip, whichfork, del); 5136 5153 5137 5154 /* 5138 5155 * If we need to, 
add to list of extents to delete. 5139 5156 */ 5140 5157 if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { 5141 5158 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { 5142 - error = xfs_refcount_decrease_extent(tp, del); 5143 - if (error) 5144 - goto done; 5159 + xfs_refcount_decrease_extent(tp, del); 5145 5160 } else { 5146 5161 __xfs_bmap_add_free(tp, del->br_startblock, 5147 5162 del->br_blockcount, NULL, ··· 5628 5651 &new); 5629 5652 5630 5653 /* update reverse mapping. rmap functions merge the rmaps for us */ 5631 - error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); 5632 - if (error) 5633 - return error; 5654 + xfs_rmap_unmap_extent(tp, ip, whichfork, got); 5634 5655 memcpy(&new, got, sizeof(new)); 5635 5656 new.br_startoff = left->br_startoff + left->br_blockcount; 5636 - return xfs_rmap_map_extent(tp, ip, whichfork, &new); 5657 + xfs_rmap_map_extent(tp, ip, whichfork, &new); 5658 + return 0; 5637 5659 } 5638 5660 5639 5661 static int ··· 5671 5695 got); 5672 5696 5673 5697 /* update reverse mapping */ 5674 - error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); 5675 - if (error) 5676 - return error; 5677 - return xfs_rmap_map_extent(tp, ip, whichfork, got); 5698 + xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); 5699 + xfs_rmap_map_extent(tp, ip, whichfork, got); 5700 + return 0; 5678 5701 } 5679 5702 5680 5703 int ··· 6069 6094 bmap->br_blockcount, 6070 6095 bmap->br_state); 6071 6096 6072 - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); 6097 + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); 6073 6098 INIT_LIST_HEAD(&bi->bi_list); 6074 6099 bi->bi_type = type; 6075 6100 bi->bi_owner = ip; ··· 6081 6106 } 6082 6107 6083 6108 /* Map an extent into a file. 
*/ 6084 - int 6109 + void 6085 6110 xfs_bmap_map_extent( 6086 6111 struct xfs_trans *tp, 6087 6112 struct xfs_inode *ip, 6088 6113 struct xfs_bmbt_irec *PREV) 6089 6114 { 6090 6115 if (!xfs_bmap_is_update_needed(PREV)) 6091 - return 0; 6116 + return; 6092 6117 6093 - return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); 6118 + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); 6094 6119 } 6095 6120 6096 6121 /* Unmap an extent out of a file. */ 6097 - int 6122 + void 6098 6123 xfs_bmap_unmap_extent( 6099 6124 struct xfs_trans *tp, 6100 6125 struct xfs_inode *ip, 6101 6126 struct xfs_bmbt_irec *PREV) 6102 6127 { 6103 6128 if (!xfs_bmap_is_update_needed(PREV)) 6104 - return 0; 6129 + return; 6105 6130 6106 - return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); 6131 + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); 6107 6132 } 6108 6133 6109 6134 /*

+9 -2

fs/xfs/libxfs/xfs_bmap.h

··· 171 171 !isnullstartblock(irec->br_startblock); 172 172 } 173 173 174 + /* 175 + * Check the mapping for obviously garbage allocations that could trash the 176 + * filesystem immediately. 177 + */ 178 + #define xfs_valid_startblock(ip, startblock) \ 179 + ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) 180 + 174 181 void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, 175 182 xfs_filblks_t len); 176 183 int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); ··· 261 254 enum xfs_bmap_intent_type type, int whichfork, 262 255 xfs_fileoff_t startoff, xfs_fsblock_t startblock, 263 256 xfs_filblks_t *blockcount, xfs_exntst_t state); 264 - int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 257 + void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 265 258 struct xfs_bmbt_irec *imap); 266 - int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 259 + void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 267 260 struct xfs_bmbt_irec *imap); 268 261 269 262 static inline int xfs_bmap_fork_to_state(int whichfork)

+14 -2

fs/xfs/libxfs/xfs_bmap_btree.c

··· 400 400 union xfs_btree_key *k1, 401 401 union xfs_btree_key *k2) 402 402 { 403 - return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - 404 - be64_to_cpu(k2->bmbt.br_startoff); 403 + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); 404 + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); 405 + 406 + /* 407 + * Note: This routine previously cast a and b to int64 and subtracted 408 + * them to generate a result. This led to problems if b was the 409 + * "maximum" key value (all ones) being signed incorrectly, hence this 410 + * somewhat less efficient version. 411 + */ 412 + if (a > b) 413 + return 1; 414 + if (b > a) 415 + return -1; 416 + return 0; 405 417 } 406 418 407 419 static xfs_failaddr_t

+5 -9

fs/xfs/libxfs/xfs_btree.c

··· 4466 4466 * btree block 4467 4467 * 4468 4468 * @bp: buffer containing the btree block 4469 - * @max_recs: pointer to the m_*_mxr max records field in the xfs mount 4470 - * @pag_max_level: pointer to the per-ag max level field 4471 4469 */ 4472 4470 xfs_failaddr_t 4473 4471 xfs_btree_sblock_v5hdr_verify( ··· 4598 4600 4599 4601 /* Callback */ 4600 4602 error = fn(cur, recp, priv); 4601 - if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) 4603 + if (error) 4602 4604 break; 4603 4605 4604 4606 advloop: ··· 4700 4702 */ 4701 4703 if (ldiff >= 0 && hdiff >= 0) { 4702 4704 error = fn(cur, recp, priv); 4703 - if (error < 0 || 4704 - error == XFS_BTREE_QUERY_RANGE_ABORT) 4705 + if (error) 4705 4706 break; 4706 4707 } else if (hdiff < 0) { 4707 4708 /* Record is larger than high key; pop. */ ··· 4771 4774 * Query a btree for all records overlapping a given interval of keys. The 4772 4775 * supplied function will be called with each record found; return one of the 4773 4776 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4774 - * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a 4775 - * negative error code. 4777 + * code. This function returns -ECANCELED, zero, or a negative error code. 4776 4778 */ 4777 4779 int 4778 4780 xfs_btree_query_range( ··· 4887 4891 union xfs_btree_rec *rec, 4888 4892 void *priv) 4889 4893 { 4890 - return XFS_BTREE_QUERY_RANGE_ABORT; 4894 + return -ECANCELED; 4891 4895 } 4892 4896 4893 4897 /* Is there a record covering a given range of keys? */ ··· 4902 4906 4903 4907 error = xfs_btree_query_range(cur, low, high, 4904 4908 &xfs_btree_has_record_helper, NULL); 4905 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 4909 + if (error == -ECANCELED) { 4906 4910 *exists = true; 4907 4911 return 0; 4908 4912 }

+7 -3

fs/xfs/libxfs/xfs_btree.h

··· 464 464 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); 465 465 unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); 466 466 467 - /* return codes */ 468 - #define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ 469 - #define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ 467 + /* 468 + * Return codes for the query range iterator function are 0 to continue 469 + * iterating, and non-zero to stop iterating. Any non-zero value will be 470 + * passed up to the _query_range caller. The special value -ECANCELED can be 471 + * used to stop iteration, because _query_range never generates that error 472 + * code on its own. 473 + */ 470 474 typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, 471 475 union xfs_btree_rec *rec, void *priv); 472 476

+3 -3

fs/xfs/libxfs/xfs_da_btree.c

··· 2098 2098 * If we didn't get it and the block might work if fragmented, 2099 2099 * try without the CONTIG flag. Loop until we get it all. 2100 2100 */ 2101 - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); 2101 + mapp = kmem_alloc(sizeof(*mapp) * count, 0); 2102 2102 for (b = *bno, mapi = 0; b < *bno + count; ) { 2103 2103 nmap = min(XFS_BMAP_MAX_NMAP, count); 2104 2104 c = (int)(*bno + count - b); ··· 2480 2480 2481 2481 if (nirecs > 1) { 2482 2482 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), 2483 - KM_SLEEP | KM_NOFS); 2483 + KM_NOFS); 2484 2484 if (!map) 2485 2485 return -ENOMEM; 2486 2486 *mapp = map; ··· 2539 2539 */ 2540 2540 if (nfsb != 1) 2541 2541 irecs = kmem_zalloc(sizeof(irec) * nfsb, 2542 - KM_SLEEP | KM_NOFS); 2542 + KM_NOFS); 2543 2543 2544 2544 nirecs = nfsb; 2545 2545 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,

+3 -1

fs/xfs/libxfs/xfs_da_btree.h

··· 81 81 #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ 82 82 #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ 83 83 #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ 84 + #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ 84 85 85 86 #define XFS_DA_OP_FLAGS \ 86 87 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ 87 88 { XFS_DA_OP_RENAME, "RENAME" }, \ 88 89 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ 89 90 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ 90 - { XFS_DA_OP_CILOOKUP, "CILOOKUP" } 91 + { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ 92 + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } 91 93 92 94 /* 93 95 * Storage for holding state during Btree searches and split/join ops.

+1 -1

fs/xfs/libxfs/xfs_defer.c

··· 517 517 } 518 518 if (!dfp) { 519 519 dfp = kmem_alloc(sizeof(struct xfs_defer_pending), 520 - KM_SLEEP | KM_NOFS); 520 + KM_NOFS); 521 521 dfp->dfp_type = type; 522 522 dfp->dfp_intent = NULL; 523 523 dfp->dfp_done = NULL;

+7 -7

fs/xfs/libxfs/xfs_dir2.c

··· 110 110 111 111 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; 112 112 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 113 - KM_SLEEP | KM_MAYFAIL); 113 + KM_MAYFAIL); 114 114 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 115 - KM_SLEEP | KM_MAYFAIL); 115 + KM_MAYFAIL); 116 116 if (!mp->m_dir_geo || !mp->m_attr_geo) { 117 117 kmem_free(mp->m_dir_geo); 118 118 kmem_free(mp->m_attr_geo); ··· 217 217 if (error) 218 218 return error; 219 219 220 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 220 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 221 221 if (!args) 222 222 return -ENOMEM; 223 223 ··· 254 254 XFS_STATS_INC(dp->i_mount, xs_dir_create); 255 255 } 256 256 257 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 257 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 258 258 if (!args) 259 259 return -ENOMEM; 260 260 ··· 353 353 * lockdep Doing this avoids having to add a bunch of lockdep class 354 354 * annotations into the reclaim path for the ilock. 355 355 */ 356 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 356 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 357 357 args->geo = dp->i_mount->m_dir_geo; 358 358 args->name = name->name; 359 359 args->namelen = name->len; ··· 422 422 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 423 423 XFS_STATS_INC(dp->i_mount, xs_dir_remove); 424 424 425 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 425 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 426 426 if (!args) 427 427 return -ENOMEM; 428 428 ··· 483 483 if (rval) 484 484 return rval; 485 485 486 - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 486 + args = kmem_zalloc(sizeof(*args), KM_NOFS); 487 487 if (!args) 488 488 return -ENOMEM; 489 489

+1 -1

fs/xfs/libxfs/xfs_dir2_block.c

··· 1092 1092 * Copy the directory into a temporary buffer. 1093 1093 * Then pitch the incore inode data so we can make extents. 1094 1094 */ 1095 - sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); 1095 + sfp = kmem_alloc(ifp->if_bytes, 0); 1096 1096 memcpy(sfp, oldsfp, ifp->if_bytes); 1097 1097 1098 1098 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);

+347 -379

fs/xfs/libxfs/xfs_dir2_node.c

··· 32 32 static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, 33 33 int index, xfs_da_state_blk_t *dblk, 34 34 int *rval); 35 - static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 36 - xfs_da_state_blk_t *fblk); 37 35 38 36 /* 39 37 * Check internal consistency of a leafn block. ··· 1609 1611 } 1610 1612 1611 1613 /* 1614 + * Add a new data block to the directory at the free space index that the caller 1615 + * has specified. 1616 + */ 1617 + static int 1618 + xfs_dir2_node_add_datablk( 1619 + struct xfs_da_args *args, 1620 + struct xfs_da_state_blk *fblk, 1621 + xfs_dir2_db_t *dbno, 1622 + struct xfs_buf **dbpp, 1623 + struct xfs_buf **fbpp, 1624 + int *findex) 1625 + { 1626 + struct xfs_inode *dp = args->dp; 1627 + struct xfs_trans *tp = args->trans; 1628 + struct xfs_mount *mp = dp->i_mount; 1629 + struct xfs_dir3_icfree_hdr freehdr; 1630 + struct xfs_dir2_data_free *bf; 1631 + struct xfs_dir2_data_hdr *hdr; 1632 + struct xfs_dir2_free *free = NULL; 1633 + xfs_dir2_db_t fbno; 1634 + struct xfs_buf *fbp; 1635 + struct xfs_buf *dbp; 1636 + __be16 *bests = NULL; 1637 + int error; 1638 + 1639 + /* Not allowed to allocate, return failure. */ 1640 + if (args->total == 0) 1641 + return -ENOSPC; 1642 + 1643 + /* Allocate and initialize the new data block. */ 1644 + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno); 1645 + if (error) 1646 + return error; 1647 + error = xfs_dir3_data_init(args, *dbno, &dbp); 1648 + if (error) 1649 + return error; 1650 + 1651 + /* 1652 + * Get the freespace block corresponding to the data block 1653 + * that was just allocated. 1654 + */ 1655 + fbno = dp->d_ops->db_to_fdb(args->geo, *dbno); 1656 + error = xfs_dir2_free_try_read(tp, dp, 1657 + xfs_dir2_db_to_da(args->geo, fbno), &fbp); 1658 + if (error) 1659 + return error; 1660 + 1661 + /* 1662 + * If there wasn't a freespace block, the read will 1663 + * return a NULL fbp. Allocate and initialize a new one. 
1664 + */ 1665 + if (!fbp) { 1666 + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno); 1667 + if (error) 1668 + return error; 1669 + 1670 + if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) { 1671 + xfs_alert(mp, 1672 + "%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld", 1673 + __func__, (unsigned long long)dp->i_ino, 1674 + (long long)dp->d_ops->db_to_fdb(args->geo, *dbno), 1675 + (long long)*dbno, (long long)fbno); 1676 + if (fblk) { 1677 + xfs_alert(mp, 1678 + " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", 1679 + fblk, (unsigned long long)fblk->blkno, 1680 + fblk->index, fblk->magic); 1681 + } else { 1682 + xfs_alert(mp, " ... fblk is NULL"); 1683 + } 1684 + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 1685 + return -EFSCORRUPTED; 1686 + } 1687 + 1688 + /* Get a buffer for the new block. */ 1689 + error = xfs_dir3_free_get_buf(args, fbno, &fbp); 1690 + if (error) 1691 + return error; 1692 + free = fbp->b_addr; 1693 + bests = dp->d_ops->free_bests_p(free); 1694 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1695 + 1696 + /* Remember the first slot as our empty slot. */ 1697 + freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo, 1698 + XFS_DIR2_FREE_OFFSET)) * 1699 + dp->d_ops->free_max_bests(args->geo); 1700 + } else { 1701 + free = fbp->b_addr; 1702 + bests = dp->d_ops->free_bests_p(free); 1703 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1704 + } 1705 + 1706 + /* Set the freespace block index from the data block number. */ 1707 + *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno); 1708 + 1709 + /* Extend the freespace table if the new data block is off the end. */ 1710 + if (*findex >= freehdr.nvalid) { 1711 + ASSERT(*findex < dp->d_ops->free_max_bests(args->geo)); 1712 + freehdr.nvalid = *findex + 1; 1713 + bests[*findex] = cpu_to_be16(NULLDATAOFF); 1714 + } 1715 + 1716 + /* 1717 + * If this entry was for an empty data block (this should always be 1718 + * true) then update the header. 
1719 + */ 1720 + if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) { 1721 + freehdr.nused++; 1722 + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); 1723 + xfs_dir2_free_log_header(args, fbp); 1724 + } 1725 + 1726 + /* Update the freespace value for the new block in the table. */ 1727 + hdr = dbp->b_addr; 1728 + bf = dp->d_ops->data_bestfree_p(hdr); 1729 + bests[*findex] = bf[0].length; 1730 + 1731 + *dbpp = dbp; 1732 + *fbpp = fbp; 1733 + return 0; 1734 + } 1735 + 1736 + static int 1737 + xfs_dir2_node_find_freeblk( 1738 + struct xfs_da_args *args, 1739 + struct xfs_da_state_blk *fblk, 1740 + xfs_dir2_db_t *dbnop, 1741 + struct xfs_buf **fbpp, 1742 + int *findexp, 1743 + int length) 1744 + { 1745 + struct xfs_dir3_icfree_hdr freehdr; 1746 + struct xfs_dir2_free *free = NULL; 1747 + struct xfs_inode *dp = args->dp; 1748 + struct xfs_trans *tp = args->trans; 1749 + struct xfs_buf *fbp = NULL; 1750 + xfs_dir2_db_t firstfbno; 1751 + xfs_dir2_db_t lastfbno; 1752 + xfs_dir2_db_t ifbno = -1; 1753 + xfs_dir2_db_t dbno = -1; 1754 + xfs_dir2_db_t fbno; 1755 + xfs_fileoff_t fo; 1756 + __be16 *bests = NULL; 1757 + int findex = 0; 1758 + int error; 1759 + 1760 + /* 1761 + * If we came in with a freespace block that means that lookup 1762 + * found an entry with our hash value. This is the freespace 1763 + * block for that data entry. 1764 + */ 1765 + if (fblk) { 1766 + fbp = fblk->bp; 1767 + free = fbp->b_addr; 1768 + findex = fblk->index; 1769 + if (findex >= 0) { 1770 + /* caller already found the freespace for us. */ 1771 + bests = dp->d_ops->free_bests_p(free); 1772 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1773 + 1774 + ASSERT(findex < freehdr.nvalid); 1775 + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 1776 + ASSERT(be16_to_cpu(bests[findex]) >= length); 1777 + dbno = freehdr.firstdb + findex; 1778 + goto found_block; 1779 + } 1780 + 1781 + /* 1782 + * The data block looked at didn't have enough room. 
1783 + * We'll start at the beginning of the freespace entries. 1784 + */ 1785 + ifbno = fblk->blkno; 1786 + xfs_trans_brelse(tp, fbp); 1787 + fbp = NULL; 1788 + fblk->bp = NULL; 1789 + } 1790 + 1791 + /* 1792 + * If we don't have a data block yet, we're going to scan the freespace 1793 + * data for a data block with enough free space in it. 1794 + */ 1795 + error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK); 1796 + if (error) 1797 + return error; 1798 + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); 1799 + firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET); 1800 + 1801 + for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) { 1802 + /* If it's ifbno we already looked at it. */ 1803 + if (fbno == ifbno) 1804 + continue; 1805 + 1806 + /* 1807 + * Read the block. There can be holes in the freespace blocks, 1808 + * so this might not succeed. This should be really rare, so 1809 + * there's no reason to avoid it. 1810 + */ 1811 + error = xfs_dir2_free_try_read(tp, dp, 1812 + xfs_dir2_db_to_da(args->geo, fbno), 1813 + &fbp); 1814 + if (error) 1815 + return error; 1816 + if (!fbp) 1817 + continue; 1818 + 1819 + free = fbp->b_addr; 1820 + bests = dp->d_ops->free_bests_p(free); 1821 + dp->d_ops->free_hdr_from_disk(&freehdr, free); 1822 + 1823 + /* Scan the free entry array for a large enough free space. */ 1824 + for (findex = freehdr.nvalid - 1; findex >= 0; findex--) { 1825 + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && 1826 + be16_to_cpu(bests[findex]) >= length) { 1827 + dbno = freehdr.firstdb + findex; 1828 + goto found_block; 1829 + } 1830 + } 1831 + 1832 + /* Didn't find free space, go on to next free block */ 1833 + xfs_trans_brelse(tp, fbp); 1834 + } 1835 + 1836 + found_block: 1837 + *dbnop = dbno; 1838 + *fbpp = fbp; 1839 + *findexp = findex; 1840 + return 0; 1841 + } 1842 + 1843 + 1844 + /* 1845 + * Add the data entry for a node-format directory name addition. 1846 + * The leaf entry is added in xfs_dir2_leafn_add. 
1847 + * We may enter with a freespace block that the lookup found. 1848 + */ 1849 + static int 1850 + xfs_dir2_node_addname_int( 1851 + struct xfs_da_args *args, /* operation arguments */ 1852 + struct xfs_da_state_blk *fblk) /* optional freespace block */ 1853 + { 1854 + struct xfs_dir2_data_unused *dup; /* data unused entry pointer */ 1855 + struct xfs_dir2_data_entry *dep; /* data entry pointer */ 1856 + struct xfs_dir2_data_hdr *hdr; /* data block header */ 1857 + struct xfs_dir2_data_free *bf; 1858 + struct xfs_dir2_free *free = NULL; /* freespace block structure */ 1859 + struct xfs_trans *tp = args->trans; 1860 + struct xfs_inode *dp = args->dp; 1861 + struct xfs_buf *dbp; /* data block buffer */ 1862 + struct xfs_buf *fbp; /* freespace buffer */ 1863 + xfs_dir2_data_aoff_t aoff; 1864 + xfs_dir2_db_t dbno; /* data block number */ 1865 + int error; /* error return value */ 1866 + int findex; /* freespace entry index */ 1867 + int length; /* length of the new entry */ 1868 + int logfree = 0; /* need to log free entry */ 1869 + int needlog = 0; /* need to log data header */ 1870 + int needscan = 0; /* need to rescan data frees */ 1871 + __be16 *tagp; /* data entry tag pointer */ 1872 + __be16 *bests; 1873 + 1874 + length = dp->d_ops->data_entsize(args->namelen); 1875 + error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex, 1876 + length); 1877 + if (error) 1878 + return error; 1879 + 1880 + /* 1881 + * Now we know if we must allocate blocks, so if we are checking whether 1882 + * we can insert without allocation then we can return now. 1883 + */ 1884 + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { 1885 + if (dbno == -1) 1886 + return -ENOSPC; 1887 + return 0; 1888 + } 1889 + 1890 + /* 1891 + * If we don't have a data block, we need to allocate one and make 1892 + * the freespace entries refer to it. 
1893 + */ 1894 + if (dbno == -1) { 1895 + /* we're going to have to log the free block index later */ 1896 + logfree = 1; 1897 + error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp, 1898 + &findex); 1899 + } else { 1900 + /* Read the data block in. */ 1901 + error = xfs_dir3_data_read(tp, dp, 1902 + xfs_dir2_db_to_da(args->geo, dbno), 1903 + -1, &dbp); 1904 + } 1905 + if (error) 1906 + return error; 1907 + 1908 + /* setup for data block up now */ 1909 + hdr = dbp->b_addr; 1910 + bf = dp->d_ops->data_bestfree_p(hdr); 1911 + ASSERT(be16_to_cpu(bf[0].length) >= length); 1912 + 1913 + /* Point to the existing unused space. */ 1914 + dup = (xfs_dir2_data_unused_t *) 1915 + ((char *)hdr + be16_to_cpu(bf[0].offset)); 1916 + 1917 + /* Mark the first part of the unused space, inuse for us. */ 1918 + aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 1919 + error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 1920 + &needlog, &needscan); 1921 + if (error) { 1922 + xfs_trans_brelse(tp, dbp); 1923 + return error; 1924 + } 1925 + 1926 + /* Fill in the new entry and log it. */ 1927 + dep = (xfs_dir2_data_entry_t *)dup; 1928 + dep->inumber = cpu_to_be64(args->inumber); 1929 + dep->namelen = args->namelen; 1930 + memcpy(dep->name, args->name, dep->namelen); 1931 + dp->d_ops->data_put_ftype(dep, args->filetype); 1932 + tagp = dp->d_ops->data_entry_tag_p(dep); 1933 + *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1934 + xfs_dir2_data_log_entry(args, dbp, dep); 1935 + 1936 + /* Rescan the freespace and log the data block if needed. */ 1937 + if (needscan) 1938 + xfs_dir2_data_freescan(dp, hdr, &needlog); 1939 + if (needlog) 1940 + xfs_dir2_data_log_header(args, dbp); 1941 + 1942 + /* If the freespace block entry is now wrong, update it. 
*/ 1943 + free = fbp->b_addr; 1944 + bests = dp->d_ops->free_bests_p(free); 1945 + if (bests[findex] != bf[0].length) { 1946 + bests[findex] = bf[0].length; 1947 + logfree = 1; 1948 + } 1949 + 1950 + /* Log the freespace entry if needed. */ 1951 + if (logfree) 1952 + xfs_dir2_free_log_bests(args, fbp, findex, findex); 1953 + 1954 + /* Return the data block and offset in args. */ 1955 + args->blkno = (xfs_dablk_t)dbno; 1956 + args->index = be16_to_cpu(*tagp); 1957 + return 0; 1958 + } 1959 + 1960 + /* 1612 1961 * Top-level node form directory addname routine. 1613 1962 */ 1614 1963 int /* error */ ··· 2022 1677 done: 2023 1678 xfs_da_state_free(state); 2024 1679 return rval; 2025 - } 2026 - 2027 - /* 2028 - * Add the data entry for a node-format directory name addition. 2029 - * The leaf entry is added in xfs_dir2_leafn_add. 2030 - * We may enter with a freespace block that the lookup found. 2031 - */ 2032 - static int /* error */ 2033 - xfs_dir2_node_addname_int( 2034 - xfs_da_args_t *args, /* operation arguments */ 2035 - xfs_da_state_blk_t *fblk) /* optional freespace block */ 2036 - { 2037 - xfs_dir2_data_hdr_t *hdr; /* data block header */ 2038 - xfs_dir2_db_t dbno; /* data block number */ 2039 - struct xfs_buf *dbp; /* data block buffer */ 2040 - xfs_dir2_data_entry_t *dep; /* data entry pointer */ 2041 - xfs_inode_t *dp; /* incore directory inode */ 2042 - xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ 2043 - int error; /* error return value */ 2044 - xfs_dir2_db_t fbno; /* freespace block number */ 2045 - struct xfs_buf *fbp; /* freespace buffer */ 2046 - int findex; /* freespace entry index */ 2047 - xfs_dir2_free_t *free=NULL; /* freespace block structure */ 2048 - xfs_dir2_db_t ifbno; /* initial freespace block no */ 2049 - xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ 2050 - int length; /* length of the new entry */ 2051 - int logfree; /* need to log free entry */ 2052 - xfs_mount_t *mp; /* filesystem mount point */ 2053 - int 
needlog; /* need to log data header */ 2054 - int needscan; /* need to rescan data frees */ 2055 - __be16 *tagp; /* data entry tag pointer */ 2056 - xfs_trans_t *tp; /* transaction pointer */ 2057 - __be16 *bests; 2058 - struct xfs_dir3_icfree_hdr freehdr; 2059 - struct xfs_dir2_data_free *bf; 2060 - xfs_dir2_data_aoff_t aoff; 2061 - 2062 - dp = args->dp; 2063 - mp = dp->i_mount; 2064 - tp = args->trans; 2065 - length = dp->d_ops->data_entsize(args->namelen); 2066 - /* 2067 - * If we came in with a freespace block that means that lookup 2068 - * found an entry with our hash value. This is the freespace 2069 - * block for that data entry. 2070 - */ 2071 - if (fblk) { 2072 - fbp = fblk->bp; 2073 - /* 2074 - * Remember initial freespace block number. 2075 - */ 2076 - ifbno = fblk->blkno; 2077 - free = fbp->b_addr; 2078 - findex = fblk->index; 2079 - bests = dp->d_ops->free_bests_p(free); 2080 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2081 - 2082 - /* 2083 - * This means the free entry showed that the data block had 2084 - * space for our entry, so we remembered it. 2085 - * Use that data block. 2086 - */ 2087 - if (findex >= 0) { 2088 - ASSERT(findex < freehdr.nvalid); 2089 - ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 2090 - ASSERT(be16_to_cpu(bests[findex]) >= length); 2091 - dbno = freehdr.firstdb + findex; 2092 - } else { 2093 - /* 2094 - * The data block looked at didn't have enough room. 2095 - * We'll start at the beginning of the freespace entries. 2096 - */ 2097 - dbno = -1; 2098 - findex = 0; 2099 - } 2100 - } else { 2101 - /* 2102 - * Didn't come in with a freespace block, so no data block. 2103 - */ 2104 - ifbno = dbno = -1; 2105 - fbp = NULL; 2106 - findex = 0; 2107 - } 2108 - 2109 - /* 2110 - * If we don't have a data block yet, we're going to scan the 2111 - * freespace blocks looking for one. Figure out what the 2112 - * highest freespace block number is. 
2113 - */ 2114 - if (dbno == -1) { 2115 - xfs_fileoff_t fo; /* freespace block number */ 2116 - 2117 - if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) 2118 - return error; 2119 - lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); 2120 - fbno = ifbno; 2121 - } 2122 - /* 2123 - * While we haven't identified a data block, search the freeblock 2124 - * data for a good data block. If we find a null freeblock entry, 2125 - * indicating a hole in the data blocks, remember that. 2126 - */ 2127 - while (dbno == -1) { 2128 - /* 2129 - * If we don't have a freeblock in hand, get the next one. 2130 - */ 2131 - if (fbp == NULL) { 2132 - /* 2133 - * Happens the first time through unless lookup gave 2134 - * us a freespace block to start with. 2135 - */ 2136 - if (++fbno == 0) 2137 - fbno = xfs_dir2_byte_to_db(args->geo, 2138 - XFS_DIR2_FREE_OFFSET); 2139 - /* 2140 - * If it's ifbno we already looked at it. 2141 - */ 2142 - if (fbno == ifbno) 2143 - fbno++; 2144 - /* 2145 - * If it's off the end we're done. 2146 - */ 2147 - if (fbno >= lastfbno) 2148 - break; 2149 - /* 2150 - * Read the block. There can be holes in the 2151 - * freespace blocks, so this might not succeed. 2152 - * This should be really rare, so there's no reason 2153 - * to avoid it. 2154 - */ 2155 - error = xfs_dir2_free_try_read(tp, dp, 2156 - xfs_dir2_db_to_da(args->geo, fbno), 2157 - &fbp); 2158 - if (error) 2159 - return error; 2160 - if (!fbp) 2161 - continue; 2162 - free = fbp->b_addr; 2163 - findex = 0; 2164 - } 2165 - /* 2166 - * Look at the current free entry. Is it good enough? 2167 - * 2168 - * The bests initialisation should be where the bufer is read in 2169 - * the above branch. But gcc is too stupid to realise that bests 2170 - * and the freehdr are actually initialised if they are placed 2171 - * there, so we have to do it here to avoid warnings. Blech. 
2172 - */ 2173 - bests = dp->d_ops->free_bests_p(free); 2174 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2175 - if (be16_to_cpu(bests[findex]) != NULLDATAOFF && 2176 - be16_to_cpu(bests[findex]) >= length) 2177 - dbno = freehdr.firstdb + findex; 2178 - else { 2179 - /* 2180 - * Are we done with the freeblock? 2181 - */ 2182 - if (++findex == freehdr.nvalid) { 2183 - /* 2184 - * Drop the block. 2185 - */ 2186 - xfs_trans_brelse(tp, fbp); 2187 - fbp = NULL; 2188 - if (fblk && fblk->bp) 2189 - fblk->bp = NULL; 2190 - } 2191 - } 2192 - } 2193 - /* 2194 - * If we don't have a data block, we need to allocate one and make 2195 - * the freespace entries refer to it. 2196 - */ 2197 - if (unlikely(dbno == -1)) { 2198 - /* 2199 - * Not allowed to allocate, return failure. 2200 - */ 2201 - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) 2202 - return -ENOSPC; 2203 - 2204 - /* 2205 - * Allocate and initialize the new data block. 2206 - */ 2207 - if (unlikely((error = xfs_dir2_grow_inode(args, 2208 - XFS_DIR2_DATA_SPACE, 2209 - &dbno)) || 2210 - (error = xfs_dir3_data_init(args, dbno, &dbp)))) 2211 - return error; 2212 - 2213 - /* 2214 - * If (somehow) we have a freespace block, get rid of it. 2215 - */ 2216 - if (fbp) 2217 - xfs_trans_brelse(tp, fbp); 2218 - if (fblk && fblk->bp) 2219 - fblk->bp = NULL; 2220 - 2221 - /* 2222 - * Get the freespace block corresponding to the data block 2223 - * that was just allocated. 2224 - */ 2225 - fbno = dp->d_ops->db_to_fdb(args->geo, dbno); 2226 - error = xfs_dir2_free_try_read(tp, dp, 2227 - xfs_dir2_db_to_da(args->geo, fbno), 2228 - &fbp); 2229 - if (error) 2230 - return error; 2231 - 2232 - /* 2233 - * If there wasn't a freespace block, the read will 2234 - * return a NULL fbp. Allocate and initialize a new one. 
2235 - */ 2236 - if (!fbp) { 2237 - error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, 2238 - &fbno); 2239 - if (error) 2240 - return error; 2241 - 2242 - if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { 2243 - xfs_alert(mp, 2244 - "%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", 2245 - __func__, (unsigned long long)dp->i_ino, 2246 - (long long)dp->d_ops->db_to_fdb( 2247 - args->geo, dbno), 2248 - (long long)dbno, (long long)fbno, 2249 - (unsigned long long)ifbno, lastfbno); 2250 - if (fblk) { 2251 - xfs_alert(mp, 2252 - " fblk "PTR_FMT" blkno %llu index %d magic 0x%x", 2253 - fblk, 2254 - (unsigned long long)fblk->blkno, 2255 - fblk->index, 2256 - fblk->magic); 2257 - } else { 2258 - xfs_alert(mp, " ... fblk is NULL"); 2259 - } 2260 - XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 2261 - XFS_ERRLEVEL_LOW, mp); 2262 - return -EFSCORRUPTED; 2263 - } 2264 - 2265 - /* 2266 - * Get a buffer for the new block. 2267 - */ 2268 - error = xfs_dir3_free_get_buf(args, fbno, &fbp); 2269 - if (error) 2270 - return error; 2271 - free = fbp->b_addr; 2272 - bests = dp->d_ops->free_bests_p(free); 2273 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2274 - 2275 - /* 2276 - * Remember the first slot as our empty slot. 2277 - */ 2278 - freehdr.firstdb = 2279 - (fbno - xfs_dir2_byte_to_db(args->geo, 2280 - XFS_DIR2_FREE_OFFSET)) * 2281 - dp->d_ops->free_max_bests(args->geo); 2282 - } else { 2283 - free = fbp->b_addr; 2284 - bests = dp->d_ops->free_bests_p(free); 2285 - dp->d_ops->free_hdr_from_disk(&freehdr, free); 2286 - } 2287 - 2288 - /* 2289 - * Set the freespace block index from the data block number. 2290 - */ 2291 - findex = dp->d_ops->db_to_fdindex(args->geo, dbno); 2292 - /* 2293 - * If it's after the end of the current entries in the 2294 - * freespace block, extend that table. 
2295 - */ 2296 - if (findex >= freehdr.nvalid) { 2297 - ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); 2298 - freehdr.nvalid = findex + 1; 2299 - /* 2300 - * Tag new entry so nused will go up. 2301 - */ 2302 - bests[findex] = cpu_to_be16(NULLDATAOFF); 2303 - } 2304 - /* 2305 - * If this entry was for an empty data block 2306 - * (this should always be true) then update the header. 2307 - */ 2308 - if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { 2309 - freehdr.nused++; 2310 - dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); 2311 - xfs_dir2_free_log_header(args, fbp); 2312 - } 2313 - /* 2314 - * Update the real value in the table. 2315 - * We haven't allocated the data entry yet so this will 2316 - * change again. 2317 - */ 2318 - hdr = dbp->b_addr; 2319 - bf = dp->d_ops->data_bestfree_p(hdr); 2320 - bests[findex] = bf[0].length; 2321 - logfree = 1; 2322 - } 2323 - /* 2324 - * We had a data block so we don't have to make a new one. 2325 - */ 2326 - else { 2327 - /* 2328 - * If just checking, we succeeded. 2329 - */ 2330 - if (args->op_flags & XFS_DA_OP_JUSTCHECK) 2331 - return 0; 2332 - 2333 - /* 2334 - * Read the data block in. 2335 - */ 2336 - error = xfs_dir3_data_read(tp, dp, 2337 - xfs_dir2_db_to_da(args->geo, dbno), 2338 - -1, &dbp); 2339 - if (error) 2340 - return error; 2341 - hdr = dbp->b_addr; 2342 - bf = dp->d_ops->data_bestfree_p(hdr); 2343 - logfree = 0; 2344 - } 2345 - ASSERT(be16_to_cpu(bf[0].length) >= length); 2346 - /* 2347 - * Point to the existing unused space. 2348 - */ 2349 - dup = (xfs_dir2_data_unused_t *) 2350 - ((char *)hdr + be16_to_cpu(bf[0].offset)); 2351 - needscan = needlog = 0; 2352 - /* 2353 - * Mark the first part of the unused space, inuse for us. 
2354 - */ 2355 - aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 2356 - error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 2357 - &needlog, &needscan); 2358 - if (error) { 2359 - xfs_trans_brelse(tp, dbp); 2360 - return error; 2361 - } 2362 - /* 2363 - * Fill in the new entry and log it. 2364 - */ 2365 - dep = (xfs_dir2_data_entry_t *)dup; 2366 - dep->inumber = cpu_to_be64(args->inumber); 2367 - dep->namelen = args->namelen; 2368 - memcpy(dep->name, args->name, dep->namelen); 2369 - dp->d_ops->data_put_ftype(dep, args->filetype); 2370 - tagp = dp->d_ops->data_entry_tag_p(dep); 2371 - *tagp = cpu_to_be16((char *)dep - (char *)hdr); 2372 - xfs_dir2_data_log_entry(args, dbp, dep); 2373 - /* 2374 - * Rescan the block for bestfree if needed. 2375 - */ 2376 - if (needscan) 2377 - xfs_dir2_data_freescan(dp, hdr, &needlog); 2378 - /* 2379 - * Log the data block header if needed. 2380 - */ 2381 - if (needlog) 2382 - xfs_dir2_data_log_header(args, dbp); 2383 - /* 2384 - * If the freespace entry is now wrong, update it. 2385 - */ 2386 - bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ 2387 - if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { 2388 - bests[findex] = bf[0].length; 2389 - logfree = 1; 2390 - } 2391 - /* 2392 - * Log the freespace entry if needed. 2393 - */ 2394 - if (logfree) 2395 - xfs_dir2_free_log_bests(args, fbp, findex, findex); 2396 - /* 2397 - * Return the data block and offset in args, then drop the data block. 2398 - */ 2399 - args->blkno = (xfs_dablk_t)dbno; 2400 - args->index = be16_to_cpu(*tagp); 2401 - return 0; 2402 1680 } 2403 1681 2404 1682 /*

+4 -4

fs/xfs/libxfs/xfs_dir2_sf.c

··· 164 164 * can free the block and copy the formatted data into the inode literal 165 165 * area. 166 166 */ 167 - dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); 167 + dst = kmem_alloc(mp->m_sb.sb_inodesize, 0); 168 168 hdr = bp->b_addr; 169 169 170 170 /* ··· 436 436 437 437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 438 438 old_isize = (int)dp->i_d.di_size; 439 - buf = kmem_alloc(old_isize, KM_SLEEP); 439 + buf = kmem_alloc(old_isize, 0); 440 440 oldsfp = (xfs_dir2_sf_hdr_t *)buf; 441 441 memcpy(oldsfp, sfp, old_isize); 442 442 /* ··· 1096 1096 * Don't want xfs_idata_realloc copying the data here. 1097 1097 */ 1098 1098 oldsize = dp->i_df.if_bytes; 1099 - buf = kmem_alloc(oldsize, KM_SLEEP); 1099 + buf = kmem_alloc(oldsize, 0); 1100 1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1101 1101 ASSERT(oldsfp->i8count == 1); 1102 1102 memcpy(buf, oldsfp, oldsize); ··· 1169 1169 * Don't want xfs_idata_realloc copying the data here. 1170 1170 */ 1171 1171 oldsize = dp->i_df.if_bytes; 1172 - buf = kmem_alloc(oldsize, KM_SLEEP); 1172 + buf = kmem_alloc(oldsize, 0); 1173 1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1174 1174 ASSERT(oldsfp->i8count == 0); 1175 1175 memcpy(buf, oldsfp, oldsize);

+1 -1

fs/xfs/libxfs/xfs_fs.h

··· 287 287 uint32_t ag_ifree; /* o: inodes free */ 288 288 uint32_t ag_sick; /* o: sick things in ag */ 289 289 uint32_t ag_checked; /* o: checked metadata in ag */ 290 - uint32_t ag_reserved32; /* o: zero */ 290 + uint32_t ag_flags; /* i/o: flags for this ag */ 291 291 uint64_t ag_reserved[12];/* o: zero */ 292 292 }; 293 293 #define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */

+7 -2

fs/xfs/libxfs/xfs_ialloc.c

··· 2787 2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, 2788 2788 inodes); 2789 2789 2790 - /* Set the maximum inode count for this filesystem. */ 2791 - if (sbp->sb_imax_pct) { 2790 + /* 2791 + * Set the maximum inode count for this filesystem, being careful not 2792 + * to use obviously garbage sb_inopblog/sb_inopblock values. Regular 2793 + * users should never get here due to failing sb verification, but 2794 + * certain users (xfs_db) need to be usable even with corrupt metadata. 2795 + */ 2796 + if (sbp->sb_imax_pct && igeo->ialloc_blks) { 2792 2797 /* 2793 2798 * Make sure the maximum inode count is a multiple 2794 2799 * of the units we allocate inodes in.

+4 -4

fs/xfs/libxfs/xfs_iext_tree.c

··· 616 616 * sequence counter is seen before the modifications to the extent tree itself 617 617 * take effect. 618 618 */ 619 - static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 619 + static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) 620 620 { 621 621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); 622 622 } ··· 633 633 struct xfs_iext_leaf *new = NULL; 634 634 int nr_entries, i; 635 635 636 - xfs_iext_inc_seq(ifp, state); 636 + xfs_iext_inc_seq(ifp); 637 637 638 638 if (ifp->if_height == 0) 639 639 xfs_iext_alloc_root(ifp, cur); ··· 875 875 ASSERT(ifp->if_u1.if_root != NULL); 876 876 ASSERT(xfs_iext_valid(ifp, cur)); 877 877 878 - xfs_iext_inc_seq(ifp, state); 878 + xfs_iext_inc_seq(ifp); 879 879 880 880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; 881 881 for (i = cur->pos; i < nr_entries; i++) ··· 983 983 { 984 984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); 985 985 986 - xfs_iext_inc_seq(ifp, state); 986 + xfs_iext_inc_seq(ifp); 987 987 988 988 if (cur->pos == 0) { 989 989 struct xfs_bmbt_irec old;

+8 -8

fs/xfs/libxfs/xfs_inode_fork.c

··· 94 94 return 0; 95 95 96 96 ASSERT(ip->i_afp == NULL); 97 - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 97 + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); 98 98 99 99 switch (dip->di_aformat) { 100 100 case XFS_DINODE_FMT_LOCAL: ··· 147 147 148 148 if (size) { 149 149 real_size = roundup(mem_size, 4); 150 - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 150 + ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); 151 151 memcpy(ifp->if_u1.if_data, data, size); 152 152 if (zero_terminate) 153 153 ifp->if_u1.if_data[size] = '\0'; ··· 302 302 } 303 303 304 304 ifp->if_broot_bytes = size; 305 - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 305 + ifp->if_broot = kmem_alloc(size, KM_NOFS); 306 306 ASSERT(ifp->if_broot != NULL); 307 307 /* 308 308 * Copy and convert from the on-disk structure ··· 367 367 */ 368 368 if (ifp->if_broot_bytes == 0) { 369 369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); 370 - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 370 + ifp->if_broot = kmem_alloc(new_size, KM_NOFS); 371 371 ifp->if_broot_bytes = (int)new_size; 372 372 return; 373 373 } ··· 382 382 new_max = cur_max + rec_diff; 383 383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); 384 384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 385 - KM_SLEEP | KM_NOFS); 385 + KM_NOFS); 386 386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 387 387 ifp->if_broot_bytes); 388 388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ··· 408 408 else 409 409 new_size = 0; 410 410 if (new_size > 0) { 411 - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 411 + new_broot = kmem_alloc(new_size, KM_NOFS); 412 412 /* 413 413 * First copy over the btree block header. 414 414 */ ··· 492 492 * We enforce that here. 
493 493 */ 494 494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, 495 - roundup(new_size, 4), KM_SLEEP | KM_NOFS); 495 + roundup(new_size, 4), KM_NOFS); 496 496 ifp->if_bytes = new_size; 497 497 } 498 498 ··· 683 683 return; 684 684 685 685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 686 - KM_SLEEP | KM_NOFS); 686 + KM_NOFS); 687 687 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 688 688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 689 689 ip->i_cnextents = 0;

+20 -30

fs/xfs/libxfs/xfs_refcount.c

··· 1174 1174 /* 1175 1175 * Record a refcount intent for later processing. 1176 1176 */ 1177 - static int 1177 + static void 1178 1178 __xfs_refcount_add( 1179 1179 struct xfs_trans *tp, 1180 1180 enum xfs_refcount_intent_type type, ··· 1189 1189 blockcount); 1190 1190 1191 1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent), 1192 - KM_SLEEP | KM_NOFS); 1192 + KM_NOFS); 1193 1193 INIT_LIST_HEAD(&ri->ri_list); 1194 1194 ri->ri_type = type; 1195 1195 ri->ri_startblock = startblock; 1196 1196 ri->ri_blockcount = blockcount; 1197 1197 1198 1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); 1199 - return 0; 1200 1199 } 1201 1200 1202 1201 /* 1203 1202 * Increase the reference count of the blocks backing a file's extent. 1204 1203 */ 1205 - int 1204 + void 1206 1205 xfs_refcount_increase_extent( 1207 1206 struct xfs_trans *tp, 1208 1207 struct xfs_bmbt_irec *PREV) 1209 1208 { 1210 1209 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1211 - return 0; 1210 + return; 1212 1211 1213 - return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, 1214 - PREV->br_startblock, PREV->br_blockcount); 1212 + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, 1213 + PREV->br_blockcount); 1215 1214 } 1216 1215 1217 1216 /* 1218 1217 * Decrease the reference count of the blocks backing a file's extent. 1219 1218 */ 1220 - int 1219 + void 1221 1220 xfs_refcount_decrease_extent( 1222 1221 struct xfs_trans *tp, 1223 1222 struct xfs_bmbt_irec *PREV) 1224 1223 { 1225 1224 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1226 - return 0; 1225 + return; 1227 1226 1228 - return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, 1229 - PREV->br_startblock, PREV->br_blockcount); 1227 + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, 1228 + PREV->br_blockcount); 1230 1229 } 1231 1230 1232 1231 /* ··· 1540 1541 } 1541 1542 1542 1543 /* Record a CoW staging extent in the refcount btree. 
*/ 1543 - int 1544 + void 1544 1545 xfs_refcount_alloc_cow_extent( 1545 1546 struct xfs_trans *tp, 1546 1547 xfs_fsblock_t fsb, 1547 1548 xfs_extlen_t len) 1548 1549 { 1549 1550 struct xfs_mount *mp = tp->t_mountp; 1550 - int error; 1551 1551 1552 1552 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1553 - return 0; 1553 + return; 1554 1554 1555 - error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1556 - if (error) 1557 - return error; 1555 + __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1558 1556 1559 1557 /* Add rmap entry */ 1560 - return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1558 + xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1561 1559 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1562 1560 } 1563 1561 1564 1562 /* Forget a CoW staging event in the refcount btree. */ 1565 - int 1563 + void 1566 1564 xfs_refcount_free_cow_extent( 1567 1565 struct xfs_trans *tp, 1568 1566 xfs_fsblock_t fsb, 1569 1567 xfs_extlen_t len) 1570 1568 { 1571 1569 struct xfs_mount *mp = tp->t_mountp; 1572 - int error; 1573 1570 1574 1571 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1575 - return 0; 1572 + return; 1576 1573 1577 1574 /* Remove rmap entry */ 1578 - error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1575 + xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1579 1576 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1580 - if (error) 1581 - return error; 1582 - 1583 - return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); 1577 + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); 1584 1578 } 1585 1579 1586 1580 struct xfs_refcount_recovery { ··· 1594 1602 if (be32_to_cpu(rec->refc.rc_refcount) != 1) 1595 1603 return -EFSCORRUPTED; 1596 1604 1597 - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); 1605 + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); 1598 1606 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 1599 1607 list_add_tail(&rr->rr_list, debris); 1600 1608 ··· 1671 1679 /* 
Free the orphan record */ 1672 1680 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; 1673 1681 fsb = XFS_AGB_TO_FSB(mp, agno, agbno); 1674 - error = xfs_refcount_free_cow_extent(tp, fsb, 1682 + xfs_refcount_free_cow_extent(tp, fsb, 1675 1683 rr->rr_rrec.rc_blockcount); 1676 - if (error) 1677 - goto out_trans; 1678 1684 1679 1685 /* Free the block. */ 1680 1686 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);

+6 -6

fs/xfs/libxfs/xfs_refcount.h

··· 29 29 xfs_extlen_t ri_blockcount; 30 30 }; 31 31 32 - extern int xfs_refcount_increase_extent(struct xfs_trans *tp, 32 + void xfs_refcount_increase_extent(struct xfs_trans *tp, 33 33 struct xfs_bmbt_irec *irec); 34 - extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, 34 + void xfs_refcount_decrease_extent(struct xfs_trans *tp, 35 35 struct xfs_bmbt_irec *irec); 36 36 37 37 extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, ··· 45 45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 46 46 xfs_extlen_t *flen, bool find_end_of_shared); 47 47 48 - extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, 49 - xfs_fsblock_t fsb, xfs_extlen_t len); 50 - extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, 51 - xfs_fsblock_t fsb, xfs_extlen_t len); 48 + void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, 49 + xfs_extlen_t len); 50 + void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, 51 + xfs_extlen_t len); 52 52 extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 53 53 xfs_agnumber_t agno); 54 54

+30 -29

fs/xfs/libxfs/xfs_rmap.c

··· 168 168 union xfs_btree_rec *rec, 169 169 struct xfs_rmap_irec *irec) 170 170 { 171 - irec->rm_flags = 0; 172 171 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); 173 172 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); 174 173 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); ··· 253 254 rec->rm_flags); 254 255 255 256 if (rec->rm_owner != info->high.rm_owner) 256 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 257 + return 0; 257 258 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 258 259 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 259 260 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) 260 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 261 + return 0; 261 262 262 263 *info->irec = *rec; 263 264 *info->stat = 1; 264 - return XFS_BTREE_QUERY_RANGE_ABORT; 265 + return -ECANCELED; 265 266 } 266 267 267 268 /* ··· 304 305 305 306 error = xfs_rmap_query_range(cur, &info.high, &info.high, 306 307 xfs_rmap_find_left_neighbor_helper, &info); 307 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) 308 + if (error == -ECANCELED) 308 309 error = 0; 309 310 if (*stat) 310 311 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, ··· 329 330 rec->rm_flags); 330 331 331 332 if (rec->rm_owner != info->high.rm_owner) 332 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 333 + return 0; 333 334 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 334 335 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 335 336 (rec->rm_offset > info->high.rm_offset || 336 337 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) 337 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 338 + return 0; 338 339 339 340 *info->irec = *rec; 340 341 *info->stat = 1; 341 - return XFS_BTREE_QUERY_RANGE_ABORT; 342 + return -ECANCELED; 342 343 } 343 344 344 345 /* ··· 376 377 cur->bc_private.a.agno, bno, 0, owner, offset, flags); 377 378 error = xfs_rmap_query_range(cur, &info.high, &info.high, 378 379 xfs_rmap_lookup_le_range_helper, &info); 379 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) 380 + if (error == 
-ECANCELED) 380 381 error = 0; 381 382 if (*stat) 382 383 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, ··· 2267 2268 * Record a rmap intent; the list is kept sorted first by AG and then by 2268 2269 * increasing age. 2269 2270 */ 2270 - static int 2271 + static void 2271 2272 __xfs_rmap_add( 2272 2273 struct xfs_trans *tp, 2273 2274 enum xfs_rmap_intent_type type, ··· 2286 2287 bmap->br_blockcount, 2287 2288 bmap->br_state); 2288 2289 2289 - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); 2290 + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); 2290 2291 INIT_LIST_HEAD(&ri->ri_list); 2291 2292 ri->ri_type = type; 2292 2293 ri->ri_owner = owner; ··· 2294 2295 ri->ri_bmap = *bmap; 2295 2296 2296 2297 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); 2297 - return 0; 2298 2298 } 2299 2299 2300 2300 /* Map an extent into a file. */ 2301 - int 2301 + void 2302 2302 xfs_rmap_map_extent( 2303 2303 struct xfs_trans *tp, 2304 2304 struct xfs_inode *ip, ··· 2305 2307 struct xfs_bmbt_irec *PREV) 2306 2308 { 2307 2309 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2308 - return 0; 2310 + return; 2309 2311 2310 - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2312 + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2311 2313 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, 2312 2314 whichfork, PREV); 2313 2315 } 2314 2316 2315 2317 /* Unmap an extent out of a file. */ 2316 - int 2318 + void 2317 2319 xfs_rmap_unmap_extent( 2318 2320 struct xfs_trans *tp, 2319 2321 struct xfs_inode *ip, ··· 2321 2323 struct xfs_bmbt_irec *PREV) 2322 2324 { 2323 2325 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2324 - return 0; 2326 + return; 2325 2327 2326 - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2328 + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 
2327 2329 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, 2328 2330 whichfork, PREV); 2329 2331 } ··· 2334 2336 * Note that tp can be NULL here as no transaction is used for COW fork 2335 2337 * unwritten conversion. 2336 2338 */ 2337 - int 2339 + void 2338 2340 xfs_rmap_convert_extent( 2339 2341 struct xfs_mount *mp, 2340 2342 struct xfs_trans *tp, ··· 2343 2345 struct xfs_bmbt_irec *PREV) 2344 2346 { 2345 2347 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2346 - return 0; 2348 + return; 2347 2349 2348 - return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2350 + __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2349 2351 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, 2350 2352 whichfork, PREV); 2351 2353 } 2352 2354 2353 2355 /* Schedule the creation of an rmap for non-file data. */ 2354 - int 2356 + void 2355 2357 xfs_rmap_alloc_extent( 2356 2358 struct xfs_trans *tp, 2357 2359 xfs_agnumber_t agno, ··· 2362 2364 struct xfs_bmbt_irec bmap; 2363 2365 2364 2366 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2365 - return 0; 2367 + return; 2366 2368 2367 2369 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2368 2370 bmap.br_blockcount = len; 2369 2371 bmap.br_startoff = 0; 2370 2372 bmap.br_state = XFS_EXT_NORM; 2371 2373 2372 - return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2374 + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2373 2375 } 2374 2376 2375 2377 /* Schedule the deletion of an rmap for non-file data. 
*/ 2376 - int 2378 + void 2377 2379 xfs_rmap_free_extent( 2378 2380 struct xfs_trans *tp, 2379 2381 xfs_agnumber_t agno, ··· 2384 2386 struct xfs_bmbt_irec bmap; 2385 2387 2386 2388 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2387 - return 0; 2389 + return; 2388 2390 2389 2391 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2390 2392 bmap.br_blockcount = len; 2391 2393 bmap.br_startoff = 0; 2392 2394 bmap.br_state = XFS_EXT_NORM; 2393 2395 2394 - return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2396 + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2395 2397 } 2396 2398 2397 2399 /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ ··· 2509 2511 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) 2510 2512 return 0; 2511 2513 rks->has_rmap = true; 2512 - return XFS_BTREE_QUERY_RANGE_ABORT; 2514 + return -ECANCELED; 2513 2515 } 2514 2516 2515 2517 /* ··· 2538 2540 2539 2541 error = xfs_rmap_query_range(cur, &low, &high, 2540 2542 xfs_rmap_has_other_keys_helper, &rks); 2543 + if (error < 0) 2544 + return error; 2545 + 2541 2546 *has_rmap = rks.has_rmap; 2542 - return error; 2547 + return 0; 2543 2548 } 2544 2549 2545 2550 const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {

+6 -5

fs/xfs/libxfs/xfs_rmap.h

··· 68 68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) 69 69 return -EFSCORRUPTED; 70 70 irec->rm_offset = XFS_RMAP_OFF(offset); 71 + irec->rm_flags = 0; 71 72 if (offset & XFS_RMAP_OFF_ATTR_FORK) 72 73 irec->rm_flags |= XFS_RMAP_ATTR_FORK; 73 74 if (offset & XFS_RMAP_OFF_BMBT_BLOCK) ··· 162 161 }; 163 162 164 163 /* functions for updating the rmapbt based on bmbt map/unmap operations */ 165 - int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 164 + void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 166 165 int whichfork, struct xfs_bmbt_irec *imap); 167 - int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 166 + void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 168 167 int whichfork, struct xfs_bmbt_irec *imap); 169 - int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 168 + void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 170 169 struct xfs_inode *ip, int whichfork, 171 170 struct xfs_bmbt_irec *imap); 172 - int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 171 + void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 173 172 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 174 - int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 173 + void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 175 174 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 176 175 177 176 void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,

-6

fs/xfs/libxfs/xfs_shared.h

··· 177 177 unsigned int agino_log; /* #bits for agino in inum */ 178 178 }; 179 179 180 - /* Keep iterating the data structure. */ 181 - #define XFS_ITER_CONTINUE (0) 182 - 183 - /* Stop iterating the data structure. */ 184 - #define XFS_ITER_ABORT (1) 185 - 186 180 #endif /* __XFS_SHARED_H__ */

+8

fs/xfs/libxfs/xfs_types.h

··· 169 169 xfs_exntst_t br_state; /* extent state */ 170 170 } xfs_bmbt_irec_t; 171 171 172 + /* per-AG block reservation types */ 173 + enum xfs_ag_resv_type { 174 + XFS_AG_RESV_NONE = 0, 175 + XFS_AG_RESV_AGFL, 176 + XFS_AG_RESV_METADATA, 177 + XFS_AG_RESV_RMAPBT, 178 + }; 179 + 172 180 /* 173 181 * Type verifier functions 174 182 */

+2 -2

fs/xfs/scrub/agheader.c

··· 639 639 xchk_agfl_block_xref(sc, agbno); 640 640 641 641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 642 - return XFS_ITER_ABORT; 642 + return -ECANCELED; 643 643 644 644 return 0; 645 645 } ··· 730 730 /* Check the blocks in the AGFL. */ 731 731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), 732 732 sc->sa.agfl_bp, xchk_agfl_block, &sai); 733 - if (error == XFS_ITER_ABORT) { 733 + if (error == -ECANCELED) { 734 734 error = 0; 735 735 goto out_free; 736 736 }

+2 -4

fs/xfs/scrub/attr.c

··· 80 80 * without the inode lock held, which means we can sleep. 81 81 */ 82 82 if (sc->flags & XCHK_TRY_HARDER) { 83 - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); 83 + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); 84 84 if (error) 85 85 return error; 86 86 } ··· 163 163 args.valuelen = valuelen; 164 164 165 165 error = xfs_attr_get_ilocked(context->dp, &args); 166 - if (error == -EEXIST) 167 - error = 0; 168 166 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, 169 167 &error)) 170 168 goto fail_xref; ··· 171 173 args.blkno); 172 174 fail_xref: 173 175 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 174 - context->seen_enough = XFS_ITER_ABORT; 176 + context->seen_enough = 1; 175 177 return; 176 178 } 177 179

+48 -33

fs/xfs/scrub/bmap.c

··· 75 75 xfs_fileoff_t lastoff; 76 76 bool is_rt; 77 77 bool is_shared; 78 + bool was_loaded; 78 79 int whichfork; 79 80 }; 80 81 ··· 214 213 215 214 /* Cross-reference a single rtdev extent record. */ 216 215 STATIC void 217 - xchk_bmap_rt_extent_xref( 218 - struct xchk_bmap_info *info, 216 + xchk_bmap_rt_iextent_xref( 219 217 struct xfs_inode *ip, 220 - struct xfs_btree_cur *cur, 218 + struct xchk_bmap_info *info, 221 219 struct xfs_bmbt_irec *irec) 222 220 { 223 - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 224 - return; 225 - 226 221 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 227 222 irec->br_blockcount); 228 223 } 229 224 230 225 /* Cross-reference a single datadev extent record. */ 231 226 STATIC void 232 - xchk_bmap_extent_xref( 233 - struct xchk_bmap_info *info, 227 + xchk_bmap_iextent_xref( 234 228 struct xfs_inode *ip, 235 - struct xfs_btree_cur *cur, 229 + struct xchk_bmap_info *info, 236 230 struct xfs_bmbt_irec *irec) 237 231 { 238 232 struct xfs_mount *mp = info->sc->mp; ··· 235 239 xfs_agblock_t agbno; 236 240 xfs_extlen_t len; 237 241 int error; 238 - 239 - if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 240 - return; 241 242 242 243 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); 243 244 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); ··· 293 300 294 301 /* Scrub a single extent record. */ 295 302 STATIC int 296 - xchk_bmap_extent( 303 + xchk_bmap_iextent( 297 304 struct xfs_inode *ip, 298 - struct xfs_btree_cur *cur, 299 305 struct xchk_bmap_info *info, 300 306 struct xfs_bmbt_irec *irec) 301 307 { 302 308 struct xfs_mount *mp = info->sc->mp; 303 - struct xfs_buf *bp = NULL; 304 309 xfs_filblks_t end; 305 310 int error = 0; 306 - 307 - if (cur) 308 - xfs_btree_get_block(cur, 0, &bp); 309 311 310 312 /* 311 313 * Check for out-of-order extents. 
This record could have come ··· 352 364 xchk_fblock_set_corrupt(info->sc, info->whichfork, 353 365 irec->br_startoff); 354 366 367 + if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 368 + return 0; 369 + 355 370 if (info->is_rt) 356 - xchk_bmap_rt_extent_xref(info, ip, cur, irec); 371 + xchk_bmap_rt_iextent_xref(ip, info, irec); 357 372 else 358 - xchk_bmap_extent_xref(info, ip, cur, irec); 373 + xchk_bmap_iextent_xref(ip, info, irec); 359 374 360 375 info->lastoff = irec->br_startoff + irec->br_blockcount; 361 376 return error; ··· 371 380 union xfs_btree_rec *rec) 372 381 { 373 382 struct xfs_bmbt_irec irec; 383 + struct xfs_bmbt_irec iext_irec; 384 + struct xfs_iext_cursor icur; 374 385 struct xchk_bmap_info *info = bs->private; 375 386 struct xfs_inode *ip = bs->cur->bc_private.b.ip; 376 387 struct xfs_buf *bp = NULL; 377 388 struct xfs_btree_block *block; 389 + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); 378 390 uint64_t owner; 379 391 int i; 380 392 ··· 396 402 } 397 403 } 398 404 399 - /* Set up the in-core record and scrub it. */ 405 + /* 406 + * Check that the incore extent tree contains an extent that matches 407 + * this one exactly. We validate those cached bmaps later, so we don't 408 + * need to check them here. If the incore extent tree was just loaded 409 + * from disk by the scrubber, we assume that its contents match what's 410 + * on disk (we still hold the ILOCK) and skip the equivalence check. 
411 + */ 412 + if (!info->was_loaded) 413 + return 0; 414 + 400 415 xfs_bmbt_disk_get_all(&rec->bmbt, &irec); 401 - return xchk_bmap_extent(ip, bs->cur, info, &irec); 416 + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, 417 + &iext_irec) || 418 + irec.br_startoff != iext_irec.br_startoff || 419 + irec.br_startblock != iext_irec.br_startblock || 420 + irec.br_blockcount != iext_irec.br_blockcount || 421 + irec.br_state != iext_irec.br_state) 422 + xchk_fblock_set_corrupt(bs->sc, info->whichfork, 423 + irec.br_startoff); 424 + return 0; 402 425 } 403 426 404 427 /* Scan the btree records. */ ··· 426 415 struct xchk_bmap_info *info) 427 416 { 428 417 struct xfs_owner_info oinfo; 418 + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); 429 419 struct xfs_mount *mp = sc->mp; 430 420 struct xfs_inode *ip = sc->ip; 431 421 struct xfs_btree_cur *cur; 432 422 int error; 433 423 424 + /* Load the incore bmap cache if it's not loaded. */ 425 + info->was_loaded = ifp->if_flags & XFS_IFEXTENTS; 426 + if (!info->was_loaded) { 427 + error = xfs_iread_extents(sc->tp, ip, whichfork); 428 + if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 429 + goto out; 430 + } 431 + 432 + /* Check the btree structure. 
*/ 434 433 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); 435 434 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); 436 435 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); 437 436 xfs_btree_del_cursor(cur, error); 437 + out: 438 438 return error; 439 439 } 440 440 ··· 522 500 523 501 out: 524 502 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 525 - return XFS_BTREE_QUERY_RANGE_ABORT; 503 + return -ECANCELED; 526 504 return 0; 527 505 } 528 506 ··· 551 529 sbcri.sc = sc; 552 530 sbcri.whichfork = whichfork; 553 531 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 554 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) 532 + if (error == -ECANCELED) 555 533 error = 0; 556 534 557 535 xfs_btree_del_cursor(cur, error); ··· 693 671 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 694 672 goto out; 695 673 696 - /* Now try to scrub the in-memory extent list. */ 697 - if (!(ifp->if_flags & XFS_IFEXTENTS)) { 698 - error = xfs_iread_extents(sc->tp, ip, whichfork); 699 - if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 700 - goto out; 701 - } 702 - 703 674 /* Find the offset of the last extent in the mapping. */ 704 675 error = xfs_bmap_last_offset(ip, &endoff, whichfork); 705 676 if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) ··· 704 689 for_each_xfs_iext(ifp, &icur, &irec) { 705 690 if (xchk_should_terminate(sc, &error) || 706 691 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 707 - break; 692 + goto out; 708 693 if (isnullstartblock(irec.br_startblock)) 709 694 continue; 710 695 if (irec.br_startoff >= endoff) { ··· 712 697 irec.br_startoff); 713 698 goto out; 714 699 } 715 - error = xchk_bmap_extent(ip, NULL, &info, &irec); 700 + error = xchk_bmap_iextent(ip, &info, &irec); 716 701 if (error) 717 702 goto out; 718 703 }

+1 -1

fs/xfs/scrub/fscounters.c

··· 125 125 struct xchk_fscounters *fsc; 126 126 int error; 127 127 128 - sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 128 + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); 129 129 if (!sc->buf) 130 130 return -ENOMEM; 131 131 fsc = sc->buf;

+3 -3

fs/xfs/scrub/repair.c

··· 351 351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 352 352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); 353 353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); 354 - xfs_trans_log_buf(tp, bp, 0, bp->b_length); 354 + xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); 355 355 bp->b_ops = ops; 356 356 *bpp = bp; 357 357 ··· 664 664 { 665 665 xfs_agblock_t *agbno = priv; 666 666 667 - return (*agbno == bno) ? XFS_ITER_ABORT : 0; 667 + return (*agbno == bno) ? -ECANCELED : 0; 668 668 } 669 669 670 670 /* Does this block match the btree information passed in? */ ··· 694 694 if (owner == XFS_RMAP_OWN_AG) { 695 695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, 696 696 xrep_findroot_agfl_walk, &agbno); 697 - if (error == XFS_ITER_ABORT) 697 + if (error == -ECANCELED) 698 698 return 0; 699 699 if (error) 700 700 return error;

+1 -1

fs/xfs/scrub/symlink.c

··· 22 22 struct xfs_inode *ip) 23 23 { 24 24 /* Allocate the buffer without the inode lock held. */ 25 - sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); 25 + sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0); 26 26 if (!sc->buf) 27 27 return -ENOMEM; 28 28

+5 -9

fs/xfs/xfs_acl.c

··· 112 112 { 113 113 struct xfs_inode *ip = XFS_I(inode); 114 114 struct posix_acl *acl = NULL; 115 - struct xfs_acl *xfs_acl; 115 + struct xfs_acl *xfs_acl = NULL; 116 116 unsigned char *ea_name; 117 117 int error; 118 118 int len; ··· 135 135 * go out to the disk. 136 136 */ 137 137 len = XFS_ACL_MAX_SIZE(ip->i_mount); 138 - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 139 - if (!xfs_acl) 140 - return ERR_PTR(-ENOMEM); 141 - 142 - error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, 143 - &len, ATTR_ROOT); 138 + error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, 139 + ATTR_ALLOC | ATTR_ROOT); 144 140 if (error) { 145 141 /* 146 142 * If the attribute doesn't exist make sure we have a negative ··· 147 151 } else { 148 152 acl = xfs_acl_from_disk(xfs_acl, len, 149 153 XFS_ACL_MAX_ENTRIES(ip->i_mount)); 154 + kmem_free(xfs_acl); 150 155 } 151 - kmem_free(xfs_acl); 152 156 return acl; 153 157 } 154 158 ··· 176 180 struct xfs_acl *xfs_acl; 177 181 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 178 182 179 - xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 183 + xfs_acl = kmem_zalloc_large(len, 0); 180 184 if (!xfs_acl) 181 185 return -ENOMEM; 182 186

+1 -1

fs/xfs/xfs_attr_inactive.c

··· 147 147 * Allocate storage for a list of all the "remote" value extents. 148 148 */ 149 149 size = count * sizeof(xfs_attr_inactive_list_t); 150 - list = kmem_alloc(size, KM_SLEEP); 150 + list = kmem_alloc(size, 0); 151 151 152 152 /* 153 153 * Identify each of the "remote" value extents.

+1 -1

fs/xfs/xfs_attr_list.c

··· 109 109 * It didn't all fit, so we have to sort everything on hashval. 110 110 */ 111 111 sbsize = sf->hdr.count * sizeof(*sbuf); 112 - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); 112 + sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); 113 113 114 114 /* 115 115 * Scan the attribute list for the rest of the entries, storing

+3 -5

fs/xfs/xfs_bmap_item.c

··· 141 141 { 142 142 struct xfs_bui_log_item *buip; 143 143 144 - buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); 144 + buip = kmem_zone_zalloc(xfs_bui_zone, 0); 145 145 146 146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); 147 147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; ··· 218 218 { 219 219 struct xfs_bud_log_item *budp; 220 220 221 - budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); 221 + budp = kmem_zone_zalloc(xfs_bud_zone, 0); 222 222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, 223 223 &xfs_bud_item_ops); 224 224 budp->bud_buip = buip; ··· 542 542 irec.br_blockcount = count; 543 543 irec.br_startoff = bmap->me_startoff; 544 544 irec.br_state = state; 545 - error = xfs_bmap_unmap_extent(tp, ip, &irec); 546 - if (error) 547 - goto err_inode; 545 + xfs_bmap_unmap_extent(tp, ip, &irec); 548 546 } 549 547 550 548 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);

+7 -15

fs/xfs/xfs_bmap_util.c

··· 39 39 xfs_daddr_t 40 40 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 41 41 { 42 - return (XFS_IS_REALTIME_INODE(ip) ? \ 43 - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ 44 - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); 42 + if (XFS_IS_REALTIME_INODE(ip)) 43 + return XFS_FSB_TO_BB(ip->i_mount, fsb); 44 + return XFS_FSB_TO_DADDR(ip->i_mount, fsb); 45 45 } 46 46 47 47 /* ··· 1532 1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); 1533 1533 1534 1534 /* Remove the mapping from the donor file. */ 1535 - error = xfs_bmap_unmap_extent(tp, tip, &uirec); 1536 - if (error) 1537 - goto out; 1535 + xfs_bmap_unmap_extent(tp, tip, &uirec); 1538 1536 1539 1537 /* Remove the mapping from the source file. */ 1540 - error = xfs_bmap_unmap_extent(tp, ip, &irec); 1541 - if (error) 1542 - goto out; 1538 + xfs_bmap_unmap_extent(tp, ip, &irec); 1543 1539 1544 1540 /* Map the donor file's blocks into the source file. */ 1545 - error = xfs_bmap_map_extent(tp, ip, &uirec); 1546 - if (error) 1547 - goto out; 1541 + xfs_bmap_map_extent(tp, ip, &uirec); 1548 1542 1549 1543 /* Map the source file's blocks into the donor file. */ 1550 - error = xfs_bmap_map_extent(tp, tip, &irec); 1551 - if (error) 1552 - goto out; 1544 + xfs_bmap_map_extent(tp, tip, &irec); 1553 1545 1554 1546 error = xfs_defer_finish(tpp); 1555 1547 tp = *tpp;

+4 -3

fs/xfs/xfs_buf.c

··· 353 353 */ 354 354 size = BBTOB(bp->b_length); 355 355 if (size < PAGE_SIZE) { 356 - bp->b_addr = kmem_alloc(size, KM_NOFS); 356 + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); 357 + bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); 357 358 if (!bp->b_addr) { 358 359 /* low memory - use alloc_page loop instead */ 359 360 goto use_alloc_page; ··· 369 368 } 370 369 bp->b_offset = offset_in_page(bp->b_addr); 371 370 bp->b_pages = bp->b_page_array; 372 - bp->b_pages[0] = virt_to_page(bp->b_addr); 371 + bp->b_pages[0] = kmem_to_page(bp->b_addr); 373 372 bp->b_page_count = 1; 374 373 bp->b_flags |= _XBF_KMEM; 375 374 return 0; ··· 1742 1741 { 1743 1742 xfs_buftarg_t *btp; 1744 1743 1745 - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1744 + btp = kmem_zalloc(sizeof(*btp), KM_NOFS); 1746 1745 1747 1746 btp->bt_mount = mp; 1748 1747 btp->bt_dev = bdev->bd_dev;

+6

fs/xfs/xfs_buf.h

··· 350 350 #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 351 351 #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 352 352 353 + static inline int 354 + xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) 355 + { 356 + return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); 357 + } 358 + 353 359 int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 354 360 bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); 355 361 bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);

+2 -2

fs/xfs/xfs_buf_item.c

··· 702 702 } 703 703 704 704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 705 - KM_SLEEP); 705 + 0); 706 706 if (!bip->bli_formats) 707 707 return -ENOMEM; 708 708 return 0; ··· 747 747 return 0; 748 748 } 749 749 750 - bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); 750 + bip = kmem_zone_zalloc(xfs_buf_item_zone, 0); 751 751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); 752 752 bip->bli_buf = bp; 753 753

+2 -2

fs/xfs/xfs_dquot.c

··· 440 440 { 441 441 struct xfs_dquot *dqp; 442 442 443 - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 443 + dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); 444 444 445 445 dqp->dq_flags = type; 446 446 dqp->q_core.d_id = cpu_to_be32(id); ··· 1239 1239 /* 1240 1240 * Iterate every dquot of a particular type. The caller must ensure that the 1241 1241 * particular quota type is active. iter_fn can return negative error codes, 1242 - * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 1242 + * or -ECANCELED to indicate that it wants to stop iterating. 1243 1243 */ 1244 1244 int 1245 1245 xfs_qm_dqiterate(

+1 -1

fs/xfs/xfs_dquot_item.c

··· 347 347 { 348 348 struct xfs_qoff_logitem *qf; 349 349 350 - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); 350 + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); 351 351 352 352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 353 353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);

+1 -1

fs/xfs/xfs_error.c

··· 213 213 struct xfs_mount *mp) 214 214 { 215 215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, 216 - KM_SLEEP | KM_MAYFAIL); 216 + KM_MAYFAIL); 217 217 if (!mp->m_errortag) 218 218 return -ENOMEM; 219 219

+1 -1

fs/xfs/xfs_extent_busy.c

··· 33 33 struct rb_node **rbp; 34 34 struct rb_node *parent = NULL; 35 35 36 - new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); 36 + new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); 37 37 new->agno = agno; 38 38 new->bno = bno; 39 39 new->length = len;

+4 -4

fs/xfs/xfs_extfree_item.c

··· 163 163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { 164 164 size = (uint)(sizeof(xfs_efi_log_item_t) + 165 165 ((nextents - 1) * sizeof(xfs_extent_t))); 166 - efip = kmem_zalloc(size, KM_SLEEP); 166 + efip = kmem_zalloc(size, 0); 167 167 } else { 168 - efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); 168 + efip = kmem_zone_zalloc(xfs_efi_zone, 0); 169 169 } 170 170 171 171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); ··· 333 333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { 334 334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + 335 335 (nextents - 1) * sizeof(struct xfs_extent), 336 - KM_SLEEP); 336 + 0); 337 337 } else { 338 - efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); 338 + efdp = kmem_zone_zalloc(xfs_efd_zone, 0); 339 339 } 340 340 341 341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,

+26

fs/xfs/xfs_file.c

··· 28 28 #include <linux/falloc.h> 29 29 #include <linux/backing-dev.h> 30 30 #include <linux/mman.h> 31 + #include <linux/fadvise.h> 31 32 32 33 static const struct vm_operations_struct xfs_file_vm_ops; 33 34 ··· 934 933 return error; 935 934 } 936 935 936 + STATIC int 937 + xfs_file_fadvise( 938 + struct file *file, 939 + loff_t start, 940 + loff_t end, 941 + int advice) 942 + { 943 + struct xfs_inode *ip = XFS_I(file_inode(file)); 944 + int ret; 945 + int lockflags = 0; 946 + 947 + /* 948 + * Operations creating pages in page cache need protection from hole 949 + * punching and similar ops 950 + */ 951 + if (advice == POSIX_FADV_WILLNEED) { 952 + lockflags = XFS_IOLOCK_SHARED; 953 + xfs_ilock(ip, lockflags); 954 + } 955 + ret = generic_fadvise(file, start, end, advice); 956 + if (lockflags) 957 + xfs_iunlock(ip, lockflags); 958 + return ret; 959 + } 937 960 938 961 STATIC loff_t 939 962 xfs_file_remap_range( ··· 1257 1232 .fsync = xfs_file_fsync, 1258 1233 .get_unmapped_area = thp_get_unmapped_area, 1259 1234 .fallocate = xfs_file_fallocate, 1235 + .fadvise = xfs_file_fadvise, 1260 1236 .remap_file_range = xfs_file_remap_range, 1261 1237 }; 1262 1238

+6 -6

fs/xfs/xfs_fsmap.c

··· 250 250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 251 251 if (info->next_daddr < rec_daddr) 252 252 info->next_daddr = rec_daddr; 253 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 253 + return 0; 254 254 } 255 255 256 256 /* Are we just counting mappings? */ ··· 259 259 info->head->fmh_entries++; 260 260 261 261 if (info->last) 262 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 262 + return 0; 263 263 264 264 info->head->fmh_entries++; 265 265 266 266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 267 267 if (info->next_daddr < rec_daddr) 268 268 info->next_daddr = rec_daddr; 269 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 269 + return 0; 270 270 } 271 271 272 272 /* ··· 276 276 */ 277 277 if (rec_daddr > info->next_daddr) { 278 278 if (info->head->fmh_entries >= info->head->fmh_count) 279 - return XFS_BTREE_QUERY_RANGE_ABORT; 279 + return -ECANCELED; 280 280 281 281 fmr.fmr_device = info->dev; 282 282 fmr.fmr_physical = info->next_daddr; ··· 295 295 296 296 /* Fill out the extent we found */ 297 297 if (info->head->fmh_entries >= info->head->fmh_count) 298 - return XFS_BTREE_QUERY_RANGE_ABORT; 298 + return -ECANCELED; 299 299 300 300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); 301 301 ··· 328 328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 329 329 if (info->next_daddr < rec_daddr) 330 330 info->next_daddr = rec_daddr; 331 - return XFS_BTREE_QUERY_RANGE_CONTINUE; 331 + return 0; 332 332 } 333 333 334 334 /* Transform a rmapbt irec into a fsmap */

+1 -1

fs/xfs/xfs_icache.c

··· 40 40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 41 41 * code up to do this anyway. 42 42 */ 43 - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 43 + ip = kmem_zone_alloc(xfs_inode_zone, 0); 44 44 if (!ip) 45 45 return NULL; 46 46 if (inode_init_always(mp->m_super, VFS_I(ip))) {

+1 -1

fs/xfs/xfs_icreate_item.c

··· 89 89 { 90 90 struct xfs_icreate_item *icp; 91 91 92 - icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); 92 + icp = kmem_zone_zalloc(xfs_icreate_zone, 0); 93 93 94 94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, 95 95 &xfs_icreate_item_ops);

+43 -42

fs/xfs/xfs_inode.c

··· 2018 2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 2019 2019 return 0; 2020 2020 2021 - iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); 2021 + iu = kmem_zalloc(sizeof(*iu), KM_NOFS); 2022 2022 iu->iu_agino = prev_agino; 2023 2023 iu->iu_next_unlinked = this_agino; 2024 2024 ··· 3282 3282 spaceres); 3283 3283 3284 3284 /* 3285 - * Set up the target. 3285 + * Check for expected errors before we dirty the transaction 3286 + * so we can return an error without a transaction abort. 3286 3287 */ 3287 3288 if (target_ip == NULL) { 3288 3289 /* ··· 3295 3294 if (error) 3296 3295 goto out_trans_cancel; 3297 3296 } 3297 + } else { 3298 + /* 3299 + * If target exists and it's a directory, check that whether 3300 + * it can be destroyed. 3301 + */ 3302 + if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3303 + (!xfs_dir_isempty(target_ip) || 3304 + (VFS_I(target_ip)->i_nlink > 2))) { 3305 + error = -EEXIST; 3306 + goto out_trans_cancel; 3307 + } 3308 + } 3309 + 3310 + /* 3311 + * Directory entry creation below may acquire the AGF. Remove 3312 + * the whiteout from the unlinked list first to preserve correct 3313 + * AGI/AGF locking order. This dirties the transaction so failures 3314 + * after this point will abort and log recovery will clean up the 3315 + * mess. 3316 + * 3317 + * For whiteouts, we need to bump the link count on the whiteout 3318 + * inode. After this point, we have a real link, clear the tmpfile 3319 + * state flag from the inode so it doesn't accidentally get misused 3320 + * in future. 3321 + */ 3322 + if (wip) { 3323 + ASSERT(VFS_I(wip)->i_nlink == 0); 3324 + error = xfs_iunlink_remove(tp, wip); 3325 + if (error) 3326 + goto out_trans_cancel; 3327 + 3328 + xfs_bumplink(tp, wip); 3329 + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); 3330 + VFS_I(wip)->i_state &= ~I_LINKABLE; 3331 + } 3332 + 3333 + /* 3334 + * Set up the target. 
3335 + */ 3336 + if (target_ip == NULL) { 3298 3337 /* 3299 3338 * If target does not exist and the rename crosses 3300 3339 * directories, adjust the target directory link count ··· 3352 3311 xfs_bumplink(tp, target_dp); 3353 3312 } 3354 3313 } else { /* target_ip != NULL */ 3355 - /* 3356 - * If target exists and it's a directory, check that both 3357 - * target and source are directories and that target can be 3358 - * destroyed, or that neither is a directory. 3359 - */ 3360 - if (S_ISDIR(VFS_I(target_ip)->i_mode)) { 3361 - /* 3362 - * Make sure target dir is empty. 3363 - */ 3364 - if (!(xfs_dir_isempty(target_ip)) || 3365 - (VFS_I(target_ip)->i_nlink > 2)) { 3366 - error = -EEXIST; 3367 - goto out_trans_cancel; 3368 - } 3369 - } 3370 - 3371 3314 /* 3372 3315 * Link the source inode under the target name. 3373 3316 * If the source inode is a directory and we are moving ··· 3441 3416 spaceres); 3442 3417 if (error) 3443 3418 goto out_trans_cancel; 3444 - 3445 - /* 3446 - * For whiteouts, we need to bump the link count on the whiteout inode. 3447 - * This means that failures all the way up to this point leave the inode 3448 - * on the unlinked list and so cleanup is a simple matter of dropping 3449 - * the remaining reference to it. If we fail here after bumping the link 3450 - * count, we're shutting down the filesystem so we'll never see the 3451 - * intermediate state on disk. 3452 - */ 3453 - if (wip) { 3454 - ASSERT(VFS_I(wip)->i_nlink == 0); 3455 - xfs_bumplink(tp, wip); 3456 - error = xfs_iunlink_remove(tp, wip); 3457 - if (error) 3458 - goto out_trans_cancel; 3459 - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); 3460 - 3461 - /* 3462 - * Now we have a real link, clear the "I'm a tmpfile" state 3463 - * flag from the inode so it doesn't accidentally get misused in 3464 - * future. 
3465 - */ 3466 - VFS_I(wip)->i_state &= ~I_LINKABLE; 3467 - } 3468 3419 3469 3420 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3470 3421 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);

+1 -1

fs/xfs/xfs_inode_item.c

··· 651 651 struct xfs_inode_log_item *iip; 652 652 653 653 ASSERT(ip->i_itemp == NULL); 654 - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 654 + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); 655 655 656 656 iip->ili_inode = ip; 657 657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,

+14 -11

fs/xfs/xfs_ioctl.c

··· 396 396 if (IS_ERR(dentry)) 397 397 return PTR_ERR(dentry); 398 398 399 - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 399 + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); 400 400 if (!kbuf) 401 401 goto out_dput; 402 402 ··· 434 434 435 435 if (*len > XFS_XATTR_SIZE_MAX) 436 436 return -EINVAL; 437 - kbuf = kmem_zalloc_large(*len, KM_SLEEP); 437 + kbuf = kmem_zalloc_large(*len, 0); 438 438 if (!kbuf) 439 439 return -ENOMEM; 440 440 441 - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 441 + error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); 442 442 if (error) 443 443 goto out_kfree; 444 444 ··· 831 831 /* 832 832 * Check the incoming bulk request @hdr from userspace and initialize the 833 833 * internal @breq bulk request appropriately. Returns 0 if the bulk request 834 - * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual 834 + * should proceed; -ECANCELED if there's nothing to do; or the usual 835 835 * negative error code. 836 836 */ 837 837 static int ··· 889 889 890 890 /* Asking for an inode past the end of the AG? We're done! */ 891 891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) 892 - return XFS_ITER_ABORT; 892 + return -ECANCELED; 893 893 } else if (hdr->agno) 894 894 return -EINVAL; 895 895 896 896 /* Asking for an inode past the end of the FS? We're done! 
*/ 897 897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) 898 - return XFS_ITER_ABORT; 898 + return -ECANCELED; 899 899 900 900 return 0; 901 901 } ··· 936 936 return -EFAULT; 937 937 938 938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); 939 - if (error == XFS_ITER_ABORT) 939 + if (error == -ECANCELED) 940 940 goto out_teardown; 941 941 if (error < 0) 942 942 return error; ··· 986 986 return -EFAULT; 987 987 988 988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); 989 - if (error == XFS_ITER_ABORT) 989 + if (error == -ECANCELED) 990 990 goto out_teardown; 991 991 if (error < 0) 992 992 return error; ··· 1038 1038 1039 1039 if (copy_from_user(&ageo, arg, sizeof(ageo))) 1040 1040 return -EFAULT; 1041 + if (ageo.ag_flags) 1042 + return -EINVAL; 1043 + if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) 1044 + return -EINVAL; 1041 1045 1042 1046 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); 1043 1047 if (error) ··· 1313 1309 if (fa->fsx_xflags & FS_XFLAG_DAX) { 1314 1310 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 1315 1311 return -EINVAL; 1316 - if (S_ISREG(inode->i_mode) && 1317 - !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), 1312 + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), 1318 1313 sb->s_blocksize)) 1319 1314 return -EINVAL; 1320 1315 } ··· 1884 1881 info.mp = ip->i_mount; 1885 1882 info.data = arg; 1886 1883 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); 1887 - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 1884 + if (error == -ECANCELED) { 1888 1885 error = 0; 1889 1886 aborted = true; 1890 1887 } else if (error)

+1 -1

fs/xfs/xfs_ioctl32.c

··· 381 381 return PTR_ERR(dentry); 382 382 383 383 error = -ENOMEM; 384 - kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 384 + kbuf = kmem_zalloc_large(al_hreq.buflen, 0); 385 385 if (!kbuf) 386 386 goto out_dput; 387 387

+3 -3

fs/xfs/xfs_iomap.c

··· 58 58 { 59 59 struct xfs_mount *mp = ip->i_mount; 60 60 61 - if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) 61 + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) 62 62 return xfs_alert_fsblock_zero(ip, imap); 63 63 64 64 if (imap->br_startblock == HOLESTARTBLOCK) { ··· 297 297 goto out_unlock; 298 298 } 299 299 300 - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 300 + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) 301 301 error = xfs_alert_fsblock_zero(ip, imap); 302 302 303 303 out_unlock: ··· 814 814 if (error) 815 815 return error; 816 816 817 - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 817 + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) 818 818 return xfs_alert_fsblock_zero(ip, &imap); 819 819 820 820 if ((numblks_fsb = imap.br_blockcount) == 0) {

+5 -5

fs/xfs/xfs_itable.c

··· 137 137 xfs_irele(ip); 138 138 139 139 error = bc->formatter(bc->breq, buf); 140 - if (error == XFS_IBULK_ABORT) 140 + if (error == -ECANCELED) 141 141 goto out_advance; 142 142 if (error) 143 143 goto out; ··· 169 169 ASSERT(breq->icount == 1); 170 170 171 171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 172 - KM_SLEEP | KM_MAYFAIL); 172 + KM_MAYFAIL); 173 173 if (!bc.buf) 174 174 return -ENOMEM; 175 175 ··· 181 181 * If we reported one inode to userspace then we abort because we hit 182 182 * the end of the buffer. Don't leak that back to userspace. 183 183 */ 184 - if (error == XFS_IWALK_ABORT) 184 + if (error == -ECANCELED) 185 185 error = 0; 186 186 187 187 return error; ··· 243 243 return 0; 244 244 245 245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 246 - KM_SLEEP | KM_MAYFAIL); 246 + KM_MAYFAIL); 247 247 if (!bc.buf) 248 248 return -ENOMEM; 249 249 ··· 342 342 int error; 343 343 344 344 error = ic->formatter(ic->breq, &inogrp); 345 - if (error && error != XFS_IBULK_ABORT) 345 + if (error && error != -ECANCELED) 346 346 return error; 347 347 348 348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +

+9 -4

fs/xfs/xfs_itable.h

··· 18 18 /* Only iterate within the same AG as startino */ 19 19 #define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) 20 20 21 - /* Return value that means we want to abort the walk. */ 22 - #define XFS_IBULK_ABORT (XFS_IWALK_ABORT) 23 - 24 21 /* 25 22 * Advance the user buffer pointer by one record of the given size. If the 26 23 * buffer is now full, return the appropriate error code. ··· 31 34 32 35 breq->ubuffer = b + bytes; 33 36 breq->ocount++; 34 - return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; 37 + return breq->ocount == breq->icount ? -ECANCELED : 0; 35 38 } 36 39 37 40 /* 38 41 * Return stat information in bulk (by-inode) for the filesystem. 42 + */ 43 + 44 + /* 45 + * Return codes for the formatter function are 0 to continue iterating, and 46 + * non-zero to stop iterating. Any non-zero value will be passed up to the 47 + * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop 48 + * iteration, as neither bulkstat nor inumbers will ever generate that error 49 + * code on their own. 39 50 */ 40 51 41 52 typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,

+2 -2

fs/xfs/xfs_iwalk.c

··· 31 31 * inode it finds, it calls a walk function with the relevant inode number and 32 32 * a pointer to caller-provided data. The walk function can return the usual 33 33 * negative error code to stop the iteration; 0 to continue the iteration; or 34 - * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the 34 + * -ECANCELED to stop the iteration. This return value is returned to the 35 35 * caller. 36 36 * 37 37 * Internally, we allow the walk function to do anything, which means that we ··· 616 616 if (xfs_pwork_ctl_want_abort(&pctl)) 617 617 break; 618 618 619 - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 619 + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); 620 620 iwag->mp = mp; 621 621 iwag->iwalk_fn = iwalk_fn; 622 622 iwag->data = data;

+8 -5

fs/xfs/xfs_iwalk.h

··· 6 6 #ifndef __XFS_IWALK_H__ 7 7 #define __XFS_IWALK_H__ 8 8 9 + /* 10 + * Return codes for the inode/inobt walk function are 0 to continue iterating, 11 + * and non-zero to stop iterating. Any non-zero value will be passed up to the 12 + * iwalk or inobt_walk caller. The special value -ECANCELED can be used to 13 + * stop iteration, as neither iwalk nor inobt_walk will ever generate that 14 + * error code on their own. 15 + */ 16 + 9 17 /* Walk all inodes in the filesystem starting from @startino. */ 10 18 typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, 11 19 xfs_ino_t ino, void *data); 12 - /* Return values for xfs_iwalk_fn. */ 13 - #define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) 14 - #define XFS_IWALK_ABORT (XFS_ITER_ABORT) 15 20 16 21 int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 17 22 unsigned int flags, xfs_iwalk_fn iwalk_fn, ··· 35 30 xfs_agnumber_t agno, 36 31 const struct xfs_inobt_rec_incore *irec, 37 32 void *data); 38 - /* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */ 39 - #define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT) 40 33 41 34 int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, 42 35 xfs_ino_t startino, unsigned int flags,

+276 -190

fs/xfs/xfs_log.c

··· 214 214 { 215 215 struct xlog_ticket *tic; 216 216 int need_bytes; 217 + bool woken_task = false; 217 218 218 219 list_for_each_entry(tic, &head->waiters, t_queue) { 220 + 221 + /* 222 + * There is a chance that the size of the CIL checkpoints in 223 + * progress at the last AIL push target calculation resulted in 224 + * limiting the target to the log head (l_last_sync_lsn) at the 225 + * time. This may not reflect where the log head is now as the 226 + * CIL checkpoints may have completed. 227 + * 228 + * Hence when we are woken here, it may be that the head of the 229 + * log that has moved rather than the tail. As the tail didn't 230 + * move, there still won't be space available for the 231 + * reservation we require. However, if the AIL has already 232 + * pushed to the target defined by the old log head location, we 233 + * will hang here waiting for something else to update the AIL 234 + * push target. 235 + * 236 + * Therefore, if there isn't space to wake the first waiter on 237 + * the grant head, we need to push the AIL again to ensure the 238 + * target reflects both the current log tail and log head 239 + * position before we wait for the tail to move again. 240 + */ 241 + 219 242 need_bytes = xlog_ticket_reservation(log, head, tic); 220 - if (*free_bytes < need_bytes) 243 + if (*free_bytes < need_bytes) { 244 + if (!woken_task) 245 + xlog_grant_push_ail(log, need_bytes); 221 246 return false; 247 + } 222 248 223 249 *free_bytes -= need_bytes; 224 250 trace_xfs_log_grant_wake_up(log, tic); 225 251 wake_up_process(tic->t_task); 252 + woken_task = true; 226 253 } 227 254 228 255 return true; ··· 455 428 XFS_STATS_INC(mp, xs_try_logspace); 456 429 457 430 ASSERT(*ticp == NULL); 458 - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 459 - KM_SLEEP); 431 + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); 460 432 *ticp = tic; 461 433 462 434 xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt ··· 1430 1404 */ 1431 1405 ASSERT(log->l_iclog_size >= 4096); 1432 1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1407 + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); 1433 1408 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1434 1409 sizeof(struct bio_vec); 1435 1410 ··· 1442 1415 iclog->ic_prev = prev_iclog; 1443 1416 prev_iclog = iclog; 1444 1417 1445 - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1446 - KM_MAYFAIL); 1418 + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, 1419 + KM_MAYFAIL); 1447 1420 if (!iclog->ic_data) 1448 1421 goto out_free_iclog; 1449 1422 #ifdef DEBUG ··· 2523 2496 ***************************************************************************** 2524 2497 */ 2525 2498 2526 - /* Clean iclogs starting from the head. This ordering must be 2527 - * maintained, so an iclog doesn't become ACTIVE beyond one that 2528 - * is SYNCING. This is also required to maintain the notion that we use 2529 - * a ordered wait queue to hold off would be writers to the log when every 2530 - * iclog is trying to sync to disk. 2499 + /* 2500 + * An iclog has just finished IO completion processing, so we need to update 2501 + * the iclog state and propagate that up into the overall log state. Hence we 2502 + * prepare the iclog for cleaning, and then clean all the pending dirty iclogs 2503 + * starting from the head, and then wake up any threads that are waiting for the 2504 + * iclog to be marked clean. 2531 2505 * 2532 - * State Change: DIRTY -> ACTIVE 2506 + * The ordering of marking iclogs ACTIVE must be maintained, so an iclog 2507 + * doesn't become ACTIVE beyond one that is SYNCING. This is also required to 2508 + * maintain the notion that we use a ordered wait queue to hold off would be 2509 + * writers to the log when every iclog is trying to sync to disk. 2510 + * 2511 + * Caller must hold the icloglock before calling us. 
2512 + * 2513 + * State Change: !IOERROR -> DIRTY -> ACTIVE 2533 2514 */ 2534 2515 STATIC void 2535 - xlog_state_clean_log( 2536 - struct xlog *log) 2516 + xlog_state_clean_iclog( 2517 + struct xlog *log, 2518 + struct xlog_in_core *dirty_iclog) 2537 2519 { 2538 - xlog_in_core_t *iclog; 2539 - int changed = 0; 2520 + struct xlog_in_core *iclog; 2521 + int changed = 0; 2540 2522 2523 + /* Prepare the completed iclog. */ 2524 + if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR)) 2525 + dirty_iclog->ic_state = XLOG_STATE_DIRTY; 2526 + 2527 + /* Walk all the iclogs to update the ordered active state. */ 2541 2528 iclog = log->l_iclog; 2542 2529 do { 2543 2530 if (iclog->ic_state == XLOG_STATE_DIRTY) { ··· 2589 2548 iclog = iclog->ic_next; 2590 2549 } while (iclog != log->l_iclog); 2591 2550 2592 - /* log is locked when we are called */ 2551 + 2552 + /* 2553 + * Wake up threads waiting in xfs_log_force() for the dirty iclog 2554 + * to be cleaned. 2555 + */ 2556 + wake_up_all(&dirty_iclog->ic_force_wait); 2557 + 2593 2558 /* 2594 2559 * Change state for the dummy log recording. 2595 2560 * We usually go to NEED. But we go to NEED2 if the changed indicates ··· 2629 2582 ASSERT(0); 2630 2583 } 2631 2584 } 2632 - } /* xlog_state_clean_log */ 2585 + } 2633 2586 2634 2587 STATIC xfs_lsn_t 2635 2588 xlog_get_lowest_lsn( ··· 2650 2603 return lowest_lsn; 2651 2604 } 2652 2605 2606 + /* 2607 + * Completion of a iclog IO does not imply that a transaction has completed, as 2608 + * transactions can be large enough to span many iclogs. We cannot change the 2609 + * tail of the log half way through a transaction as this may be the only 2610 + * transaction in the log and moving the tail to point to the middle of it 2611 + * will prevent recovery from finding the start of the transaction. Hence we 2612 + * should only update the last_sync_lsn if this iclog contains transaction 2613 + * completion callbacks on it. 
2614 + * 2615 + * We have to do this before we drop the icloglock to ensure we are the only one 2616 + * that can update it. 2617 + * 2618 + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick 2619 + * the reservation grant head pushing. This is due to the fact that the push 2620 + * target is bound by the current last_sync_lsn value. Hence if we have a large 2621 + * amount of log space bound up in this committing transaction then the 2622 + * last_sync_lsn value may be the limiting factor preventing tail pushing from 2623 + * freeing space in the log. Hence once we've updated the last_sync_lsn we 2624 + * should push the AIL to ensure the push target (and hence the grant head) is 2625 + * no longer bound by the old log head location and can move forwards and make 2626 + * progress again. 2627 + */ 2628 + static void 2629 + xlog_state_set_callback( 2630 + struct xlog *log, 2631 + struct xlog_in_core *iclog, 2632 + xfs_lsn_t header_lsn) 2633 + { 2634 + iclog->ic_state = XLOG_STATE_CALLBACK; 2635 + 2636 + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2637 + header_lsn) <= 0); 2638 + 2639 + if (list_empty_careful(&iclog->ic_callbacks)) 2640 + return; 2641 + 2642 + atomic64_set(&log->l_last_sync_lsn, header_lsn); 2643 + xlog_grant_push_ail(log, 0); 2644 + } 2645 + 2646 + /* 2647 + * Return true if we need to stop processing, false to continue to the next 2648 + * iclog. The caller will need to run callbacks if the iclog is returned in the 2649 + * XLOG_STATE_CALLBACK state. 
2650 + */ 2651 + static bool 2652 + xlog_state_iodone_process_iclog( 2653 + struct xlog *log, 2654 + struct xlog_in_core *iclog, 2655 + struct xlog_in_core *completed_iclog, 2656 + bool *ioerror) 2657 + { 2658 + xfs_lsn_t lowest_lsn; 2659 + xfs_lsn_t header_lsn; 2660 + 2661 + /* Skip all iclogs in the ACTIVE & DIRTY states */ 2662 + if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) 2663 + return false; 2664 + 2665 + /* 2666 + * Between marking a filesystem SHUTDOWN and stopping the log, we do 2667 + * flush all iclogs to disk (if there wasn't a log I/O error). So, we do 2668 + * want things to go smoothly in case of just a SHUTDOWN w/o a 2669 + * LOG_IO_ERROR. 2670 + */ 2671 + if (iclog->ic_state & XLOG_STATE_IOERROR) { 2672 + *ioerror = true; 2673 + return false; 2674 + } 2675 + 2676 + /* 2677 + * Can only perform callbacks in order. Since this iclog is not in the 2678 + * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean 2679 + * up. If we set our iclog to DO_CALLBACK, we will not process it when 2680 + * we retry since a previous iclog is in the CALLBACK and the state 2681 + * cannot change since we are holding the l_icloglock. 2682 + */ 2683 + if (!(iclog->ic_state & 2684 + (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { 2685 + if (completed_iclog && 2686 + (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) { 2687 + completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK; 2688 + } 2689 + return true; 2690 + } 2691 + 2692 + /* 2693 + * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC 2694 + * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught 2695 + * by the above if and are going to clean (i.e. we aren't doing their 2696 + * callbacks) see the above if. 2697 + * 2698 + * We will do one more check here to see if we have chased our tail 2699 + * around. If this is not the lowest lsn iclog, then we will leave it 2700 + * for another completion to process. 
2701 + */ 2702 + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2703 + lowest_lsn = xlog_get_lowest_lsn(log); 2704 + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) 2705 + return false; 2706 + 2707 + xlog_state_set_callback(log, iclog, header_lsn); 2708 + return false; 2709 + 2710 + } 2711 + 2712 + /* 2713 + * Keep processing entries in the iclog callback list until we come around and 2714 + * it is empty. We need to atomically see that the list is empty and change the 2715 + * state to DIRTY so that we don't miss any more callbacks being added. 2716 + * 2717 + * This function is called with the icloglock held and returns with it held. We 2718 + * drop it while running callbacks, however, as holding it over thousands of 2719 + * callbacks is unnecessary and causes excessive contention if we do. 2720 + */ 2721 + static void 2722 + xlog_state_do_iclog_callbacks( 2723 + struct xlog *log, 2724 + struct xlog_in_core *iclog, 2725 + bool aborted) 2726 + { 2727 + spin_unlock(&log->l_icloglock); 2728 + spin_lock(&iclog->ic_callback_lock); 2729 + while (!list_empty(&iclog->ic_callbacks)) { 2730 + LIST_HEAD(tmp); 2731 + 2732 + list_splice_init(&iclog->ic_callbacks, &tmp); 2733 + 2734 + spin_unlock(&iclog->ic_callback_lock); 2735 + xlog_cil_process_committed(&tmp, aborted); 2736 + spin_lock(&iclog->ic_callback_lock); 2737 + } 2738 + 2739 + /* 2740 + * Pick up the icloglock while still holding the callback lock so we 2741 + * serialise against anyone trying to add more callbacks to this iclog 2742 + * now we've finished processing. 2743 + */ 2744 + spin_lock(&log->l_icloglock); 2745 + spin_unlock(&iclog->ic_callback_lock); 2746 + } 2747 + 2748 + #ifdef DEBUG 2749 + /* 2750 + * Make one last gasp attempt to see if iclogs are being left in limbo. 
If the 2751 + * above loop finds an iclog earlier than the current iclog and in one of the 2752 + * syncing states, the current iclog is put into DO_CALLBACK and the callbacks 2753 + * are deferred to the completion of the earlier iclog. Walk the iclogs in order 2754 + * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in 2755 + * one of the syncing states. 2756 + * 2757 + * Note that SYNCING|IOERROR is a valid state so we cannot just check for 2758 + * ic_state == SYNCING. 2759 + */ 2760 + static void 2761 + xlog_state_callback_check_state( 2762 + struct xlog *log) 2763 + { 2764 + struct xlog_in_core *first_iclog = log->l_iclog; 2765 + struct xlog_in_core *iclog = first_iclog; 2766 + 2767 + do { 2768 + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2769 + /* 2770 + * Terminate the loop if iclogs are found in states 2771 + * which will cause other threads to clean up iclogs. 2772 + * 2773 + * SYNCING - i/o completion will go through logs 2774 + * DONE_SYNC - interrupt thread should be waiting for 2775 + * l_icloglock 2776 + * IOERROR - give up hope all ye who enter here 2777 + */ 2778 + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2779 + iclog->ic_state & XLOG_STATE_SYNCING || 2780 + iclog->ic_state == XLOG_STATE_DONE_SYNC || 2781 + iclog->ic_state == XLOG_STATE_IOERROR ) 2782 + break; 2783 + iclog = iclog->ic_next; 2784 + } while (first_iclog != iclog); 2785 + } 2786 + #else 2787 + #define xlog_state_callback_check_state(l) ((void)0) 2788 + #endif 2789 + 2653 2790 STATIC void 2654 2791 xlog_state_do_callback( 2655 2792 struct xlog *log, 2656 2793 bool aborted, 2657 2794 struct xlog_in_core *ciclog) 2658 2795 { 2659 - xlog_in_core_t *iclog; 2660 - xlog_in_core_t *first_iclog; /* used to know when we've 2661 - * processed all iclogs once */ 2662 - int flushcnt = 0; 2663 - xfs_lsn_t lowest_lsn; 2664 - int ioerrors; /* counter: iclogs with errors */ 2665 - int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2666 - int 
funcdidcallbacks; /* flag: function did callbacks */ 2667 - int repeats; /* for issuing console warnings if 2668 - * looping too many times */ 2669 - int wake = 0; 2796 + struct xlog_in_core *iclog; 2797 + struct xlog_in_core *first_iclog; 2798 + bool did_callbacks = false; 2799 + bool cycled_icloglock; 2800 + bool ioerror; 2801 + int flushcnt = 0; 2802 + int repeats = 0; 2670 2803 2671 2804 spin_lock(&log->l_icloglock); 2672 - first_iclog = iclog = log->l_iclog; 2673 - ioerrors = 0; 2674 - funcdidcallbacks = 0; 2675 - repeats = 0; 2676 - 2677 2805 do { 2678 2806 /* 2679 2807 * Scan all iclogs starting with the one pointed to by the ··· 2860 2638 */ 2861 2639 first_iclog = log->l_iclog; 2862 2640 iclog = log->l_iclog; 2863 - loopdidcallbacks = 0; 2641 + cycled_icloglock = false; 2642 + ioerror = false; 2864 2643 repeats++; 2865 2644 2866 2645 do { 2646 + if (xlog_state_iodone_process_iclog(log, iclog, 2647 + ciclog, &ioerror)) 2648 + break; 2867 2649 2868 - /* skip all iclogs in the ACTIVE & DIRTY states */ 2869 - if (iclog->ic_state & 2870 - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { 2650 + if (!(iclog->ic_state & 2651 + (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) { 2871 2652 iclog = iclog->ic_next; 2872 2653 continue; 2873 2654 } 2874 2655 2875 2656 /* 2876 - * Between marking a filesystem SHUTDOWN and stopping 2877 - * the log, we do flush all iclogs to disk (if there 2878 - * wasn't a log I/O error). So, we do want things to 2879 - * go smoothly in case of just a SHUTDOWN w/o a 2880 - * LOG_IO_ERROR. 2657 + * Running callbacks will drop the icloglock which means 2658 + * we'll have to run at least one more complete loop. 2881 2659 */ 2882 - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { 2883 - /* 2884 - * Can only perform callbacks in order. Since 2885 - * this iclog is not in the DONE_SYNC/ 2886 - * DO_CALLBACK state, we skip the rest and 2887 - * just try to clean up. 
If we set our iclog 2888 - * to DO_CALLBACK, we will not process it when 2889 - * we retry since a previous iclog is in the 2890 - * CALLBACK and the state cannot change since 2891 - * we are holding the l_icloglock. 2892 - */ 2893 - if (!(iclog->ic_state & 2894 - (XLOG_STATE_DONE_SYNC | 2895 - XLOG_STATE_DO_CALLBACK))) { 2896 - if (ciclog && (ciclog->ic_state == 2897 - XLOG_STATE_DONE_SYNC)) { 2898 - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; 2899 - } 2900 - break; 2901 - } 2902 - /* 2903 - * We now have an iclog that is in either the 2904 - * DO_CALLBACK or DONE_SYNC states. The other 2905 - * states (WANT_SYNC, SYNCING, or CALLBACK were 2906 - * caught by the above if and are going to 2907 - * clean (i.e. we aren't doing their callbacks) 2908 - * see the above if. 2909 - */ 2660 + cycled_icloglock = true; 2661 + xlog_state_do_iclog_callbacks(log, iclog, aborted); 2910 2662 2911 - /* 2912 - * We will do one more check here to see if we 2913 - * have chased our tail around. 2914 - */ 2915 - 2916 - lowest_lsn = xlog_get_lowest_lsn(log); 2917 - if (lowest_lsn && 2918 - XFS_LSN_CMP(lowest_lsn, 2919 - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2920 - iclog = iclog->ic_next; 2921 - continue; /* Leave this iclog for 2922 - * another thread */ 2923 - } 2924 - 2925 - iclog->ic_state = XLOG_STATE_CALLBACK; 2926 - 2927 - 2928 - /* 2929 - * Completion of a iclog IO does not imply that 2930 - * a transaction has completed, as transactions 2931 - * can be large enough to span many iclogs. We 2932 - * cannot change the tail of the log half way 2933 - * through a transaction as this may be the only 2934 - * transaction in the log and moving th etail to 2935 - * point to the middle of it will prevent 2936 - * recovery from finding the start of the 2937 - * transaction. Hence we should only update the 2938 - * last_sync_lsn if this iclog contains 2939 - * transaction completion callbacks on it. 
2940 - * 2941 - * We have to do this before we drop the 2942 - * icloglock to ensure we are the only one that 2943 - * can update it. 2944 - */ 2945 - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2946 - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2947 - if (!list_empty_careful(&iclog->ic_callbacks)) 2948 - atomic64_set(&log->l_last_sync_lsn, 2949 - be64_to_cpu(iclog->ic_header.h_lsn)); 2950 - 2951 - } else 2952 - ioerrors++; 2953 - 2954 - spin_unlock(&log->l_icloglock); 2955 - 2956 - /* 2957 - * Keep processing entries in the callback list until 2958 - * we come around and it is empty. We need to 2959 - * atomically see that the list is empty and change the 2960 - * state to DIRTY so that we don't miss any more 2961 - * callbacks being added. 2962 - */ 2963 - spin_lock(&iclog->ic_callback_lock); 2964 - while (!list_empty(&iclog->ic_callbacks)) { 2965 - LIST_HEAD(tmp); 2966 - 2967 - list_splice_init(&iclog->ic_callbacks, &tmp); 2968 - 2969 - spin_unlock(&iclog->ic_callback_lock); 2970 - xlog_cil_process_committed(&tmp, aborted); 2971 - spin_lock(&iclog->ic_callback_lock); 2972 - } 2973 - 2974 - loopdidcallbacks++; 2975 - funcdidcallbacks++; 2976 - 2977 - spin_lock(&log->l_icloglock); 2978 - spin_unlock(&iclog->ic_callback_lock); 2979 - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) 2980 - iclog->ic_state = XLOG_STATE_DIRTY; 2981 - 2982 - /* 2983 - * Transition from DIRTY to ACTIVE if applicable. 2984 - * NOP if STATE_IOERROR. 
2985 - */ 2986 - xlog_state_clean_log(log); 2987 - 2988 - /* wake up threads waiting in xfs_log_force() */ 2989 - wake_up_all(&iclog->ic_force_wait); 2990 - 2663 + xlog_state_clean_iclog(log, iclog); 2991 2664 iclog = iclog->ic_next; 2992 2665 } while (first_iclog != iclog); 2666 + 2667 + did_callbacks |= cycled_icloglock; 2993 2668 2994 2669 if (repeats > 5000) { 2995 2670 flushcnt += repeats; ··· 2895 2776 "%s: possible infinite loop (%d iterations)", 2896 2777 __func__, flushcnt); 2897 2778 } 2898 - } while (!ioerrors && loopdidcallbacks); 2779 + } while (!ioerror && cycled_icloglock); 2899 2780 2900 - #ifdef DEBUG 2901 - /* 2902 - * Make one last gasp attempt to see if iclogs are being left in limbo. 2903 - * If the above loop finds an iclog earlier than the current iclog and 2904 - * in one of the syncing states, the current iclog is put into 2905 - * DO_CALLBACK and the callbacks are deferred to the completion of the 2906 - * earlier iclog. Walk the iclogs in order and make sure that no iclog 2907 - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing 2908 - * states. 2909 - * 2910 - * Note that SYNCING|IOABORT is a valid state so we cannot just check 2911 - * for ic_state == SYNCING. 2912 - */ 2913 - if (funcdidcallbacks) { 2914 - first_iclog = iclog = log->l_iclog; 2915 - do { 2916 - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2917 - /* 2918 - * Terminate the loop if iclogs are found in states 2919 - * which will cause other threads to clean up iclogs. 
2920 - * 2921 - * SYNCING - i/o completion will go through logs 2922 - * DONE_SYNC - interrupt thread should be waiting for 2923 - * l_icloglock 2924 - * IOERROR - give up hope all ye who enter here 2925 - */ 2926 - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2927 - iclog->ic_state & XLOG_STATE_SYNCING || 2928 - iclog->ic_state == XLOG_STATE_DONE_SYNC || 2929 - iclog->ic_state == XLOG_STATE_IOERROR ) 2930 - break; 2931 - iclog = iclog->ic_next; 2932 - } while (first_iclog != iclog); 2933 - } 2934 - #endif 2781 + if (did_callbacks) 2782 + xlog_state_callback_check_state(log); 2935 2783 2936 2784 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2937 - wake = 1; 2938 - spin_unlock(&log->l_icloglock); 2939 - 2940 - if (wake) 2941 2785 wake_up_all(&log->l_flush_wait); 2786 + 2787 + spin_unlock(&log->l_icloglock); 2942 2788 } 2943 2789 2944 2790 ··· 4003 3919 * item committed callback functions will do this again under lock to 4004 3920 * avoid races. 4005 3921 */ 3922 + spin_lock(&log->l_cilp->xc_push_lock); 4006 3923 wake_up_all(&log->l_cilp->xc_commit_wait); 3924 + spin_unlock(&log->l_cilp->xc_push_lock); 4007 3925 xlog_state_do_callback(log, true, NULL); 4008 3926 4009 3927 #ifdef XFSERRORDEBUG

+5 -5

fs/xfs/xfs_log_cil.c

··· 38 38 struct xlog_ticket *tic; 39 39 40 40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, 41 - KM_SLEEP|KM_NOFS); 41 + KM_NOFS); 42 42 43 43 /* 44 44 * set the current reservation to zero so we know to steal the basic ··· 186 186 */ 187 187 kmem_free(lip->li_lv_shadow); 188 188 189 - lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); 189 + lv = kmem_alloc_large(buf_size, KM_NOFS); 190 190 memset(lv, 0, xlog_cil_iovec_space(niovecs)); 191 191 192 192 lv->lv_item = lip; ··· 660 660 if (!cil) 661 661 return 0; 662 662 663 - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 663 + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); 664 664 new_ctx->ticket = xlog_cil_ticket_alloc(log); 665 665 666 666 down_write(&cil->xc_ctx_lock); ··· 1179 1179 struct xfs_cil *cil; 1180 1180 struct xfs_cil_ctx *ctx; 1181 1181 1182 - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 1182 + cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); 1183 1183 if (!cil) 1184 1184 return -ENOMEM; 1185 1185 1186 - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 1186 + ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); 1187 1187 if (!ctx) { 1188 1188 kmem_free(cil); 1189 1189 return -ENOMEM;

+32 -18

fs/xfs/xfs_log_recover.c

··· 97 97 struct xlog *log, 98 98 int nbblks) 99 99 { 100 + int align_mask = xfs_buftarg_dma_alignment(log->l_targ); 101 + 100 102 /* 101 103 * Pass log block 0 since we don't have an addr yet, buffer will be 102 104 * verified on read. ··· 127 125 if (nbblks > 1 && log->l_sectBBsize > 1) 128 126 nbblks += log->l_sectBBsize; 129 127 nbblks = round_up(nbblks, log->l_sectBBsize); 130 - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); 128 + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); 131 129 } 132 130 133 131 /* ··· 1962 1960 } 1963 1961 } 1964 1962 1965 - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1963 + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); 1966 1964 bcp->bc_blkno = buf_f->blf_blkno; 1967 1965 bcp->bc_len = buf_f->blf_len; 1968 1966 bcp->bc_refcount = 1; ··· 2932 2930 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 2933 2931 in_f = item->ri_buf[0].i_addr; 2934 2932 } else { 2935 - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); 2933 + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); 2936 2934 need_free = 1; 2937 2935 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2938 2936 if (error) ··· 4163 4161 { 4164 4162 xlog_recover_item_t *item; 4165 4163 4166 - item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 4164 + item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); 4167 4165 INIT_LIST_HEAD(&item->ri_list); 4168 4166 list_add_tail(&item->ri_list, head); 4169 4167 } ··· 4203 4201 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 4204 4202 old_len = item->ri_buf[item->ri_cnt-1].i_len; 4205 4203 4206 - ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 4204 + ptr = kmem_realloc(old_ptr, len + old_len, 0); 4207 4205 memcpy(&ptr[old_len], dp, len); 4208 4206 item->ri_buf[item->ri_cnt-1].i_len += len; 4209 4207 item->ri_buf[item->ri_cnt-1].i_addr = ptr; ··· 4263 4261 return 0; 4264 4262 } 4265 4263 4266 - ptr = kmem_alloc(len, KM_SLEEP); 4264 + ptr = 
kmem_alloc(len, 0); 4267 4265 memcpy(ptr, dp, len); 4268 4266 in_f = (struct xfs_inode_log_format *)ptr; 4269 4267 ··· 4291 4289 item->ri_total = in_f->ilf_size; 4292 4290 item->ri_buf = 4293 4291 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 4294 - KM_SLEEP); 4292 + 0); 4295 4293 } 4296 4294 ASSERT(item->ri_total > item->ri_cnt); 4297 4295 /* Description region is ri_buf[0] */ ··· 4425 4423 * This is a new transaction so allocate a new recovery container to 4426 4424 * hold the recovery ops that will follow. 4427 4425 */ 4428 - trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4426 + trans = kmem_zalloc(sizeof(struct xlog_recover), 0); 4429 4427 trans->r_log_tid = tid; 4430 4428 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4431 4429 INIT_LIST_HEAD(&trans->r_itemq); ··· 5024 5022 } 5025 5023 5026 5024 /* 5027 - * xlog_iunlink_recover 5025 + * Recover AGI unlinked lists 5028 5026 * 5029 - * This is called during recovery to process any inodes which 5030 - * we unlinked but not freed when the system crashed. These 5031 - * inodes will be on the lists in the AGI blocks. What we do 5032 - * here is scan all the AGIs and fully truncate and free any 5033 - * inodes found on the lists. Each inode is removed from the 5034 - * lists when it has been fully truncated and is freed. The 5035 - * freeing of the inode and its removal from the list must be 5036 - * atomic. 5027 + * This is called during recovery to process any inodes which we unlinked but 5028 + * not freed when the system crashed. These inodes will be on the lists in the 5029 + * AGI blocks. What we do here is scan all the AGIs and fully truncate and free 5030 + * any inodes found on the lists. Each inode is removed from the lists when it 5031 + * has been fully truncated and is freed. The freeing of the inode and its 5032 + * removal from the list must be atomic. 
5033 + * 5034 + * If everything we touch in the agi processing loop is already in memory, this 5035 + * loop can hold the cpu for a long time. It runs without lock contention, 5036 + * memory allocation contention, the need to wait for IO, etc, and so will run 5037 + * until we either run out of inodes to process, run low on memory or we run out 5038 + * of log space. 5039 + * 5040 + * This behaviour is bad for latency on single CPU and non-preemptible kernels, 5041 + * and can prevent other filesystem work (such as CIL pushes) from running. This 5042 + * can lead to deadlocks if the recovery process runs out of log reservation 5043 + * space. Hence we need to yield the CPU when there is other kernel work 5044 + * scheduled on this CPU to ensure other scheduled work can run without undue 5045 + * latency. 5037 5046 */ 5038 5047 STATIC void 5039 5048 xlog_recover_process_iunlinks( ··· 5091 5078 while (agino != NULLAGINO) { 5092 5079 agino = xlog_recover_process_one_iunlink(mp, 5093 5080 agno, agino, bucket); 5081 + cond_resched(); 5094 5082 } 5095 5083 } 5096 5084 xfs_buf_rele(agibp); ··· 5541 5527 */ 5542 5528 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 5543 5529 sizeof(struct list_head), 5544 - KM_SLEEP); 5530 + 0); 5545 5531 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 5546 5532 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 5547 5533

+2 -2

fs/xfs/xfs_mount.c

··· 82 82 if (hole < 0) { 83 83 xfs_uuid_table = kmem_realloc(xfs_uuid_table, 84 84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 85 - KM_SLEEP); 85 + 0); 86 86 hole = xfs_uuid_table_size++; 87 87 } 88 88 xfs_uuid_table[hole] = *uuid; ··· 214 214 215 215 spin_lock(&mp->m_perag_lock); 216 216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 217 - BUG(); 217 + WARN_ON_ONCE(1); 218 218 spin_unlock(&mp->m_perag_lock); 219 219 radix_tree_preload_end(); 220 220 error = -EEXIST;

-7

fs/xfs/xfs_mount.h

··· 327 327 } 328 328 329 329 /* per-AG block reservation data structures*/ 330 - enum xfs_ag_resv_type { 331 - XFS_AG_RESV_NONE = 0, 332 - XFS_AG_RESV_AGFL, 333 - XFS_AG_RESV_METADATA, 334 - XFS_AG_RESV_RMAPBT, 335 - }; 336 - 337 330 struct xfs_ag_resv { 338 331 /* number of blocks originally reserved here */ 339 332 xfs_extlen_t ar_orig_reserved;

+2 -2

fs/xfs/xfs_mru_cache.c

··· 333 333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 334 334 return -EINVAL; 335 335 336 - if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 336 + if (!(mru = kmem_zalloc(sizeof(*mru), 0))) 337 337 return -ENOMEM; 338 338 339 339 /* An extra list is needed to avoid reaping up to a grp_time early. */ 340 340 mru->grp_count = grp_count + 1; 341 - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 341 + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); 342 342 343 343 if (!mru->lists) { 344 344 err = -ENOMEM;

+2 -2

fs/xfs/xfs_qm.c

··· 642 642 643 643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 644 644 645 - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 645 + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0); 646 646 647 647 error = list_lru_init(&qinf->qi_lru); 648 648 if (error) ··· 978 978 if (qip->i_d.di_nblocks == 0) 979 979 return 0; 980 980 981 - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 981 + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); 982 982 983 983 lblkno = 0; 984 984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);

+7 -9

fs/xfs/xfs_refcount_item.c

··· 144 144 ASSERT(nextents > 0); 145 145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 146 146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 147 - KM_SLEEP); 147 + 0); 148 148 else 149 - cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); 149 + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); 150 150 151 151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 152 152 cuip->cui_format.cui_nextents = nextents; ··· 223 223 { 224 224 struct xfs_cud_log_item *cudp; 225 225 226 - cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); 226 + cudp = kmem_zone_zalloc(xfs_cud_zone, 0); 227 227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 228 228 &xfs_cud_item_ops); 229 229 cudp->cud_cuip = cuip; ··· 555 555 irec.br_blockcount = new_len; 556 556 switch (type) { 557 557 case XFS_REFCOUNT_INCREASE: 558 - error = xfs_refcount_increase_extent(tp, &irec); 558 + xfs_refcount_increase_extent(tp, &irec); 559 559 break; 560 560 case XFS_REFCOUNT_DECREASE: 561 - error = xfs_refcount_decrease_extent(tp, &irec); 561 + xfs_refcount_decrease_extent(tp, &irec); 562 562 break; 563 563 case XFS_REFCOUNT_ALLOC_COW: 564 - error = xfs_refcount_alloc_cow_extent(tp, 564 + xfs_refcount_alloc_cow_extent(tp, 565 565 irec.br_startblock, 566 566 irec.br_blockcount); 567 567 break; 568 568 case XFS_REFCOUNT_FREE_COW: 569 - error = xfs_refcount_free_cow_extent(tp, 569 + xfs_refcount_free_cow_extent(tp, 570 570 irec.br_startblock, 571 571 irec.br_blockcount); 572 572 break; 573 573 default: 574 574 ASSERT(0); 575 575 } 576 - if (error) 577 - goto abort_error; 578 576 requeue_only = true; 579 577 } 580 578 }

+6 -17

fs/xfs/xfs_reflink.c

··· 495 495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 496 496 497 497 /* Free the CoW orphan record. */ 498 - error = xfs_refcount_free_cow_extent(*tpp, 499 - del.br_startblock, del.br_blockcount); 500 - if (error) 501 - break; 498 + xfs_refcount_free_cow_extent(*tpp, del.br_startblock, 499 + del.br_blockcount); 502 500 503 501 xfs_bmap_add_free(*tpp, del.br_startblock, 504 502 del.br_blockcount, NULL); ··· 673 675 trace_xfs_reflink_cow_remap(ip, &del); 674 676 675 677 /* Free the CoW orphan record. */ 676 - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 677 - del.br_blockcount); 678 - if (error) 679 - goto out_cancel; 678 + xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); 680 679 681 680 /* Map the new blocks into the data fork. */ 682 - error = xfs_bmap_map_extent(tp, ip, &del); 683 - if (error) 684 - goto out_cancel; 681 + xfs_bmap_map_extent(tp, ip, &del); 685 682 686 683 /* Charge this new data fork mapping to the on-disk quota. */ 687 684 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, ··· 1063 1070 uirec.br_blockcount, uirec.br_startblock); 1064 1071 1065 1072 /* Update the refcount tree */ 1066 - error = xfs_refcount_increase_extent(tp, &uirec); 1067 - if (error) 1068 - goto out_cancel; 1073 + xfs_refcount_increase_extent(tp, &uirec); 1069 1074 1070 1075 /* Map the new blocks into the data fork. */ 1071 - error = xfs_bmap_map_extent(tp, ip, &uirec); 1072 - if (error) 1073 - goto out_cancel; 1076 + xfs_bmap_map_extent(tp, ip, &uirec); 1074 1077 1075 1078 /* Update quota accounting. */ 1076 1079 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,

+3 -3

fs/xfs/xfs_rmap_item.c

··· 142 142 143 143 ASSERT(nextents > 0); 144 144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 145 - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); 145 + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); 146 146 else 147 - ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 147 + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); 148 148 149 149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 150 150 ruip->rui_format.rui_nextents = nextents; ··· 244 244 { 245 245 struct xfs_rud_log_item *rudp; 246 246 247 - rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); 247 + rudp = kmem_zone_zalloc(xfs_rud_zone, 0); 248 248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 249 249 &xfs_rud_item_ops); 250 250 rudp->rud_ruip = ruip;

+2 -2

fs/xfs/xfs_rtalloc.c

··· 865 865 * lower bound on the minimum level with any free extents. We can 866 866 * continue without the cache if it couldn't be allocated. 867 867 */ 868 - mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); 868 + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0); 869 869 if (!mp->m_rsum_cache) 870 870 xfs_warn(mp, "could not allocate realtime summary cache"); 871 871 } ··· 963 963 /* 964 964 * Allocate a new (fake) mount/sb. 965 965 */ 966 - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); 966 + nmp = kmem_alloc(sizeof(*nmp), 0); 967 967 /* 968 968 * Loop over the bitmap blocks. 969 969 * We will do everything one bitmap block at a time.

+2 -1

fs/xfs/xfs_super.c

··· 818 818 goto out_destroy_buf; 819 819 820 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 821 - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 822 + 0, mp->m_fsname); 822 823 if (!mp->m_cil_workqueue) 823 824 goto out_destroy_unwritten; 824 825

+34

fs/xfs/xfs_trace.h

··· 23 23 struct xlog_ticket; 24 24 struct xlog_recover; 25 25 struct xlog_recover_item; 26 + struct xlog_rec_header; 26 27 struct xfs_buf_log_format; 27 28 struct xfs_inode_log_format; 28 29 struct xfs_bmbt_irec; ··· 31 30 struct xfs_refcount_irec; 32 31 struct xfs_fsmap; 33 32 struct xfs_rmap_irec; 33 + struct xfs_icreate_log; 34 + struct xfs_owner_info; 35 + struct xfs_trans_res; 36 + struct xfs_inobt_rec_incore; 34 37 35 38 DECLARE_EVENT_CLASS(xfs_attr_list_class, 36 39 TP_PROTO(struct xfs_attr_list_context *ctx), ··· 3579 3574 MAJOR(__entry->dev), MINOR(__entry->dev), 3580 3575 __entry->nr_threads, __entry->pid) 3581 3576 ) 3577 + 3578 + DECLARE_EVENT_CLASS(xfs_kmem_class, 3579 + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), 3580 + TP_ARGS(size, flags, caller_ip), 3581 + TP_STRUCT__entry( 3582 + __field(ssize_t, size) 3583 + __field(int, flags) 3584 + __field(unsigned long, caller_ip) 3585 + ), 3586 + TP_fast_assign( 3587 + __entry->size = size; 3588 + __entry->flags = flags; 3589 + __entry->caller_ip = caller_ip; 3590 + ), 3591 + TP_printk("size %zd flags 0x%x caller %pS", 3592 + __entry->size, 3593 + __entry->flags, 3594 + (char *)__entry->caller_ip) 3595 + ) 3596 + 3597 + #define DEFINE_KMEM_EVENT(name) \ 3598 + DEFINE_EVENT(xfs_kmem_class, name, \ 3599 + TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ 3600 + TP_ARGS(size, flags, caller_ip)) 3601 + DEFINE_KMEM_EVENT(kmem_alloc); 3602 + DEFINE_KMEM_EVENT(kmem_alloc_io); 3603 + DEFINE_KMEM_EVENT(kmem_alloc_large); 3604 + DEFINE_KMEM_EVENT(kmem_realloc); 3605 + DEFINE_KMEM_EVENT(kmem_zone_alloc); 3582 3606 3583 3607 #endif /* _TRACE_XFS_H */ 3584 3608

+2 -2

fs/xfs/xfs_trans.c

··· 90 90 91 91 trace_xfs_trans_dup(tp, _RET_IP_); 92 92 93 - ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 93 + ntp = kmem_zone_zalloc(xfs_trans_zone, 0); 94 94 95 95 /* 96 96 * Initialize the new transaction structure. ··· 263 263 * GFP_NOFS allocation context so that we avoid lockdep false positives 264 264 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 265 265 */ 266 - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 266 + tp = kmem_zone_zalloc(xfs_trans_zone, 0); 267 267 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 268 268 sb_start_intwrite(mp->m_super); 269 269

+1 -1

fs/xfs/xfs_trans_dquot.c

··· 863 863 xfs_trans_alloc_dqinfo( 864 864 xfs_trans_t *tp) 865 865 { 866 - tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); 866 + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0); 867 867 } 868 868 869 869 void

+1 -1

fs/xfs/xfs_xattr.c

··· 30 30 value = NULL; 31 31 } 32 32 33 - error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 33 + error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); 34 34 if (error) 35 35 return error; 36 36 return asize;

+2

include/linux/fs.h

··· 3543 3543 /* mm/fadvise.c */ 3544 3544 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3545 3545 int advice); 3546 + extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, 3547 + int advice); 3546 3548 3547 3549 #if defined(CONFIG_IO_URING) 3548 3550 extern struct sock *io_uring_get_socket(struct file *file);

+2 -2

mm/fadvise.c

··· 27 27 * deactivate the pages and clear PG_Referenced. 28 28 */ 29 29 30 - static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 31 - int advice) 30 + int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 32 31 { 33 32 struct inode *inode; 34 33 struct address_space *mapping; ··· 177 178 } 178 179 return 0; 179 180 } 181 + EXPORT_SYMBOL(generic_fadvise); 180 182 181 183 int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 182 184 {

+16 -6

mm/madvise.c

··· 14 14 #include <linux/userfaultfd_k.h> 15 15 #include <linux/hugetlb.h> 16 16 #include <linux/falloc.h> 17 + #include <linux/fadvise.h> 17 18 #include <linux/sched.h> 18 19 #include <linux/ksm.h> 19 20 #include <linux/fs.h> ··· 276 275 unsigned long start, unsigned long end) 277 276 { 278 277 struct file *file = vma->vm_file; 278 + loff_t offset; 279 279 280 280 *prev = vma; 281 281 #ifdef CONFIG_SWAP ··· 300 298 return 0; 301 299 } 302 300 303 - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 304 - if (end > vma->vm_end) 305 - end = vma->vm_end; 306 - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 307 - 308 - force_page_cache_readahead(file->f_mapping, file, start, end - start); 301 + /* 302 + * Filesystem's fadvise may need to take various locks. We need to 303 + * explicitly grab a reference because the vma (and hence the 304 + * vma's reference to the file) can go away as soon as we drop 305 + * mmap_sem. 306 + */ 307 + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 308 + get_file(file); 309 + up_read(&current->mm->mmap_sem); 310 + offset = (loff_t)(start - vma->vm_start) 311 + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 312 + vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 313 + fput(file); 314 + down_read(&current->mm->mmap_sem); 309 315 return 0; 310 316 } 311 317