Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'xfs-5.9-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Darrick Wong:
"There are quite a few changes in this release, the most notable of
which is that we've made inode flushing fully asynchronous, and we no
longer block memory reclaim on this.

Furthermore, we have fixed a long-standing bug in the quota code where
soft limit warnings and inode limits were never tracked properly.

Moving further down the line, the reflink control loops have been
redesigned to behave more efficiently; and numerous small bugs have
been fixed (see below). The xattr and quota code have been extensively
refactored in preparation for more new features coming down the line.

Finally, the behavior of DAX between ext4 and xfs has been stabilized,
which gets us a step closer to removing the experimental tag from that
feature.

We have a few new contributors this time around. Welcome, all!

I anticipate a second pull request next week for a few small bugfixes
that have been trickling in, but this is it for big changes.

Summary:

- Fix some btree block pingponging problems when swapping extents

- Redesign the reflink copy loop so that we only run one remapping
operation per transaction. This helps us avoid running out of block
reservation on highly deduped filesystems.

- Take the MMAPLOCK around filemap_map_pages.

- Make inode reclaim fully async so that we avoid stalling processes
on flushing inodes to disk.

- Reduce inode cluster buffer RMW cycles by attaching the buffer to
dirty inodes so we won't let go of the cluster buffer when we know
we're going to need it soon.

- Add some more checks to the realtime bitmap file scrubber.

- Don't trip false lockdep warnings in fs freeze.

- Remove various redundant lines of code.

- Remove unnecessary calls to xfs_perag_{get,put}.

- Preserve I_VERSION state across remounts.

- Fix an unmount hang due to AIL going to sleep with a non-empty
delwri buffer list.

- Fix an error in the inode allocation space reservation macro that
caused regressions in generic/531.

- Fix a potential livelock when dquot flush fails because the dquot
buffer is locked.

- Fix a miscalculation when reserving inode quota that could cause
users to exceed a hardlimit.

- Refactor struct xfs_dquot to use native types for incore fields
instead of abusing the ondisk struct for this purpose. This will
eventually enable proper y2038+ support, but for now it merely
cleans up the quota function declarations.

- Actually increment the quota softlimit warning counter so that soft
failures turn into hard(er) failures when they exceed the softlimit
warning counter limits set by the administrator.

- Split incore dquot state flags into their own field and namespace,
to avoid mixing them with quota type flags.

- Create a new quota type flags namespace so that we can make it
obvious when a quota function takes a quota type (user, group,
project) as an argument.

- Rename the ondisk dquot flags field to type, as that more
accurately represents what we store in it.

- Drop our bespoke memory allocation flags in favor of GFP_*.

- Rearrange the xattr functions so that we no longer mix metadata
updates and transaction management (e.g. rolling complex
transactions) in the same functions. This work will prepare us for
atomic xattr operations (itself a prerequisite for directory
backrefs) in future release cycles.

- Support FS_DAX_FL (aka FS_XFLAG_DAX) via GETFLAGS/SETFLAGS"

* tag 'xfs-5.9-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (117 commits)
fs/xfs: Support that ioctl(SETXFLAGS/GETXFLAGS) can set/get inode DAX on XFS.
xfs: Lift -ENOSPC handler from xfs_attr_leaf_addname
xfs: Simplify xfs_attr_node_addname
xfs: Simplify xfs_attr_leaf_addname
xfs: Add helper function xfs_attr_node_removename_rmt
xfs: Add helper function xfs_attr_node_removename_setup
xfs: Add remote block helper functions
xfs: Add helper function xfs_attr_leaf_mark_incomplete
xfs: Add helpers xfs_attr_is_shortform and xfs_attr_set_shortform
xfs: Remove xfs_trans_roll in xfs_attr_node_removename
xfs: Remove unneeded xfs_trans_roll_inode calls
xfs: Add helper function xfs_attr_node_shrink
xfs: Pull up xfs_attr_rmtval_invalidate
xfs: Refactor xfs_attr_rmtval_remove
xfs: Pull up trans roll in xfs_attr3_leaf_clearflag
xfs: Factor out xfs_attr_rmtval_invalidate
xfs: Pull up trans roll from xfs_attr3_leaf_setflag
xfs: Refactor xfs_attr_try_sf_addname
xfs: Split apart xfs_attr_leaf_addname
xfs: Pull up trans handling in xfs_attr3_leaf_flipflags
...

+3285 -2964
fs/xfs/kmem.c (-21)

···
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
 }
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	int		retries = 0;
-	gfp_t		lflags = kmem_flags_convert(flags);
-	void		*ptr;
-
-	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
-	do {
-		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & KM_MAYFAIL))
-			return ptr;
-		if (!(++retries % 100))
-			xfs_err(NULL,
-	"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
-				current->comm, current->pid,
-				__func__, lflags);
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
-	} while (1);
-}
fs/xfs/kmem.h (-8)

···
 #define kmem_zone	kmem_cache
 #define kmem_zone_t	struct kmem_cache
 
-extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-
-static inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
-	return kmem_zone_alloc(zone, flags | KM_ZERO);
-}
-
 static inline struct page *
 kmem_to_page(void *addr)
 {
fs/xfs/libxfs/xfs_ag.c (+2 -2)

···
 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
 	if (error)
 		goto out_agi;
-	pag = xfs_perag_get(mp, agno);
+
+	pag = agi_bp->b_pag;
 
 	/* Fill out form. */
 	memset(ageo, 0, sizeof(*ageo));
···
 	xfs_ag_geom_health(pag, ageo);
 
 	/* Release resources. */
-	xfs_perag_put(pag);
 	xfs_buf_relse(agf_bp);
 out_agi:
 	xfs_buf_relse(agi_bp);
fs/xfs/libxfs/xfs_ag_resv.h (-12)

···
 	xfs_perag_put(pag);
 }
 
-static inline void
-xfs_ag_resv_rmapbt_free(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno)
-{
-	struct xfs_perag	*pag;
-
-	pag = xfs_perag_get(mp, agno);
-	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
-	xfs_perag_put(pag);
-}
-
 #endif	/* __XFS_AG_RESV_H__ */
fs/xfs/libxfs/xfs_alloc.c (+9 -16)

···
 STATIC int
 xfs_alloc_update_counters(
 	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
 	struct xfs_buf		*agbp,
 	long			len)
 {
 	struct xfs_agf		*agf = agbp->b_addr;
 
-	pag->pagf_freeblks += len;
+	agbp->b_pag->pagf_freeblks += len;
 	be32_add_cpu(&agf->agf_freeblks, len);
 
 	xfs_trans_agblocks_delta(tp, len);
···
 	}
 
 	if (!args->wasfromfl) {
-		error = xfs_alloc_update_counters(args->tp, args->pag,
-						  args->agbp,
+		error = xfs_alloc_update_counters(args->tp, args->agbp,
 						  -((long)(args->len)));
 		if (error)
 			return error;
···
 	enum xfs_ag_resv_type	type)
 {
 	struct xfs_mount	*mp;
-	struct xfs_perag	*pag;
 	struct xfs_btree_cur	*bno_cur;
 	struct xfs_btree_cur	*cnt_cur;
 	xfs_agblock_t		gtbno;	/* start of right neighbor */
···
 	/*
 	 * Update the freespace totals in the ag and superblock.
 	 */
-	pag = xfs_perag_get(mp, agno);
-	error = xfs_alloc_update_counters(tp, pag, agbp, len);
-	xfs_ag_resv_free_extent(pag, type, tp, len);
-	xfs_perag_put(pag);
+	error = xfs_alloc_update_counters(tp, agbp, len);
+	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
 	if (error)
 		goto error0;
···
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+			       GFP_KERNEL | __GFP_NOFAIL);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;
···
 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
 		agf->agf_flfirst = 0;
 
-	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	pag = agbp->b_pag;
 	ASSERT(!pag->pagf_agflreset);
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
···
 		pag->pagf_btreeblks++;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
-	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
···
 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
 		agf->agf_fllast = 0;
 
-	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	pag = agbp->b_pag;
 	ASSERT(!pag->pagf_agflreset);
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
···
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
-	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
···
 	ASSERT(!(*bpp)->b_error);
 
 	agf = (*bpp)->b_addr;
-	pag = xfs_perag_get(mp, agno);
+	pag = (*bpp)->b_pag;
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
···
 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
-	xfs_perag_put(pag);
 	return 0;
 }
fs/xfs/libxfs/xfs_alloc_btree.c (+3 -7)

···
 {
 	struct xfs_buf		*agbp = cur->bc_ag.agbp;
 	struct xfs_agf		*agf = agbp->b_addr;
-	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
 	int			btnum = cur->bc_btnum;
-	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
+	struct xfs_perag	*pag = agbp->b_pag;
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
 	pag->pagf_levels[btnum] += inc;
-	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
···
 	int			reason)
 {
 	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;
-	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
 	struct xfs_perag	*pag;
 	__be32			len;
 	int			numrecs;
···
 	}
 
 	agf->agf_longest = len;
-	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag = cur->bc_ag.agbp->b_pag;
 	pag->pagf_longest = be32_to_cpu(len);
-	xfs_perag_put(pag);
 	xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
 }
···
 
 	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
 
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
 
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
fs/xfs/libxfs/xfs_attr.c (+542 -351)

···
 STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
 
 /*
  * Internal routines when attribute list is more than one block.
···
 STATIC int xfs_attr_node_get(xfs_da_args_t *args);
 STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
+				 struct xfs_da_state **state);
 STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
 STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
 
···
 	struct xfs_da_args	*args)
 {
 
-	struct xfs_mount	*mp = dp->i_mount;
-	int			error, error2;
+	int			error;
+
+	/*
+	 * Build initial attribute list (if required).
+	 */
+	if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+		xfs_attr_shortform_create(args);
 
 	error = xfs_attr_shortform_addname(args);
 	if (error == -ENOSPC)
···
 	if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
 		xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
 
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
+	if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC)
 		xfs_trans_set_sync(args->trans);
 
-	error2 = xfs_trans_commit(args->trans);
-	args->trans = NULL;
-	return error ? error : error2;
+	return error;
+}
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+	struct xfs_inode	*ip)
+{
+	return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+	       (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+		ip->i_afp->if_nextents == 0);
+}
+
+/*
+ * Attempts to set an attr in shortform, or converts short form to leaf form if
+ * there is not enough room. If the attr is set, the transaction is committed
+ * and set to NULL.
+ */
+STATIC int
+xfs_attr_set_shortform(
+	struct xfs_da_args	*args,
+	struct xfs_buf		**leaf_bp)
+{
+	struct xfs_inode	*dp = args->dp;
+	int			error, error2 = 0;
+
+	/*
+	 * Try to add the attr to the attribute list in the inode.
+	 */
+	error = xfs_attr_try_sf_addname(dp, args);
+	if (error != -ENOSPC) {
+		error2 = xfs_trans_commit(args->trans);
+		args->trans = NULL;
+		return error ? error : error2;
+	}
+	/*
+	 * It won't fit in the shortform, transform to a leaf block. GROT:
+	 * another possible req'mt for a double-split btree op.
+	 */
+	error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+	if (error)
+		return error;
+
+	/*
+	 * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+	 * push cannot grab the half-baked leaf buffer and run into problems
+	 * with the write verifier. Once we're done rolling the transaction we
+	 * can release the hold and add the attr to the leaf.
+	 */
+	xfs_trans_bhold(args->trans, *leaf_bp);
+	error = xfs_defer_finish(&args->trans);
+	xfs_trans_bhold_release(args->trans, *leaf_bp);
+	if (error) {
+		xfs_trans_brelse(args->trans, *leaf_bp);
+		return error;
+	}
+
+	return 0;
 }
 
 /*
···
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_buf		*leaf_bp = NULL;
-	int			error;
+	int			error = 0;
 
 	/*
-	 * If the attribute list is non-existent or a shortform list,
-	 * upgrade it to a single-leaf-block attribute list.
+	 * If the attribute list is already in leaf format, jump straight to
+	 * leaf handling. Otherwise, try to add the attribute to the shortform
+	 * list; if there's no room then convert the list to leaf format and try
+	 * again.
 	 */
-	if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
-	    (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_afp->if_nextents == 0)) {
+	if (xfs_attr_is_shortform(dp)) {
 
 		/*
-		 * Build initial attribute list (if required).
+		 * If the attr was successfully set in shortform, the
+		 * transaction is committed and set to NULL. Otherwise, is it
+		 * converted from shortform to leaf, and the transaction is
+		 * retained.
 		 */
-		if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
-			xfs_attr_shortform_create(args);
+		error = xfs_attr_set_shortform(args, &leaf_bp);
+		if (error || !args->trans)
+			return error;
+	}
 
-		/*
-		 * Try to add the attr to the attribute list in the inode.
-		 */
-		error = xfs_attr_try_sf_addname(dp, args);
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_addname(args);
 		if (error != -ENOSPC)
 			return error;
 
 		/*
-		 * It won't fit in the shortform, transform to a leaf block.
-		 * GROT: another possible req'mt for a double-split btree op.
+		 * Promote the attribute list to the Btree format.
 		 */
-		error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+		error = xfs_attr3_leaf_to_node(args);
 		if (error)
 			return error;
 
 		/*
-		 * Prevent the leaf buffer from being unlocked so that a
-		 * concurrent AIL push cannot grab the half-baked leaf
-		 * buffer and run into problems with the write verifier.
-		 * Once we're done rolling the transaction we can release
-		 * the hold and add the attr to the leaf.
+		 * Finish any deferred work items and roll the transaction once
+		 * more. The goal here is to call node_addname with the inode
+		 * and transaction in the same state (inode locked and joined,
+		 * transaction clean) no matter how we got to this step.
 		 */
-		xfs_trans_bhold(args->trans, leaf_bp);
 		error = xfs_defer_finish(&args->trans);
-		xfs_trans_bhold_release(args->trans, leaf_bp);
-		if (error) {
-			xfs_trans_brelse(args->trans, leaf_bp);
+		if (error)
 			return error;
-		}
+
+		/*
+		 * Commit the current trans (including the inode) and
+		 * start a new one.
+		 */
+		error = xfs_trans_roll_inode(&args->trans, dp);
+		if (error)
+			return error;
 	}
 
-	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-		error = xfs_attr_leaf_addname(args);
-	else
-		error = xfs_attr_node_addname(args);
+	error = xfs_attr_node_addname(args);
 	return error;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+int
+xfs_has_attr(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+	if (!xfs_inode_hasattr(dp))
+		return -ENOATTR;
+
+	if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		return xfs_attr_sf_findname(args, NULL, NULL);
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_hasname(args, &bp);
+
+		if (bp)
+			xfs_trans_brelse(args->trans, bp);
+
+		return error;
+	}
+
+	return xfs_attr_node_hasname(args, NULL);
 }
 
 /*
···
 				args->total, 0, quota_flags);
 		if (error)
 			goto out_trans_cancel;
+
+		error = xfs_has_attr(args);
+		if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
+			goto out_trans_cancel;
+		if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+			goto out_trans_cancel;
+		if (error != -ENOATTR && error != -EEXIST)
+			goto out_trans_cancel;
+
 		error = xfs_attr_set_args(args);
 		if (error)
 			goto out_trans_cancel;
···
 		if (!args->trans)
 			goto out_unlock;
 	} else {
+		error = xfs_has_attr(args);
+		if (error != -EEXIST)
+			goto out_trans_cancel;
+
 		error = xfs_attr_remove_args(args);
 		if (error)
 			goto out_trans_cancel;
···
  * External routines when attribute list is one block
  *========================================================================*/
 
-/*
- * Add a name to the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
- */
-STATIC int
-xfs_attr_leaf_addname(
+/* Store info about a remote block */
+STATIC void
+xfs_attr_save_rmt_blk(
 	struct xfs_da_args	*args)
 {
-	struct xfs_inode	*dp;
-	struct xfs_buf		*bp;
-	int			retval, error, forkoff;
+	args->blkno2 = args->blkno;
+	args->index2 = args->index;
+	args->rmtblkno2 = args->rmtblkno;
+	args->rmtblkcnt2 = args->rmtblkcnt;
+	args->rmtvaluelen2 = args->rmtvaluelen;
+}
 
-	trace_xfs_attr_leaf_addname(args);
+/* Set stored info about a remote block */
+STATIC void
+xfs_attr_restore_rmt_blk(
+	struct xfs_da_args	*args)
+{
+	args->blkno = args->blkno2;
+	args->index = args->index2;
+	args->rmtblkno = args->rmtblkno2;
+	args->rmtblkcnt = args->rmtblkcnt2;
+	args->rmtvaluelen = args->rmtvaluelen2;
+}
 
-	/*
-	 * Read the (only) block in the attribute list in.
-	 */
-	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
+/*
+ * Tries to add an attribute to an inode in leaf form
+ *
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller. On success the attribute is added
+ * and the inode and transaction are left dirty. If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
+ */
+STATIC int
+xfs_attr_leaf_try_add(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+	int			retval;
 
 	/*
 	 * Look up the given attribute in the leaf block.  Figure out if
 	 * the given flags produce an error or call for an atomic rename.
 	 */
-	retval = xfs_attr3_leaf_lookup_int(bp, args);
+	retval = xfs_attr_leaf_hasname(args, &bp);
+	if (retval != -ENOATTR && retval != -EEXIST)
+		return retval;
 	if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
 		goto out_brelse;
 	if (retval == -EEXIST) {
···
 
 		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
-		args->blkno2 = args->blkno;		/* set 2nd entry info*/
-		args->index2 = args->index;
-		args->rmtblkno2 = args->rmtblkno;
-		args->rmtblkcnt2 = args->rmtblkcnt;
-		args->rmtvaluelen2 = args->rmtvaluelen;
+		xfs_attr_save_rmt_blk(args);
 
 		/*
 		 * clear the remote attr state now that it is saved so that the
···
 	}
 
 	/*
-	 * Add the attribute to the leaf block, transitioning to a Btree
-	 * if required.
+	 * Add the attribute to the leaf block
 	 */
-	retval = xfs_attr3_leaf_add(bp, args);
-	if (retval == -ENOSPC) {
-		/*
-		 * Promote the attribute list to the Btree format, then
-		 * Commit that transaction so that the node_addname() call
-		 * can manage its own transactions.
-		 */
-		error = xfs_attr3_leaf_to_node(args);
-		if (error)
-			return error;
-		error = xfs_defer_finish(&args->trans);
-		if (error)
-			return error;
+	return xfs_attr3_leaf_add(bp, args);
 
-		/*
-		 * Commit the current trans (including the inode) and start
-		 * a new one.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, dp);
-		if (error)
-			return error;
+out_brelse:
+	xfs_trans_brelse(args->trans, bp);
+	return retval;
+}
 
-		/*
-		 * Fob the whole rest of the problem off on the Btree code.
-		 */
-		error = xfs_attr_node_addname(args);
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(
+	struct xfs_da_args	*args)
+{
+	int			error, forkoff;
+	struct xfs_buf		*bp = NULL;
+	struct xfs_inode	*dp = args->dp;
+
+	trace_xfs_attr_leaf_addname(args);
+
+	error = xfs_attr_leaf_try_add(args, bp);
+	if (error)
 		return error;
-	}
 
 	/*
 	 * Commit the transaction that added the attr name so that
···
 		return error;
 	}
 
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically.  Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
-		 */
-		error = xfs_attr3_leaf_flipflags(args);
-		if (error)
-			return error;
-
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		args->index = args->index2;
-		args->blkno = args->blkno2;
-		args->rmtblkno = args->rmtblkno2;
-		args->rmtblkcnt = args->rmtblkcnt2;
-		args->rmtvaluelen = args->rmtvaluelen2;
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Read in the block containing the "old" attr, then
-		 * remove the "old" attr from that block (neat, huh!)
-		 */
-		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-					    &bp);
-		if (error)
-			return error;
-
-		xfs_attr3_leaf_remove(bp, args);
-
-		/*
-		 * If the result is small enough, shrink it all into the inode.
-		 */
-		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
-			/* bp is gone due to xfs_da_shrink_inode */
-			if (error)
-				return error;
-			error = xfs_defer_finish(&args->trans);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Commit the remove and start the next trans in series.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, dp);
-
-	} else if (args->rmtblkno > 0) {
+	if (!(args->op_flags & XFS_DA_OP_RENAME)) {
 		/*
 		 * Added a "remote" value, just clear the incomplete flag.
 		 */
-		error = xfs_attr3_leaf_clearflag(args);
+		if (args->rmtblkno > 0)
+			error = xfs_attr3_leaf_clearflag(args);
+
+		return error;
 	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the incomplete
+	 * flags on the "new" and "old" attribute/value pairs so that one
+	 * disappears and one appears atomically. Then we must remove the "old"
+	 * attribute/value pair.
+	 *
+	 * In a separate transaction, set the incomplete flag on the "old" attr
+	 * and clear the incomplete flag on the "new" attr.
+	 */
+
+	error = xfs_attr3_leaf_flipflags(args);
+	if (error)
+		return error;
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_trans_roll_inode(&args->trans, args->dp);
+	if (error)
+		return error;
+
+	/*
+	 * Dismantle the "old" attribute/value pair by removing a "remote" value
+	 * (if it exists).
+	 */
+	xfs_attr_restore_rmt_blk(args);
+
+	if (args->rmtblkno) {
+		error = xfs_attr_rmtval_invalidate(args);
+		if (error)
+			return error;
+
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Read in the block containing the "old" attr, then remove the "old"
+	 * attr from that block (neat, huh!)
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+				    &bp);
+	if (error)
+		return error;
+
+	xfs_attr3_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	forkoff = xfs_attr_shortform_allfit(bp, dp);
+	if (forkoff)
+		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+	/* bp is gone due to xfs_da_shrink_inode */
+
 	return error;
-out_brelse:
-	xfs_trans_brelse(args->trans, bp);
-	return retval;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+	struct xfs_da_args	*args,
+	struct xfs_buf		**bp)
+{
+	int			error = 0;
+
+	error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_lookup_int(*bp, args);
+	if (error != -ENOATTR && error != -EEXIST)
+		xfs_trans_brelse(args->trans, *bp);
+
+	return error;
 }
 
 /*
···
 	 * Remove the attribute.
 	 */
 	dp = args->dp;
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
 
-	error = xfs_attr3_leaf_lookup_int(bp, args);
+	error = xfs_attr_leaf_hasname(args, &bp);
+
 	if (error == -ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
-	}
+	} else if (error != -EEXIST)
+		return error;
 
 	xfs_attr3_leaf_remove(bp, args);
 
 	/*
 	 * If the result is small enough, shrink it all into the inode.
 	 */
-	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+	forkoff = xfs_attr_shortform_allfit(bp, dp);
+	if (forkoff)
+		return xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 	/* bp is gone due to xfs_da_shrink_inode */
-		if (error)
-			return error;
-		error = xfs_defer_finish(&args->trans);
-		if (error)
-			return error;
-	}
+
 	return 0;
 }
···
 
 	trace_xfs_attr_leaf_get(args);
 
-	args->blkno = 0;
-	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
-	if (error)
-		return error;
+	error = xfs_attr_leaf_hasname(args, &bp);
 
-	error = xfs_attr3_leaf_lookup_int(bp, args);
-	if (error != -EEXIST) {
+	if (error == -ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
-	}
+	} else if (error != -EEXIST)
+		return error;
+
+
 	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
 	return error;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ * statep: If not null is set to point at the found state.  Caller will
+ *	   be responsible for freeing the state in this case.
+ */
+STATIC int
+xfs_attr_node_hasname(
+	struct xfs_da_args	*args,
+	struct xfs_da_state	**statep)
+{
+	struct xfs_da_state	*state;
+	int			retval, error;
+
+	state = xfs_da_state_alloc(args);
+	if (statep != NULL)
+		*statep = NULL;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da3_node_lookup_int(state, &retval);
+	if (error) {
+		xfs_da_state_free(state);
+		return error;
+	}
+
+	if (statep != NULL)
+		*statep = state;
+	else
+		xfs_da_state_free(state);
+	return retval;
 }
 
 /*========================================================================
···
 	struct xfs_da_state	*state;
 	struct xfs_da_state_blk	*blk;
 	struct xfs_inode	*dp;
-	struct xfs_mount	*mp;
 	int			retval, error;
 
 	trace_xfs_attr_node_addname(args);
···
 	 * Fill in bucket of arguments/results/context to carry around.
 	 */
 	dp = args->dp;
-	mp = dp->i_mount;
 restart:
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = mp;
-
 	/*
 	 * Search to see if name already exists, and get back a pointer
 	 * to where it should go.
 	 */
-	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error)
+	retval = xfs_attr_node_hasname(args, &state);
+	if (retval != -ENOATTR && retval != -EEXIST)
 		goto out;
+
 	blk = &state->path.blk[ state->path.active-1 ];
 	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
 	if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
···
 
 		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
-		args->blkno2 = args->blkno;		/* set 2nd entry info*/
-		args->index2 = args->index;
-		args->rmtblkno2 = args->rmtblkno;
-		args->rmtblkcnt2 = args->rmtblkcnt;
-		args->rmtvaluelen2 = args->rmtvaluelen;
+		xfs_attr_save_rmt_blk(args);
 
 		/*
 		 * clear the remote attr state now that it is saved so that the
···
 		return error;
 	}
 
-	/*
-	 * If this is an atomic rename operation, we must "flip" the
-	 * incomplete flags on the "new" and "old" attribute/value pairs
-	 * so that one disappears and one appears atomically.  Then we
-	 * must remove the "old" attribute/value pair.
-	 */
-	if (args->op_flags & XFS_DA_OP_RENAME) {
-		/*
-		 * In a separate transaction, set the incomplete flag on the
-		 * "old" attr and clear the incomplete flag on the "new" attr.
-		 */
-		error = xfs_attr3_leaf_flipflags(args);
-		if (error)
-			goto out;
-
-		/*
-		 * Dismantle the "old" attribute/value pair by removing
-		 * a "remote" value (if it exists).
-		 */
-		args->index = args->index2;
-		args->blkno = args->blkno2;
-		args->rmtblkno = args->rmtblkno2;
-		args->rmtblkcnt = args->rmtblkcnt2;
-		args->rmtvaluelen = args->rmtvaluelen2;
-		if (args->rmtblkno) {
-			error = xfs_attr_rmtval_remove(args);
-			if (error)
-				return error;
-		}
-
-		/*
-		 * Re-find the "old" attribute entry after any split ops.
-		 * The INCOMPLETE flag means that we will find the "old"
-		 * attr, not the "new" one.
-		 */
-		args->attr_filter |= XFS_ATTR_INCOMPLETE;
-		state = xfs_da_state_alloc();
-		state->args = args;
-		state->mp = mp;
-		state->inleaf = 0;
-		error = xfs_da3_node_lookup_int(state, &retval);
-		if (error)
-			goto out;
-
-		/*
-		 * Remove the name and update the hashvals in the tree.
-		 */
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-		error = xfs_attr3_leaf_remove(blk->bp, args);
-		xfs_da3_fixhashpath(state, &state->path);
-
-		/*
-		 * Check to see if the tree needs to be collapsed.
-		 */
-		if (retval && (state->path.active > 1)) {
-			error = xfs_da3_join(state);
-			if (error)
-				goto out;
-			error = xfs_defer_finish(&args->trans);
-			if (error)
-				goto out;
-		}
-
-		/*
-		 * Commit and start the next trans in the chain.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, dp);
-		if (error)
-			goto out;
-
-	} else if (args->rmtblkno > 0) {
+	if (!(args->op_flags & XFS_DA_OP_RENAME)) {
 		/*
 		 * Added a "remote" value, just clear the incomplete flag.
1099 869 */ 1100 - error = xfs_attr3_leaf_clearflag(args); 870 + if (args->rmtblkno > 0) 871 + error = xfs_attr3_leaf_clearflag(args); 872 + retval = error; 873 + goto out; 874 + } 875 + 876 + /* 877 + * If this is an atomic rename operation, we must "flip" the incomplete 878 + * flags on the "new" and "old" attribute/value pairs so that one 879 + * disappears and one appears atomically. Then we must remove the "old" 880 + * attribute/value pair. 881 + * 882 + * In a separate transaction, set the incomplete flag on the "old" attr 883 + * and clear the incomplete flag on the "new" attr. 884 + */ 885 + error = xfs_attr3_leaf_flipflags(args); 886 + if (error) 887 + goto out; 888 + /* 889 + * Commit the flag value change and start the next trans in series 890 + */ 891 + error = xfs_trans_roll_inode(&args->trans, args->dp); 892 + if (error) 893 + goto out; 894 + 895 + /* 896 + * Dismantle the "old" attribute/value pair by removing a "remote" value 897 + * (if it exists). 898 + */ 899 + xfs_attr_restore_rmt_blk(args); 900 + 901 + if (args->rmtblkno) { 902 + error = xfs_attr_rmtval_invalidate(args); 903 + if (error) 904 + return error; 905 + 906 + error = xfs_attr_rmtval_remove(args); 907 + if (error) 908 + return error; 909 + } 910 + 911 + /* 912 + * Re-find the "old" attribute entry after any split ops. The INCOMPLETE 913 + * flag means that we will find the "old" attr, not the "new" one. 914 + */ 915 + args->attr_filter |= XFS_ATTR_INCOMPLETE; 916 + state = xfs_da_state_alloc(args); 917 + state->inleaf = 0; 918 + error = xfs_da3_node_lookup_int(state, &retval); 919 + if (error) 920 + goto out; 921 + 922 + /* 923 + * Remove the name and update the hashvals in the tree. 924 + */ 925 + blk = &state->path.blk[state->path.active-1]; 926 + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 927 + error = xfs_attr3_leaf_remove(blk->bp, args); 928 + xfs_da3_fixhashpath(state, &state->path); 929 + 930 + /* 931 + * Check to see if the tree needs to be collapsed. 
932 + */ 933 + if (retval && (state->path.active > 1)) { 934 + error = xfs_da3_join(state); 1101 935 if (error) 1102 936 goto out; 1103 937 } ··· 1102 950 if (error) 1103 951 return error; 1104 952 return retval; 953 + } 954 + 955 + /* 956 + * Shrink an attribute from leaf to shortform 957 + */ 958 + STATIC int 959 + xfs_attr_node_shrink( 960 + struct xfs_da_args *args, 961 + struct xfs_da_state *state) 962 + { 963 + struct xfs_inode *dp = args->dp; 964 + int error, forkoff; 965 + struct xfs_buf *bp; 966 + 967 + /* 968 + * Have to get rid of the copy of this dabuf in the state. 969 + */ 970 + ASSERT(state->path.active == 1); 971 + ASSERT(state->path.blk[0].bp); 972 + state->path.blk[0].bp = NULL; 973 + 974 + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); 975 + if (error) 976 + return error; 977 + 978 + forkoff = xfs_attr_shortform_allfit(bp, dp); 979 + if (forkoff) { 980 + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); 981 + /* bp is gone due to xfs_da_shrink_inode */ 982 + } else 983 + xfs_trans_brelse(args->trans, bp); 984 + 985 + return error; 986 + } 987 + 988 + /* 989 + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers 990 + * for later deletion of the entry. 991 + */ 992 + STATIC int 993 + xfs_attr_leaf_mark_incomplete( 994 + struct xfs_da_args *args, 995 + struct xfs_da_state *state) 996 + { 997 + int error; 998 + 999 + /* 1000 + * Fill in disk block numbers in the state structure 1001 + * so that we can get the buffers back after we commit 1002 + * several transactions in the following calls. 1003 + */ 1004 + error = xfs_attr_fillstate(state); 1005 + if (error) 1006 + return error; 1007 + 1008 + /* 1009 + * Mark the attribute as INCOMPLETE 1010 + */ 1011 + return xfs_attr3_leaf_setflag(args); 1012 + } 1013 + 1014 + /* 1015 + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and 1016 + * the blocks are valid. Attr keys with remote blocks will be marked 1017 + * incomplete. 
1018 + */ 1019 + STATIC 1020 + int xfs_attr_node_removename_setup( 1021 + struct xfs_da_args *args, 1022 + struct xfs_da_state **state) 1023 + { 1024 + int error; 1025 + 1026 + error = xfs_attr_node_hasname(args, state); 1027 + if (error != -EEXIST) 1028 + return error; 1029 + 1030 + ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); 1031 + ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == 1032 + XFS_ATTR_LEAF_MAGIC); 1033 + 1034 + if (args->rmtblkno > 0) { 1035 + error = xfs_attr_leaf_mark_incomplete(args, *state); 1036 + if (error) 1037 + return error; 1038 + 1039 + return xfs_attr_rmtval_invalidate(args); 1040 + } 1041 + 1042 + return 0; 1043 + } 1044 + 1045 + STATIC int 1046 + xfs_attr_node_remove_rmt( 1047 + struct xfs_da_args *args, 1048 + struct xfs_da_state *state) 1049 + { 1050 + int error = 0; 1051 + 1052 + error = xfs_attr_rmtval_remove(args); 1053 + if (error) 1054 + return error; 1055 + 1056 + /* 1057 + * Refill the state structure with buffers, the prior calls released our 1058 + * buffers. 1059 + */ 1060 + return xfs_attr_refillstate(state); 1105 1061 } 1106 1062 1107 1063 /* ··· 1225 965 { 1226 966 struct xfs_da_state *state; 1227 967 struct xfs_da_state_blk *blk; 1228 - struct xfs_inode *dp; 1229 - struct xfs_buf *bp; 1230 - int retval, error, forkoff; 968 + int retval, error; 969 + struct xfs_inode *dp = args->dp; 1231 970 1232 971 trace_xfs_attr_node_removename(args); 1233 972 1234 - /* 1235 - * Tie a string around our finger to remind us where we are. 1236 - */ 1237 - dp = args->dp; 1238 - state = xfs_da_state_alloc(); 1239 - state->args = args; 1240 - state->mp = dp->i_mount; 1241 - 1242 - /* 1243 - * Search to see if name exists, and get back a pointer to it. 
1244 - */ 1245 - error = xfs_da3_node_lookup_int(state, &retval); 1246 - if (error || (retval != -EEXIST)) { 1247 - if (error == 0) 1248 - error = retval; 973 + error = xfs_attr_node_removename_setup(args, &state); 974 + if (error) 1249 975 goto out; 1250 - } 1251 976 1252 977 /* 1253 978 * If there is an out-of-line value, de-allocate the blocks. 1254 979 * This is done before we remove the attribute so that we don't 1255 980 * overflow the maximum size of a transaction and/or hit a deadlock. 1256 981 */ 1257 - blk = &state->path.blk[ state->path.active-1 ]; 1258 - ASSERT(blk->bp != NULL); 1259 - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 1260 982 if (args->rmtblkno > 0) { 1261 - /* 1262 - * Fill in disk block numbers in the state structure 1263 - * so that we can get the buffers back after we commit 1264 - * several transactions in the following calls. 1265 - */ 1266 - error = xfs_attr_fillstate(state); 1267 - if (error) 1268 - goto out; 1269 - 1270 - /* 1271 - * Mark the attribute as INCOMPLETE, then bunmapi() the 1272 - * remote value. 1273 - */ 1274 - error = xfs_attr3_leaf_setflag(args); 1275 - if (error) 1276 - goto out; 1277 - error = xfs_attr_rmtval_remove(args); 1278 - if (error) 1279 - goto out; 1280 - 1281 - /* 1282 - * Refill the state structure with buffers, the prior calls 1283 - * released our buffers. 1284 - */ 1285 - error = xfs_attr_refillstate(state); 983 + error = xfs_attr_node_remove_rmt(args, state); 1286 984 if (error) 1287 985 goto out; 1288 986 } ··· 1274 1056 /* 1275 1057 * If the result is small enough, push it all into the inode. 1276 1058 */ 1277 - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { 1278 - /* 1279 - * Have to get rid of the copy of this dabuf in the state. 
1280 - */ 1281 - ASSERT(state->path.active == 1); 1282 - ASSERT(state->path.blk[0].bp); 1283 - state->path.blk[0].bp = NULL; 1284 - 1285 - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); 1286 - if (error) 1287 - goto out; 1288 - 1289 - if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1290 - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); 1291 - /* bp is gone due to xfs_da_shrink_inode */ 1292 - if (error) 1293 - goto out; 1294 - error = xfs_defer_finish(&args->trans); 1295 - if (error) 1296 - goto out; 1297 - } else 1298 - xfs_trans_brelse(args->trans, bp); 1299 - } 1300 - error = 0; 1059 + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) 1060 + error = xfs_attr_node_shrink(args, state); 1301 1061 1302 1062 out: 1303 - xfs_da_state_free(state); 1063 + if (state) 1064 + xfs_da_state_free(state); 1304 1065 return error; 1305 1066 } 1306 1067 ··· 1395 1198 * Returns 0 on successful retrieval, otherwise an error. 1396 1199 */ 1397 1200 STATIC int 1398 - xfs_attr_node_get(xfs_da_args_t *args) 1201 + xfs_attr_node_get( 1202 + struct xfs_da_args *args) 1399 1203 { 1400 - xfs_da_state_t *state; 1401 - xfs_da_state_blk_t *blk; 1402 - int error, retval; 1403 - int i; 1204 + struct xfs_da_state *state; 1205 + struct xfs_da_state_blk *blk; 1206 + int i; 1207 + int error; 1404 1208 1405 1209 trace_xfs_attr_node_get(args); 1406 - 1407 - state = xfs_da_state_alloc(); 1408 - state->args = args; 1409 - state->mp = args->dp->i_mount; 1410 1210 1411 1211 /* 1412 1212 * Search to see if name exists, and get back a pointer to it. 
1413 1213 */ 1414 - error = xfs_da3_node_lookup_int(state, &retval); 1415 - if (error) { 1416 - retval = error; 1417 - goto out_release; 1418 - } 1419 - if (retval != -EEXIST) 1214 + error = xfs_attr_node_hasname(args, &state); 1215 + if (error != -EEXIST) 1420 1216 goto out_release; 1421 1217 1422 1218 /* 1423 1219 * Get the value, local or "remote" 1424 1220 */ 1425 1221 blk = &state->path.blk[state->path.active - 1]; 1426 - retval = xfs_attr3_leaf_getvalue(blk->bp, args); 1222 + error = xfs_attr3_leaf_getvalue(blk->bp, args); 1427 1223 1428 1224 /* 1429 1225 * If not in a transaction, we have to release all the buffers. 1430 1226 */ 1431 1227 out_release: 1432 - for (i = 0; i < state->path.active; i++) { 1228 + for (i = 0; state != NULL && i < state->path.active; i++) { 1433 1229 xfs_trans_brelse(args->trans, state->path.blk[i].bp); 1434 1230 state->path.blk[i].bp = NULL; 1435 1231 } 1436 1232 1437 - xfs_da_state_free(state); 1438 - return retval; 1233 + if (state) 1234 + xfs_da_state_free(state); 1235 + return error; 1439 1236 } 1440 1237 1441 1238 /* Returns true if the attribute entry name is valid. */
+1
fs/xfs/libxfs/xfs_attr.h
···
 int xfs_attr_get(struct xfs_da_args *args);
 int xfs_attr_set(struct xfs_da_args *args);
 int xfs_attr_set_args(struct xfs_da_args *args);
+int xfs_has_attr(struct xfs_da_args *args);
 int xfs_attr_remove_args(struct xfs_da_args *args);
 bool xfs_attr_namecheck(const void *name, size_t length);
+73 -44
fs/xfs/libxfs/xfs_attr_leaf.c
···
 }
 
 /*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args:  args containing attribute name and namelen
+ * sfep:  If not null, pointer will be set to the last attr entry found on
+ *        -EEXIST.  On -ENOATTR pointer is left at the last entry in the list
+ * basep: If not null, pointer is set to the byte offset of the entry in the
+ *        list on -EEXIST.  On -ENOATTR, pointer is left at the byte offset of
+ *        the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+	struct xfs_da_args	 *args,
+	struct xfs_attr_sf_entry **sfep,
+	unsigned int		 *basep)
+{
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
+	unsigned int		base = sizeof(struct xfs_attr_sf_hdr);
+	int			size = 0;
+	int			end;
+	int			i;
+
+	sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	end = sf->hdr.count;
+	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+			     base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+				    sfe->flags))
+			continue;
+		break;
+	}
+
+	if (sfep != NULL)
+		*sfep = sfe;
+
+	if (basep != NULL)
+		*basep = base;
+
+	if (i == end)
+		return -ENOATTR;
+	return -EEXIST;
+}
+
+/*
 * Add a name/value pair to the shortform attribute list.
 * Overflow from the inode has already been checked for.
 */
 void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+xfs_attr_shortform_add(
+	struct xfs_da_args	*args,
+	int			forkoff)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int i, offset, size;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
-	struct xfs_ifork *ifp;
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
+	int			offset, size;
+	struct xfs_mount	*mp;
+	struct xfs_inode	*dp;
+	struct xfs_ifork	*ifp;
 
 	trace_xfs_attr_sf_add(args);
···
 	ifp = dp->i_afp;
 	ASSERT(ifp->if_flags & XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-		ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval,
-			sfe->flags));
-	}
+	if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
+		ASSERT(0);
 
 	offset = (char *)sfe - (char *)sf;
 	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
···
 * Remove an attribute from the shortform attribute list structure.
 */
 int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
+xfs_attr_shortform_remove(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
-	int base, size=0, end, totsize, i;
-	xfs_mount_t *mp;
-	xfs_inode_t *dp;
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
+	int			size = 0, end, totsize;
+	unsigned int		base;
+	struct xfs_mount	*mp;
+	struct xfs_inode	*dp;
+	int			error;
 
 	trace_xfs_attr_sf_remove(args);
 
 	dp = args->dp;
 	mp = dp->i_mount;
-	base = sizeof(xfs_attr_sf_hdr_t);
 	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-	sfe = &sf->list[0];
-	end = sf->hdr.count;
-	for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
-					base += size, i++) {
-		size = XFS_ATTR_SF_ENTSIZE(sfe);
-		if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
-				sfe->flags))
-			break;
-	}
-	if (i == end)
-		return -ENOATTR;
+
+	error = xfs_attr_sf_findname(args, &sfe, &base);
+	if (error != -EEXIST)
+		return error;
+	size = XFS_ATTR_SF_ENTSIZE(sfe);
 
 	/*
 	 * Fix up the attribute fork data, covering the hole
···
 			XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
 	}
 
-	/*
-	 * Commit the flag value change and start the next trans in series.
-	 */
-	return xfs_trans_roll_inode(&args->trans, args->dp);
+	return 0;
 }
 
 /*
···
 			XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
 	}
 
-	/*
-	 * Commit the flag value change and start the next trans in series.
-	 */
-	return xfs_trans_roll_inode(&args->trans, args->dp);
+	return 0;
 }
 
 /*
···
 			XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
 	}
 
-	/*
-	 * Commit the flag value change and start the next trans in series.
-	 */
-	error = xfs_trans_roll_inode(&args->trans, args->dp);
-
-	return error;
+	return 0;
 }
+3
fs/xfs/libxfs/xfs_attr_leaf.h
···
 int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
 			struct xfs_buf **leaf_bp);
 int xfs_attr_shortform_remove(struct xfs_da_args *args);
+int xfs_attr_sf_findname(struct xfs_da_args *args,
+			 struct xfs_attr_sf_entry **sfep,
+			 unsigned int *basep);
 int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
 xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
+143 -73
fs/xfs/libxfs/xfs_attr_remote.c
···
 }
 
 /*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
+ * Find a "hole" in the attribute address space large enough for us to drop the
+ * new attribute's value into
 */
-int
-xfs_attr_rmtval_set(
+STATIC int
+xfs_attr_rmt_find_hole(
 	struct xfs_da_args	*args)
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_bmbt_irec	map;
-	xfs_dablk_t		lblkno;
-	xfs_fileoff_t		lfileoff = 0;
-	uint8_t			*src = args->value;
-	int			blkcnt;
-	int			valuelen;
-	int			nmap;
 	int			error;
-	int			offset = 0;
-
-	trace_xfs_attr_rmtval_set(args);
+	int			blkcnt;
+	xfs_fileoff_t		lfileoff = 0;
 
 	/*
-	 * Find a "hole" in the attribute address space large enough for
-	 * us to drop the new attribute's value into. Because CRC enable
-	 * attributes have headers, we can't just do a straight byte to FSB
-	 * conversion and have to take the header space into account.
+	 * Because CRC enable attributes have headers, we can't just do a
+	 * straight byte to FSB conversion and have to take the header space
+	 * into account.
 	 */
 	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
 	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
···
 	if (error)
 		return error;
 
-	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkno = (xfs_dablk_t)lfileoff;
 	args->rmtblkcnt = blkcnt;
 
-	/*
-	 * Roll through the "value", allocating blocks on disk as required.
-	 */
-	while (blkcnt > 0) {
-		/*
-		 * Allocate a single extent, up to the size of the value.
-		 *
-		 * Note that we have to consider this a data allocation as we
-		 * write the remote attribute without logging the contents.
-		 * Hence we must ensure that we aren't using blocks that are on
-		 * the busy list so that we don't overwrite blocks which have
-		 * recently been freed but their transactions are not yet
-		 * committed to disk. If we overwrite the contents of a busy
-		 * extent and then crash then the block may not contain the
-		 * correct metadata after log recovery occurs.
-		 */
-		nmap = 1;
-		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
-				  blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
-				  &nmap);
-		if (error)
-			return error;
-		error = xfs_defer_finish(&args->trans);
-		if (error)
-			return error;
+	return 0;
+}
 
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-		lblkno += map.br_blockcount;
-		blkcnt -= map.br_blockcount;
-
-		/*
-		 * Start the next trans in the chain.
-		 */
-		error = xfs_trans_roll_inode(&args->trans, dp);
-		if (error)
-			return error;
-	}
+STATIC int
+xfs_attr_rmtval_set_value(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_bmbt_irec	map;
+	xfs_dablk_t		lblkno;
+	uint8_t			*src = args->value;
+	int			blkcnt;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			offset = 0;
 
 	/*
 	 * Roll through the "value", copying the attribute value to the
···
 }
 
 /*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_bmbt_irec	map;
+	xfs_dablk_t		lblkno;
+	int			blkcnt;
+	int			nmap;
+	int			error;
+
+	trace_xfs_attr_rmtval_set(args);
+
+	error = xfs_attr_rmt_find_hole(args);
+	if (error)
+		return error;
+
+	blkcnt = args->rmtblkcnt;
+	lblkno = (xfs_dablk_t)args->rmtblkno;
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 *
+		 * Note that we have to consider this a data allocation as we
+		 * write the remote attribute without logging the contents.
+		 * Hence we must ensure that we aren't using blocks that are on
+		 * the busy list so that we don't overwrite blocks which have
+		 * recently been freed but their transactions are not yet
+		 * committed to disk. If we overwrite the contents of a busy
+		 * extent and then crash then the block may not contain the
+		 * correct metadata after log recovery occurs.
+		 */
+		nmap = 1;
+		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
+				  &nmap);
+		if (error)
+			return error;
+		error = xfs_defer_finish(&args->trans);
+		if (error)
+			return error;
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		error = xfs_trans_roll_inode(&args->trans, dp);
+		if (error)
+			return error;
+	}
+
+	return xfs_attr_rmtval_set_value(args);
+}
+
+/*
 * Remove the value associated with an attribute by deleting the
 * out-of-line buffer that it is stored on.
 */
 int
-xfs_attr_rmtval_remove(
+xfs_attr_rmtval_invalidate(
 	struct xfs_da_args	*args)
 {
 	xfs_dablk_t		lblkno;
 	int			blkcnt;
 	int			error;
-	int			done;
-
-	trace_xfs_attr_rmtval_remove(args);
 
 	/*
 	 * Roll through the "value", invalidating the attribute value's blocks.
···
 		lblkno += map.br_blockcount;
 		blkcnt -= map.br_blockcount;
 	}
+	return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+	struct xfs_da_args	*args)
+{
+	int			error;
+	int			retval;
+
+	trace_xfs_attr_rmtval_remove(args);
 
 	/*
 	 * Keep de-allocating extents until the remote-value region is gone.
 	 */
-	lblkno = args->rmtblkno;
-	blkcnt = args->rmtblkcnt;
-	done = 0;
-	while (!done) {
-		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
-				    XFS_BMAPI_ATTRFORK, 1, &done);
-		if (error)
-			return error;
-		error = xfs_defer_finish(&args->trans);
-		if (error)
-			return error;
+	do {
+		retval = __xfs_attr_rmtval_remove(args);
+		if (retval && retval != -EAGAIN)
+			return retval;
 
 		/*
 		 * Close out trans and start the next one in the chain.
···
 		error = xfs_trans_roll_inode(&args->trans, args->dp);
 		if (error)
 			return error;
-	}
+	} while (retval == -EAGAIN);
+
 	return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the out-of-line
+ * buffer that it is stored on. Returns EAGAIN for the caller to refresh the
+ * transaction and re-call the function
+ */
+int
+__xfs_attr_rmtval_remove(
+	struct xfs_da_args	*args)
+{
+	int			error, done;
+
+	/*
+	 * Unmap value blocks for this attr.
+	 */
+	error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
+			    args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
+	if (error)
+		return error;
+
+	error = xfs_defer_finish(&args->trans);
+	if (error)
+		return error;
+
+	if (!done)
+		return -EAGAIN;
+
+	return error;
 }
+2 -1
fs/xfs/libxfs/xfs_attr_remote.h
···
 int xfs_attr_rmtval_remove(struct xfs_da_args *args);
 int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
 		xfs_buf_flags_t incore_flags);
-
+int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
+int __xfs_attr_rmtval_remove(struct xfs_da_args *args);
 #endif /* __XFS_ATTR_REMOTE_H__ */
+6 -2
fs/xfs/libxfs/xfs_bmap.c
···
 #endif
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+			       GFP_KERNEL | __GFP_NOFAIL);
 	new->xefi_startblock = bno;
 	new->xefi_blockcount = (xfs_extlen_t)len;
 	if (oinfo)
···
 	if (error)
 		goto trans_cancel;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
+
+	ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone,
+				      GFP_KERNEL | __GFP_NOFAIL);
+
 	ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
+10 -5
fs/xfs/libxfs/xfs_bmap.h
···
 	{ BMAP_ATTRFORK, "ATTR" }, \
 	{ BMAP_COWFORK, "COW" }
 
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+	return irec->br_startblock != HOLESTARTBLOCK &&
+		irec->br_startblock != DELAYSTARTBLOCK &&
+		!isnullstartblock(irec->br_startblock);
+}
 
 /*
 * Return true if the extent is a real, allocated extent, or false if it is a
 * delayed allocation, and unwritten extent or a hole.
 */
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
 {
-	return irec->br_state != XFS_EXT_UNWRITTEN &&
-		irec->br_startblock != HOLESTARTBLOCK &&
-		irec->br_startblock != DELAYSTARTBLOCK &&
-		!isnullstartblock(irec->br_startblock);
+	return xfs_bmap_is_real_extent(irec) &&
+	       irec->br_state != XFS_EXT_UNWRITTEN;
 }
 
 /*
+1 -1
fs/xfs/libxfs/xfs_bmap_btree.c
···
 	struct xfs_btree_cur	*cur;
 	ASSERT(whichfork != XFS_COW_FORK);
 
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
 
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
+3 -3
fs/xfs/libxfs/xfs_btree_staging.h
···
 	unsigned int		af_blocks;
 };
 
-/* Cursor interactions with with fake roots for AG-rooted btrees. */
+/* Cursor interactions with fake roots for AG-rooted btrees. */
 void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
 		struct xbtree_afakeroot *afake);
 void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
···
 	unsigned int		if_extents;
 };
 
-/* Cursor interactions with with fake roots for inode-rooted btrees. */
+/* Cursor interactions with fake roots for inode-rooted btrees. */
 void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
 		struct xbtree_ifakeroot *ifake,
 		struct xfs_btree_ops **new_ops);
···
 
 	/*
 	 * Number of free records to leave in each leaf block.  If the caller
-	 * sets this to -1, the slack value will be calculated to be be halfway
+	 * sets this to -1, the slack value will be calculated to be halfway
 	 * between maxrecs and minrecs.  This typically leaves the block 75%
 	 * full.  Note that slack values are not enforced on inode root blocks.
 	 */
+9 -3
fs/xfs/libxfs/xfs_da_btree.c
···
 * Allocate a dir-state structure.
 * We don't put them on the stack since they're large.
 */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
+struct xfs_da_state *
+xfs_da_state_alloc(
+	struct xfs_da_args	*args)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+	struct xfs_da_state	*state;
+
+	state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	return state;
 }
 
 /*
+1 -1
fs/xfs/libxfs/xfs_da_btree.h
···
 		const unsigned char *name, int len);
 
 
-xfs_da_state_t *xfs_da_state_alloc(void);
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
 void xfs_da_state_free(xfs_da_state_t *state);
 
 void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
+5 -12
fs/xfs/libxfs/xfs_dir2_node.c
···
 	/*
 	 * Allocate and initialize the state (btree cursor).
 	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);
 	/*
 	 * Look up the name.  We're not supposed to find it, but
 	 * this gives us the insertion point.
···
 	/*
 	 * Allocate and initialize the btree cursor.
 	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);
+
 	/*
 	 * Fill in the path to the entry in the cursor.
 	 */
···
 	/*
 	 * Allocate and initialize the btree cursor.
 	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);
 
 	/* Look up the entry we're deleting, set up the cursor. */
 	error = xfs_da3_node_lookup_int(state, &rval);
···
 	/*
 	 * Allocate and initialize the btree cursor.
 	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
+	state = xfs_da_state_alloc(args);
 
 	/*
 	 * We have to save new inode number and ftype since
+13 -12
fs/xfs/libxfs/xfs_dquot_buf.c
··· 37 37 xfs_dquot_verify( 38 38 struct xfs_mount *mp, 39 39 struct xfs_disk_dquot *ddq, 40 - xfs_dqid_t id, 41 - uint type) /* used only during quotacheck */ 40 + xfs_dqid_t id) /* used only during quotacheck */ 42 41 { 42 + __u8 ddq_type; 43 + 43 44 /* 44 45 * We can encounter an uninitialized dquot buffer for 2 reasons: 45 46 * 1. If we crash while deleting the quotainode(s), and those blks got ··· 61 60 if (ddq->d_version != XFS_DQUOT_VERSION) 62 61 return __this_address; 63 62 64 - if (type && ddq->d_flags != type) 63 + if (ddq->d_type & ~XFS_DQTYPE_ANY) 65 64 return __this_address; 66 - if (ddq->d_flags != XFS_DQ_USER && 67 - ddq->d_flags != XFS_DQ_PROJ && 68 - ddq->d_flags != XFS_DQ_GROUP) 65 + ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK; 66 + if (ddq_type != XFS_DQTYPE_USER && 67 + ddq_type != XFS_DQTYPE_PROJ && 68 + ddq_type != XFS_DQTYPE_GROUP) 69 69 return __this_address; 70 70 71 71 if (id != -1 && id != be32_to_cpu(ddq->d_id)) ··· 97 95 xfs_dqblk_verify( 98 96 struct xfs_mount *mp, 99 97 struct xfs_dqblk *dqb, 100 - xfs_dqid_t id, 101 - uint type) /* used only during quotacheck */ 98 + xfs_dqid_t id) /* used only during quotacheck */ 102 99 { 103 100 if (xfs_sb_version_hascrc(&mp->m_sb) && 104 101 !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) 105 102 return __this_address; 106 103 107 - return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); 104 + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id); 108 105 } 109 106 110 107 /* ··· 114 113 struct xfs_mount *mp, 115 114 struct xfs_dqblk *dqb, 116 115 xfs_dqid_t id, 117 - uint type) 116 + xfs_dqtype_t type) 118 117 { 119 118 /* 120 119 * Typically, a repair is only requested by quotacheck. 
··· 124 123 125 124 dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 126 125 dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; 127 - dqb->dd_diskdq.d_flags = type; 126 + dqb->dd_diskdq.d_type = type; 128 127 dqb->dd_diskdq.d_id = cpu_to_be32(id); 129 128 130 129 if (xfs_sb_version_hascrc(&mp->m_sb)) { ··· 206 205 if (i == 0) 207 206 id = be32_to_cpu(ddq->d_id); 208 207 209 - fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); 208 + fa = xfs_dqblk_verify(mp, &dqb[i], id + i); 210 209 if (fa) { 211 210 if (!readahead) 212 211 xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+31 -5
fs/xfs/libxfs/xfs_format.h
··· 1149 1149 #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ 1150 1150 #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ 1151 1151 1152 + #define XFS_DQTYPE_USER 0x01 /* user dquot record */ 1153 + #define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ 1154 + #define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ 1155 + 1156 + /* bitmask to determine if this is a user/group/project dquot */ 1157 + #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ 1158 + XFS_DQTYPE_PROJ | \ 1159 + XFS_DQTYPE_GROUP) 1160 + 1161 + #define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK) 1162 + 1152 1163 /* 1153 - * This is the main portion of the on-disk representation of quota 1154 - * information for a user. This is the q_core of the struct xfs_dquot that 1155 - * is kept in kernel memory. We pad this with some more expansion room 1156 - * to construct the on disk structure. 1164 + * This is the main portion of the on-disk representation of quota information 1165 + * for a user. We pad this with some more expansion room to construct the on 1166 + * disk structure. 1157 1167 */ 1158 1168 struct xfs_disk_dquot { 1159 1169 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ 1160 1170 __u8 d_version; /* dquot version */ 1161 - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ 1171 + __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */ 1162 1172 __be32 d_id; /* user,project,group id */ 1163 1173 __be64 d_blk_hardlimit;/* absolute limit on disk blks */ 1164 1174 __be64 d_blk_softlimit;/* preferred limit on disk blks */ ··· 1207 1197 } xfs_dqblk_t; 1208 1198 1209 1199 #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) 1200 + 1201 + /* 1202 + * This defines the unit of allocation of dquots. 1203 + * 1204 + * Currently, it is just one file system block, and a 4K blk contains 30 1205 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make 1206 + * this more dynamic. 
1207 + * 1208 + * However, if this number is changed, we have to make sure that we don't 1209 + * implicitly assume that we do allocations in chunks of a single filesystem 1210 + * block in the dquot/xqm code. 1211 + * 1212 + * This is part of the ondisk format because the structure size is not a power 1213 + * of two, which leaves slack at the end of the disk block. 1214 + */ 1215 + #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 1210 1216 1211 1217 /* 1212 1218 * Remote symlink format and access functions.
+7 -21
fs/xfs/libxfs/xfs_ialloc.c
··· 888 888 */ 889 889 be32_add_cpu(&agi->agi_count, newlen); 890 890 be32_add_cpu(&agi->agi_freecount, newlen); 891 - pag = xfs_perag_get(args.mp, agno); 891 + pag = agbp->b_pag; 892 892 pag->pagi_freecount += newlen; 893 893 pag->pagi_count += newlen; 894 - xfs_perag_put(pag); 895 894 agi->agi_newino = cpu_to_be32(newino); 896 895 897 896 /* ··· 1133 1134 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); 1134 1135 xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); 1135 1136 xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); 1136 - struct xfs_perag *pag; 1137 + struct xfs_perag *pag = agbp->b_pag; 1137 1138 struct xfs_btree_cur *cur, *tcur; 1138 1139 struct xfs_inobt_rec_incore rec, trec; 1139 1140 xfs_ino_t ino; ··· 1141 1142 int offset; 1142 1143 int i, j; 1143 1144 int searchdistance = 10; 1144 - 1145 - pag = xfs_perag_get(mp, agno); 1146 1145 1147 1146 ASSERT(pag->pagi_init); 1148 1147 ASSERT(pag->pagi_inodeok); ··· 1381 1384 1382 1385 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1383 1386 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1384 - xfs_perag_put(pag); 1385 1387 *inop = ino; 1386 1388 return 0; 1387 1389 error1: 1388 1390 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 1389 1391 error0: 1390 1392 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1391 - xfs_perag_put(pag); 1392 1393 return error; 1393 1394 } 1394 1395 ··· 1582 1587 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); 1583 1588 xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); 1584 1589 xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); 1585 - struct xfs_perag *pag; 1586 1590 struct xfs_btree_cur *cur; /* finobt cursor */ 1587 1591 struct xfs_btree_cur *icur; /* inobt cursor */ 1588 1592 struct xfs_inobt_rec_incore rec; ··· 1592 1598 1593 1599 if (!xfs_sb_version_hasfinobt(&mp->m_sb)) 1594 1600 return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); 1595 - 1596 - pag = xfs_perag_get(mp, agno); 1597 1601 1598 1602 /* 1599 1603 * If pagino is 0 (this is the root inode allocation) use newino. 
··· 1659 1667 */ 1660 1668 be32_add_cpu(&agi->agi_freecount, -1); 1661 1669 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1662 - pag->pagi_freecount--; 1670 + agbp->b_pag->pagi_freecount--; 1663 1671 1664 1672 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1665 1673 ··· 1672 1680 1673 1681 xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); 1674 1682 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1675 - xfs_perag_put(pag); 1676 1683 *inop = ino; 1677 1684 return 0; 1678 1685 ··· 1679 1688 xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); 1680 1689 error_cur: 1681 1690 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1682 - xfs_perag_put(pag); 1683 1691 return error; 1684 1692 } 1685 1693 ··· 1935 1945 { 1936 1946 struct xfs_agi *agi = agbp->b_addr; 1937 1947 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); 1938 - struct xfs_perag *pag; 1939 1948 struct xfs_btree_cur *cur; 1940 1949 struct xfs_inobt_rec_incore rec; 1941 1950 int ilen; ··· 1996 2007 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1997 2008 rec.ir_free == XFS_INOBT_ALL_FREE && 1998 2009 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { 2010 + struct xfs_perag *pag = agbp->b_pag; 2011 + 1999 2012 xic->deleted = true; 2000 2013 xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 2001 2014 xic->alloc = xfs_inobt_irec_to_allocmask(&rec); ··· 2011 2020 be32_add_cpu(&agi->agi_count, -ilen); 2012 2021 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 2013 2022 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 2014 - pag = xfs_perag_get(mp, agno); 2015 2023 pag->pagi_freecount -= ilen - 1; 2016 2024 pag->pagi_count -= ilen; 2017 - xfs_perag_put(pag); 2018 2025 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 2019 2026 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 2020 2027 ··· 2038 2049 */ 2039 2050 be32_add_cpu(&agi->agi_freecount, 1); 2040 2051 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 2041 - pag = xfs_perag_get(mp, agno); 2042 - pag->pagi_freecount++; 2043 - xfs_perag_put(pag); 2052 + 
agbp->b_pag->pagi_freecount++; 2044 2053 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 2045 2054 } 2046 2055 ··· 2648 2661 return error; 2649 2662 2650 2663 agi = (*bpp)->b_addr; 2651 - pag = xfs_perag_get(mp, agno); 2664 + pag = (*bpp)->b_pag; 2652 2665 if (!pag->pagi_init) { 2653 2666 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 2654 2667 pag->pagi_count = be32_to_cpu(agi->agi_count); ··· 2661 2674 */ 2662 2675 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 2663 2676 XFS_FORCED_SHUTDOWN(mp)); 2664 - xfs_perag_put(pag); 2665 2677 return 0; 2666 2678 } 2667 2679
+1 -1
fs/xfs/libxfs/xfs_ialloc_btree.c
··· 411 411 { 412 412 struct xfs_btree_cur *cur; 413 413 414 - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); 414 + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); 415 415 cur->bc_tp = tp; 416 416 cur->bc_mp = mp; 417 417 cur->bc_btnum = btnum;
+5 -28
fs/xfs/libxfs/xfs_inode_buf.c
··· 21 21 #include <linux/iversion.h> 22 22 23 23 /* 24 - * Check that none of the inode's in the buffer have a next 25 - * unlinked field of 0. 26 - */ 27 - #if defined(DEBUG) 28 - void 29 - xfs_inobp_check( 30 - xfs_mount_t *mp, 31 - xfs_buf_t *bp) 32 - { 33 - int i; 34 - xfs_dinode_t *dip; 35 - 36 - for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { 37 - dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); 38 - if (!dip->di_next_unlinked) { 39 - xfs_alert(mp, 40 - "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", 41 - i, (long long)bp->b_bn); 42 - } 43 - } 44 - } 45 - #endif 46 - 47 - /* 48 24 * If we are doing readahead on an inode buffer, we might be in log recovery 49 25 * reading an inode allocation buffer that hasn't yet been replayed, and hence 50 26 * has not had the inode cores stamped into it. Hence for readahead, the buffer ··· 29 53 * If the readahead buffer is invalid, we need to mark it with an error and 30 54 * clear the DONE status of the buffer so that a followup read will re-read it 31 55 * from disk. We don't report the error otherwise to avoid warnings during log 32 - * recovery and we don't get unnecssary panics on debug kernels. We use EIO here 56 + * recovery and we don't get unnecessary panics on debug kernels. We use EIO here 33 57 * because all we want to do is say readahead failed; there is no-one to report 34 58 * the error to, so this will distinguish it from a non-ra verifier failure. 35 - * Changes to this readahead error behavour also need to be reflected in 59 + * Changes to this readahead error behaviour also need to be reflected in 36 60 * xfs_dquot_buf_readahead_verify(). 37 61 */ 38 62 static void ··· 152 176 } 153 177 154 178 *bpp = bp; 155 - *dipp = xfs_buf_offset(bp, imap->im_boffset); 179 + if (dipp) 180 + *dipp = xfs_buf_offset(bp, imap->im_boffset); 156 181 return 0; 157 182 } 158 183 ··· 180 203 /* 181 204 * First get the permanent information that is needed to allocate an 182 205 * inode. 
If the inode is unused, mode is zero and we shouldn't mess 183 - * with the unitialized part of it. 206 + * with the uninitialized part of it. 184 207 */ 185 208 to->di_flushiter = be16_to_cpu(from->di_flushiter); 186 209 inode->i_generation = be32_to_cpu(from->di_gen);
-6
fs/xfs/libxfs/xfs_inode_buf.h
··· 52 52 void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, 53 53 struct xfs_dinode *to); 54 54 55 - #if defined(DEBUG) 56 - void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); 57 - #else 58 - #define xfs_inobp_check(mp, bp) 59 - #endif /* DEBUG */ 60 - 61 55 xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, 62 56 struct xfs_dinode *dip); 63 57 xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
+3 -3
fs/xfs/libxfs/xfs_inode_fork.c
··· 291 291 * Initialize the extent count early, as the per-format routines may 292 292 * depend on it. 293 293 */ 294 - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); 294 + ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); 295 295 ip->i_afp->if_format = dip->di_aformat; 296 296 if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */ 297 297 ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; ··· 673 673 if (ip->i_cowfp) 674 674 return; 675 675 676 - ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 677 - KM_NOFS); 676 + ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone, 677 + GFP_NOFS | __GFP_NOFAIL); 678 678 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 679 679 ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; 680 680 }
+15 -16
fs/xfs/libxfs/xfs_quota_defs.h
··· 18 18 typedef uint64_t xfs_qcnt_t; 19 19 typedef uint16_t xfs_qwarncnt_t; 20 20 21 + typedef uint8_t xfs_dqtype_t; 22 + 23 + #define XFS_DQTYPE_STRINGS \ 24 + { XFS_DQTYPE_USER, "USER" }, \ 25 + { XFS_DQTYPE_PROJ, "PROJ" }, \ 26 + { XFS_DQTYPE_GROUP, "GROUP" } 27 + 21 28 /* 22 29 * flags for q_flags field in the dquot. 23 30 */ 24 - #define XFS_DQ_USER 0x0001 /* a user quota */ 25 - #define XFS_DQ_PROJ 0x0002 /* project quota */ 26 - #define XFS_DQ_GROUP 0x0004 /* a group quota */ 27 - #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ 28 - #define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ 31 + #define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ 32 + #define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ 29 33 30 - #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 31 - 32 - #define XFS_DQ_FLAGS \ 33 - { XFS_DQ_USER, "USER" }, \ 34 - { XFS_DQ_PROJ, "PROJ" }, \ 35 - { XFS_DQ_GROUP, "GROUP" }, \ 36 - { XFS_DQ_DIRTY, "DIRTY" }, \ 37 - { XFS_DQ_FREEING, "FREEING" } 34 + #define XFS_DQFLAG_STRINGS \ 35 + { XFS_DQFLAG_DIRTY, "DIRTY" }, \ 36 + { XFS_DQFLAG_FREEING, "FREEING" } 38 37 39 38 /* 40 39 * We have the possibility of all three quota types being active at once, and ··· 136 137 #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) 137 138 138 139 extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, 139 - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); 140 + struct xfs_disk_dquot *ddq, xfs_dqid_t id); 140 141 extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, 141 - struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); 142 + struct xfs_dqblk *dqb, xfs_dqid_t id); 142 143 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); 143 144 extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, 144 - xfs_dqid_t id, uint type); 145 + xfs_dqid_t id, xfs_dqtype_t type); 145 146 146 147 #endif /* __XFS_QUOTA_H__ */
+2 -4
fs/xfs/libxfs/xfs_refcount_btree.c
··· 37 37 { 38 38 struct xfs_buf *agbp = cur->bc_ag.agbp; 39 39 struct xfs_agf *agf = agbp->b_addr; 40 - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 41 - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); 40 + struct xfs_perag *pag = agbp->b_pag; 42 41 43 42 ASSERT(ptr->s != 0); 44 43 45 44 agf->agf_refcount_root = ptr->s; 46 45 be32_add_cpu(&agf->agf_refcount_level, inc); 47 46 pag->pagf_refcount_level += inc; 48 - xfs_perag_put(pag); 49 47 50 48 xfs_alloc_log_agf(cur->bc_tp, agbp, 51 49 XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); ··· 323 325 ASSERT(agno != NULLAGNUMBER); 324 326 ASSERT(agno < mp->m_sb.sb_agcount); 325 327 326 - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); 328 + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); 327 329 cur->bc_tp = tp; 328 330 cur->bc_mp = mp; 329 331 cur->bc_btnum = XFS_BTNUM_REFC;
+5 -6
fs/xfs/libxfs/xfs_rmap_btree.c
··· 63 63 { 64 64 struct xfs_buf *agbp = cur->bc_ag.agbp; 65 65 struct xfs_agf *agf = agbp->b_addr; 66 - xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 67 66 int btnum = cur->bc_btnum; 68 - struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); 67 + struct xfs_perag *pag = agbp->b_pag; 69 68 70 69 ASSERT(ptr->s != 0); 71 70 72 71 agf->agf_roots[btnum] = ptr->s; 73 72 be32_add_cpu(&agf->agf_levels[btnum], inc); 74 73 pag->pagf_levels[btnum] += inc; 75 - xfs_perag_put(pag); 76 74 77 75 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 78 76 } ··· 121 123 { 122 124 struct xfs_buf *agbp = cur->bc_ag.agbp; 123 125 struct xfs_agf *agf = agbp->b_addr; 126 + struct xfs_perag *pag; 124 127 xfs_agblock_t bno; 125 128 int error; 126 129 ··· 138 139 XFS_EXTENT_BUSY_SKIP_DISCARD); 139 140 xfs_trans_agbtree_delta(cur->bc_tp, -1); 140 141 141 - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno); 142 - 142 + pag = cur->bc_ag.agbp->b_pag; 143 + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); 143 144 return 0; 144 145 } 145 146 ··· 456 457 { 457 458 struct xfs_btree_cur *cur; 458 459 459 - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); 460 + cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); 460 461 cur->bc_tp = tp; 461 462 cur->bc_mp = mp; 462 463 /* Overlapping btree; 2 keys per pointer. */
+1 -1
fs/xfs/libxfs/xfs_rtbitmap.c
··· 70 70 if (error) 71 71 return error; 72 72 73 - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map))) 73 + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) 74 74 return -EFSCORRUPTED; 75 75 76 76 ASSERT(map.br_startblock != NULLFSBLOCK);
+1
fs/xfs/libxfs/xfs_shared.h
··· 65 65 #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ 66 66 #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ 67 67 #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ 68 + #define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ 68 69 /* 69 70 * LOWMODE is used by the allocator to activate the lowspace algorithm - when 70 71 * free space is running low the extent allocator may choose to allocate an
+78 -32
fs/xfs/libxfs/xfs_trans_inode.c
··· 8 8 #include "xfs_shared.h" 9 9 #include "xfs_format.h" 10 10 #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_mount.h" 11 13 #include "xfs_inode.h" 12 14 #include "xfs_trans.h" 13 15 #include "xfs_trans_priv.h" ··· 38 36 39 37 ASSERT(iip->ili_lock_flags == 0); 40 38 iip->ili_lock_flags = lock_flags; 39 + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); 41 40 42 41 /* 43 42 * Get a log_item_desc to point at the new item. ··· 74 71 } 75 72 76 73 /* 77 - * This is called to mark the fields indicated in fieldmask as needing 78 - * to be logged when the transaction is committed. The inode must 79 - * already be associated with the given transaction. 74 + * This is called to mark the fields indicated in fieldmask as needing to be 75 + * logged when the transaction is committed. The inode must already be 76 + * associated with the given transaction. 80 77 * 81 - * The values for fieldmask are defined in xfs_inode_item.h. We always 82 - * log all of the core inode if any of it has changed, and we always log 83 - * all of the inline data/extents/b-tree root if any of them has changed. 78 + * The values for fieldmask are defined in xfs_inode_item.h. We always log all 79 + * of the core inode if any of it has changed, and we always log all of the 80 + * inline data/extents/b-tree root if any of them has changed. 81 + * 82 + * Grab and pin the cluster buffer associated with this inode to avoid RMW 83 + * cycles at inode writeback time. Avoid the need to add error handling to every 84 + * xfs_trans_log_inode() call by shutting down on read error. This will cause 85 + * transactions to fail and everything to error out, just like if we return a 86 + * read error in a dirty transaction and cancel it. 
84 87 */ 85 88 void 86 89 xfs_trans_log_inode( 87 - xfs_trans_t *tp, 88 - xfs_inode_t *ip, 89 - uint flags) 90 + struct xfs_trans *tp, 91 + struct xfs_inode *ip, 92 + uint flags) 90 93 { 91 - struct inode *inode = VFS_I(ip); 94 + struct xfs_inode_log_item *iip = ip->i_itemp; 95 + struct inode *inode = VFS_I(ip); 96 + uint iversion_flags = 0; 92 97 93 - ASSERT(ip->i_itemp != NULL); 98 + ASSERT(iip); 94 99 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 100 + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); 101 + 102 + tp->t_flags |= XFS_TRANS_DIRTY; 95 103 96 104 /* 97 105 * Don't bother with i_lock for the I_DIRTY_TIME check here, as races ··· 117 103 } 118 104 119 105 /* 120 - * Record the specific change for fdatasync optimisation. This 121 - * allows fdatasync to skip log forces for inodes that are only 122 - * timestamp dirty. We do this before the change count so that 123 - * the core being logged in this case does not impact on fdatasync 124 - * behaviour. 125 - */ 126 - ip->i_itemp->ili_fsync_fields |= flags; 127 - 128 - /* 129 106 * First time we log the inode in a transaction, bump the inode change 130 107 * counter if it is configured for this to occur. While we have the 131 108 * inode locked exclusively for metadata modification, we can usually ··· 125 120 * set however, then go ahead and bump the i_version counter 126 121 * unconditionally. 127 122 */ 128 - if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && 129 - IS_I_VERSION(VFS_I(ip))) { 130 - if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) 131 - flags |= XFS_ILOG_CORE; 123 + if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { 124 + if (IS_I_VERSION(inode) && 125 + inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) 126 + iversion_flags = XFS_ILOG_CORE; 132 127 } 133 128 134 - tp->t_flags |= XFS_TRANS_DIRTY; 129 + /* 130 + * Record the specific change for fdatasync optimisation. 
This allows 131 + * fdatasync to skip log forces for inodes that are only timestamp 132 + * dirty. 133 + */ 134 + spin_lock(&iip->ili_lock); 135 + iip->ili_fsync_fields |= flags; 136 + 137 + if (!iip->ili_item.li_buf) { 138 + struct xfs_buf *bp; 139 + int error; 140 + 141 + /* 142 + * We hold the ILOCK here, so this inode is not going to be 143 + * flushed while we are here. Further, because there is no 144 + * buffer attached to the item, we know that there is no IO in 145 + * progress, so nothing will clear the ili_fields while we read 146 + * in the buffer. Hence we can safely drop the spin lock and 147 + * read the buffer knowing that the state will not change from 148 + * here. 149 + */ 150 + spin_unlock(&iip->ili_lock); 151 + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL, 152 + &bp, 0); 153 + if (error) { 154 + xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); 155 + return; 156 + } 157 + 158 + /* 159 + * We need an explicit buffer reference for the log item but 160 + * don't want the buffer to remain attached to the transaction. 161 + * Hold the buffer but release the transaction reference once 162 + * we've attached the inode log item to the buffer log item 163 + * list. 164 + */ 165 + xfs_buf_hold(bp); 166 + spin_lock(&iip->ili_lock); 167 + iip->ili_item.li_buf = bp; 168 + bp->b_flags |= _XBF_INODES; 169 + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); 170 + xfs_trans_brelse(tp, bp); 171 + } 135 172 136 173 /* 137 - * Always OR in the bits from the ili_last_fields field. 138 - * This is to coordinate with the xfs_iflush() and xfs_iflush_done() 139 - * routines in the eventual clearing of the ili_fields bits. 140 - * See the big comment in xfs_iflush() for an explanation of 141 - * this coordination mechanism. 174 + * Always OR in the bits from the ili_last_fields field. This is to 175 + * coordinate with the xfs_iflush() and xfs_iflush_done() routines in 176 + * the eventual clearing of the ili_fields bits. 
See the big comment in 177 + * xfs_iflush() for an explanation of this coordination mechanism. 142 178 */ 143 - flags |= ip->i_itemp->ili_last_fields; 144 - ip->i_itemp->ili_fields |= flags; 179 + iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); 180 + spin_unlock(&iip->ili_lock); 145 181 } 146 182 147 183 int
+1 -1
fs/xfs/libxfs/xfs_trans_space.h
··· 57 57 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) 58 58 #define XFS_IALLOC_SPACE_RES(mp) \ 59 59 (M_IGEO(mp)->ialloc_blks + \ 60 - (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ 60 + ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ 61 61 (M_IGEO(mp)->inobt_maxlevels - 1))) 62 62 63 63 /*
+20 -2
fs/xfs/scrub/bmap.c
··· 45 45 */ 46 46 if (S_ISREG(VFS_I(sc->ip)->i_mode) && 47 47 sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { 48 + struct address_space *mapping = VFS_I(sc->ip)->i_mapping; 49 + 48 50 inode_dio_wait(VFS_I(sc->ip)); 49 - error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping); 50 - if (error) 51 + 52 + /* 53 + * Try to flush all incore state to disk before we examine the 54 + * space mappings for the data fork. Leave accumulated errors 55 + * in the mapping for the writer threads to consume. 56 + * 57 + * On ENOSPC or EIO writeback errors, we continue into the 58 + * extent mapping checks because write failures do not 59 + * necessarily imply anything about the correctness of the file 60 + * metadata. The metadata and the file data could be on 61 + * completely separate devices; a media failure might only 62 + * affect a subset of the disk, etc. We can handle delalloc 63 + * extents in the scrubber, so leaving them in memory is fine. 64 + */ 65 + error = filemap_fdatawrite(mapping); 66 + if (!error) 67 + error = filemap_fdatawait_keep_errors(mapping); 68 + if (error && (error != -ENOSPC && error != -EIO)) 51 69 goto out; 52 70 } 53 71
+1 -3
fs/xfs/scrub/dabtree.c
··· 476 476 ds.dargs.whichfork = whichfork; 477 477 ds.dargs.trans = sc->tp; 478 478 ds.dargs.op_flags = XFS_DA_OP_OKNOENT; 479 - ds.state = xfs_da_state_alloc(); 480 - ds.state->args = &ds.dargs; 481 - ds.state->mp = mp; 479 + ds.state = xfs_da_state_alloc(&ds.dargs); 482 480 ds.sc = sc; 483 481 ds.private = private; 484 482 if (whichfork == XFS_ATTR_FORK) {
+33 -54
fs/xfs/scrub/quota.c
··· 18 18 #include "scrub/common.h" 19 19 20 20 /* Convert a scrub type code to a DQ flag, or return 0 if error. */ 21 - static inline uint 21 + static inline xfs_dqtype_t 22 22 xchk_quota_to_dqtype( 23 23 struct xfs_scrub *sc) 24 24 { 25 25 switch (sc->sm->sm_type) { 26 26 case XFS_SCRUB_TYPE_UQUOTA: 27 - return XFS_DQ_USER; 27 + return XFS_DQTYPE_USER; 28 28 case XFS_SCRUB_TYPE_GQUOTA: 29 - return XFS_DQ_GROUP; 29 + return XFS_DQTYPE_GROUP; 30 30 case XFS_SCRUB_TYPE_PQUOTA: 31 - return XFS_DQ_PROJ; 31 + return XFS_DQTYPE_PROJ; 32 32 default: 33 33 return 0; 34 34 } ··· 40 40 struct xfs_scrub *sc, 41 41 struct xfs_inode *ip) 42 42 { 43 - uint dqtype; 43 + xfs_dqtype_t dqtype; 44 44 int error; 45 45 46 46 if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) ··· 73 73 STATIC int 74 74 xchk_quota_item( 75 75 struct xfs_dquot *dq, 76 - uint dqtype, 76 + xfs_dqtype_t dqtype, 77 77 void *priv) 78 78 { 79 79 struct xchk_quota_info *sqi = priv; 80 80 struct xfs_scrub *sc = sqi->sc; 81 81 struct xfs_mount *mp = sc->mp; 82 - struct xfs_disk_dquot *d = &dq->q_core; 83 82 struct xfs_quotainfo *qi = mp->m_quotainfo; 84 83 xfs_fileoff_t offset; 85 - unsigned long long bsoft; 86 - unsigned long long isoft; 87 - unsigned long long rsoft; 88 - unsigned long long bhard; 89 - unsigned long long ihard; 90 - unsigned long long rhard; 91 - unsigned long long bcount; 92 - unsigned long long icount; 93 - unsigned long long rcount; 94 84 xfs_ino_t fs_icount; 95 - xfs_dqid_t id = be32_to_cpu(d->d_id); 96 85 int error = 0; 97 86 98 87 if (xchk_should_terminate(sc, &error)) ··· 91 102 * Except for the root dquot, the actual dquot we got must either have 92 103 * the same or higher id as we saw before. 
93 104 */ 94 - offset = id / qi->qi_dqperchunk; 95 - if (id && id <= sqi->last_id) 105 + offset = dq->q_id / qi->qi_dqperchunk; 106 + if (dq->q_id && dq->q_id <= sqi->last_id) 96 107 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 97 108 98 - sqi->last_id = id; 99 - 100 - /* Did we get the dquot type we wanted? */ 101 - if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) 102 - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 103 - 104 - if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0)) 105 - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 106 - 107 - /* Check the limits. */ 108 - bhard = be64_to_cpu(d->d_blk_hardlimit); 109 - ihard = be64_to_cpu(d->d_ino_hardlimit); 110 - rhard = be64_to_cpu(d->d_rtb_hardlimit); 111 - 112 - bsoft = be64_to_cpu(d->d_blk_softlimit); 113 - isoft = be64_to_cpu(d->d_ino_softlimit); 114 - rsoft = be64_to_cpu(d->d_rtb_softlimit); 109 + sqi->last_id = dq->q_id; 115 110 116 111 /* 117 112 * Warn if the hard limits are larger than the fs. ··· 105 132 * Complain about corruption if the soft limit is greater than 106 133 * the hard limit. 
107 134 */ 108 - if (bhard > mp->m_sb.sb_dblocks) 135 + if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks) 109 136 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 110 - if (bsoft > bhard) 137 + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) 111 138 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 112 139 113 - if (ihard > M_IGEO(mp)->maxicount) 140 + if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount) 114 141 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 115 - if (isoft > ihard) 142 + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) 116 143 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 117 144 118 - if (rhard > mp->m_sb.sb_rblocks) 145 + if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks) 119 146 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 120 - if (rsoft > rhard) 147 + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) 121 148 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 122 149 123 150 /* Check the resource counts. */ 124 - bcount = be64_to_cpu(d->d_bcount); 125 - icount = be64_to_cpu(d->d_icount); 126 - rcount = be64_to_cpu(d->d_rtbcount); 127 151 fs_icount = percpu_counter_sum(&mp->m_icount); 128 152 129 153 /* ··· 129 159 * if there are no quota limits. 130 160 */ 131 161 if (xfs_sb_version_hasreflink(&mp->m_sb)) { 132 - if (mp->m_sb.sb_dblocks < bcount) 162 + if (mp->m_sb.sb_dblocks < dq->q_blk.count) 133 163 xchk_fblock_set_warning(sc, XFS_DATA_FORK, 134 164 offset); 135 165 } else { 136 - if (mp->m_sb.sb_dblocks < bcount) 166 + if (mp->m_sb.sb_dblocks < dq->q_blk.count) 137 167 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 138 168 offset); 139 169 } 140 - if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks) 170 + if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) 141 171 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 142 172 143 173 /* ··· 145 175 * lower limit than the actual usage. However, we flag it for 146 176 * admin review. 
147 177 */ 148 - if (id != 0 && bhard != 0 && bcount > bhard) 149 - xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 150 - if (id != 0 && ihard != 0 && icount > ihard) 151 - xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 152 - if (id != 0 && rhard != 0 && rcount > rhard) 178 + if (dq->q_id == 0) 179 + goto out; 180 + 181 + if (dq->q_blk.hardlimit != 0 && 182 + dq->q_blk.count > dq->q_blk.hardlimit) 153 183 xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 154 184 185 + if (dq->q_ino.hardlimit != 0 && 186 + dq->q_ino.count > dq->q_ino.hardlimit) 187 + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 188 + 189 + if (dq->q_rtb.hardlimit != 0 && 190 + dq->q_rtb.count > dq->q_rtb.hardlimit) 191 + xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); 192 + 193 + out: 155 194 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 156 195 return -EFSCORRUPTED; 157 196 ··· 214 235 struct xchk_quota_info sqi; 215 236 struct xfs_mount *mp = sc->mp; 216 237 struct xfs_quotainfo *qi = mp->m_quotainfo; 217 - uint dqtype; 238 + xfs_dqtype_t dqtype; 218 239 int error = 0; 219 240 220 241 dqtype = xchk_quota_to_dqtype(sc);
fs/xfs/scrub/repair.c (+5 -5)
···
 void
 xrep_force_quotacheck(
 	struct xfs_scrub	*sc,
-	uint			dqtype)
+	xfs_dqtype_t		type)
 {
 	uint			flag;

-	flag = xfs_quota_chkd_flag(dqtype);
+	flag = xfs_quota_chkd_flag(type);
 	if (!(flag & sc->mp->m_qflags))
 		return;
···
 "inode %llu repair encountered quota error %d, quotacheck forced.",
 				(unsigned long long)sc->ip->i_ino, error);
 		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
-			xrep_force_quotacheck(sc, XFS_DQ_USER);
+			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
 		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
-			xrep_force_quotacheck(sc, XFS_DQ_GROUP);
+			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
 		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
-			xrep_force_quotacheck(sc, XFS_DQ_PROJ);
+			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
 		/* fall through */
 	case -ESRCH:
 		error = 0;
fs/xfs/scrub/repair.h (+3 -1)
···
 #ifndef __XFS_SCRUB_REPAIR_H__
 #define __XFS_SCRUB_REPAIR_H__

+#include "xfs_quota_defs.h"
+
 static inline int xrep_notsupported(struct xfs_scrub *sc)
 {
 	return -EOPNOTSUPP;
···
 int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
 		struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
-void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype);
+void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
 int xrep_ino_dqattach(struct xfs_scrub *sc);

 /* Metadata repairers */
fs/xfs/scrub/rtbitmap.c (+47)
···
 #include "xfs_trans.h"
 #include "xfs_rtalloc.h"
 #include "xfs_inode.h"
+#include "xfs_bmap.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
···
 	return 0;
 }

+/* Make sure the entire rtbitmap file is mapped with written extents. */
+STATIC int
+xchk_rtbitmap_check_extents(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_bmbt_irec	map;
+	xfs_rtblock_t		off;
+	int			nmap;
+	int			error = 0;
+
+	for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+		if (xchk_should_terminate(sc, &error) ||
+		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+			break;
+
+		/* Make sure we have a written extent. */
+		nmap = 1;
+		error = xfs_bmapi_read(mp->m_rbmip, off,
+				mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+				XFS_DATA_FORK);
+		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+			break;
+
+		if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+			break;
+		}
+
+		off += map.br_blockcount;
+	}
+
+	return error;
+}
+
 /* Scrub the realtime bitmap. */
 int
 xchk_rtbitmap(
···
 {
 	int			error;

+	/* Is the size of the rtbitmap correct? */
+	if (sc->mp->m_rbmip->i_d.di_size !=
+	    XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
+		xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+		return 0;
+	}
+
 	/* Invoke the fork scrubber. */
 	error = xchk_metadata_inode_forks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	error = xchk_rtbitmap_check_extents(sc);
 	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 		return error;
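The new extent walk above advances through the rtbitmap fork one mapping at a time and fails if any extent is missing or unwritten. The core of that loop can be modeled as a pure function over an array of mappings; the struct and names here are illustrative stand-ins, not the kernel's (the real code uses `xfs_bmapi_read()` and `struct xfs_bmbt_irec`):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy stand-in for one block mapping: `len` blocks starting at `off`. */
struct mapping {
	uint64_t	off;
	uint64_t	len;
	bool		written;
};

/*
 * Walk mappings the way xchk_rtbitmap_check_extents() walks the fork:
 * advance by each extent's block count; fail on a hole or an unwritten
 * extent before `nblocks` is covered.
 */
static bool range_fully_written(const struct mapping *maps, int nmaps,
				uint64_t nblocks)
{
	uint64_t off = 0;
	int i = 0;

	while (off < nblocks) {
		if (i >= nmaps || maps[i].off != off || !maps[i].written)
			return false;
		off += maps[i].len;
		i++;
	}
	return true;
}
```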
fs/xfs/xfs_bmap_item.c (+2 -2)
···
 {
 	struct xfs_bui_log_item		*buip;

-	buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+	buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);

 	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
 	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
···
 {
 	struct xfs_bud_log_item		*budp;

-	budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+	budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
 			&xfs_bud_item_ops);
 	budp->bud_buip = buip;
fs/xfs/xfs_bmap_util.c (+9 -9)
···
 	int			lock_flags;
 	uint64_t		f;
 	int			resblks = 0;
+	unsigned int		flags = 0;

 	/*
 	 * Lock the inodes against other IO, page faults and truncate to
···
 		resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);

 		/*
-		 * Handle the corner case where either inode might straddle the
-		 * btree format boundary. If so, the inode could bounce between
-		 * btree <-> extent format on unmap -> remap cycles, freeing and
-		 * allocating a bmapbt block each time.
+		 * If either inode straddles a bmapbt block allocation boundary,
+		 * the rmapbt algorithm triggers repeated allocs and frees as
+		 * extents are remapped. This can exhaust the block reservation
+		 * prematurely and cause shutdown. Return freed blocks to the
+		 * transaction reservation to counter this behavior.
 		 */
-		if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
-			resblks += XFS_IFORK_MAXEXT(ip, w);
-		if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
-			resblks += XFS_IFORK_MAXEXT(tip, w);
+		flags |= XFS_TRANS_RES_FDBLKS;
 	}
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
+			&tp);
 	if (error)
 		goto out_unlock;
fs/xfs/xfs_buf.c (+30 -14)
···
 #include "xfs_mount.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_log_recover.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
···
 	int			i;

 	*bpp = NULL;
-	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
-	if (unlikely(!bp))
-		return -ENOMEM;
+	bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);

 	/*
 	 * We don't want certain flags to appear in b_flags unless they are
···
 	 */
 	if (bp->b_flags & XBF_STALE) {
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		ASSERT(bp->b_iodone == NULL);
 		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
 		bp->b_ops = NULL;
 	}
···
 	if (!bp->b_error && bp->b_io_error)
 		xfs_buf_ioerror(bp, bp->b_io_error);

-	/* Only validate buffers that were read without errors */
-	if (read && !bp->b_error && bp->b_ops) {
-		ASSERT(!bp->b_iodone);
-		bp->b_ops->verify_read(bp);
+	if (read) {
+		if (!bp->b_error && bp->b_ops)
+			bp->b_ops->verify_read(bp);
+		if (!bp->b_error)
+			bp->b_flags |= XBF_DONE;
+		xfs_buf_ioend_finish(bp);
+		return;
 	}

 	if (!bp->b_error) {
···
 		bp->b_flags |= XBF_DONE;
 	}

-	if (bp->b_iodone)
-		(*(bp->b_iodone))(bp);
-	else if (bp->b_flags & XBF_ASYNC)
-		xfs_buf_relse(bp);
-	else
-		complete(&bp->b_iowait);
+	/*
+	 * If this is a log recovery buffer, we aren't doing transactional IO
+	 * yet so we need to let it handle IO completions.
+	 */
+	if (bp->b_flags & _XBF_LOGRECOVERY) {
+		xlog_recover_iodone(bp);
+		return;
+	}
+
+	if (bp->b_flags & _XBF_INODES) {
+		xfs_buf_inode_iodone(bp);
+		return;
+	}
+
+	if (bp->b_flags & _XBF_DQUOTS) {
+		xfs_buf_dquot_iodone(bp);
+		return;
+	}
+	xfs_buf_iodone(bp);
 }

 static void
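The hunk above replaces the per-buffer `b_iodone` function pointer with completion dispatch keyed on buffer type flags, checked in a fixed priority order in `xfs_buf_ioend()`. The dispatch shape can be sketched in isolation; the flag values and the idea of returning the handler name are illustrative here, not the kernel's:

```c
#include <assert.h>
#include <string.h>

/* Illustrative stand-ins for the _XBF_* type flags; the real values live
 * in fs/xfs/xfs_buf.h and differ from these. */
#define BUF_INODES      (1u << 0)
#define BUF_DQUOTS      (1u << 1)
#define BUF_LOGRECOVERY (1u << 2)

/*
 * Model of the patched xfs_buf_ioend() dispatch: log recovery buffers are
 * handled first (no transactional IO yet), then inode and dquot buffers,
 * then the generic dirty-buffer path.
 */
static const char *buf_ioend_dispatch(unsigned int flags)
{
	if (flags & BUF_LOGRECOVERY)
		return "xlog_recover_iodone";
	if (flags & BUF_INODES)
		return "xfs_buf_inode_iodone";
	if (flags & BUF_DQUOTS)
		return "xfs_buf_dquot_iodone";
	return "xfs_buf_iodone";
}
```

Replacing the indirect call with flag tests keeps the completion path decision out of `struct xfs_buf`, which is what lets the patch delete the `b_iodone` field entirely.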
fs/xfs/xfs_buf.h (+30 -18)
···
 /*
  * Base types
  */
+struct xfs_buf;

 #define XFS_BUF_DADDR_NULL	((xfs_daddr_t) (-1LL))
···
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
 #define XBF_WRITE_FAIL	 (1 << 7) /* async writes have failed on this buffer */

-/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK	 (1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED	 (1 << 17)/* do not map the buffer */
+/* buffer type flags for write callbacks */
+#define _XBF_INODES	 (1 << 16)/* inode buffer */
+#define _XBF_DQUOTS	 (1 << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */

 /* flags used only internally */
 #define _XBF_PAGES	 (1 << 20)/* backed by refcounted pages */
 #define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
+
+/* flags used only as arguments to access routines */
+#define XBF_TRYLOCK	 (1 << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED	 (1 << 31)/* do not map the buffer */

 typedef unsigned int xfs_buf_flags_t;
···
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
 	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
-	{ XBF_TRYLOCK,		"TRYLOCK" },	/* should never be set */\
-	{ XBF_UNMAPPED,		"UNMAPPED" },	/* ditto */\
+	{ _XBF_INODES,		"INODES" }, \
+	{ _XBF_DQUOTS,		"DQUOTS" }, \
+	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
-	{ _XBF_DELWRI_Q,	"DELWRI_Q" }
-
+	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
+	/* The following interface flags should never be set */ \
+	{ XBF_TRYLOCK,		"TRYLOCK" }, \
+	{ XBF_UNMAPPED,		"UNMAPPED" }

 /*
  * Internal state flags.
···
 	struct percpu_counter	bt_io_count;
 	struct ratelimit_state	bt_ioerror_rl;
 } xfs_buftarg_t;
-
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-

 #define XB_PAGES	2
···
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_ioend_work;
-	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
···
 #define xfs_buf_islocked(bp) \
 	((bp)->b_sema.count <= 0)

+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+	xfs_buf_unlock(bp);
+	xfs_buf_rele(bp);
+}
+
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
 extern void xfs_buf_ioend(struct xfs_buf *bp);
+static inline void xfs_buf_ioend_finish(struct xfs_buf *bp)
+{
+	if (bp->b_flags & XBF_ASYNC)
+		xfs_buf_relse(bp);
+	else
+		complete(&bp->b_iowait);
+}
+
 extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
 		xfs_failaddr_t failaddr);
 #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
···
 static inline int xfs_buf_ispinned(struct xfs_buf *bp)
 {
 	return atomic_read(&bp->b_pin_count);
-}
-
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
-	xfs_buf_unlock(bp);
-	xfs_buf_rele(bp);
 }

 static inline int
fs/xfs/xfs_buf_item.c (+218 -216)
···
 #include "xfs_bit.h"
 #include "xfs_mount.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
···
 	return container_of(lip, struct xfs_buf_log_item, bli_item);
 }

-STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);
+static void xfs_buf_item_done(struct xfs_buf *bp);

 /* Is this log iovec plausibly large enough to contain the buffer log format? */
 bool
···
 	 * the AIL lock.
 	 */
 	if (bip->bli_flags & XFS_BLI_STALE_INODE) {
-		xfs_buf_do_callbacks(bp);
-		bp->b_log_item = NULL;
-		list_del_init(&bp->b_li_list);
-		bp->b_iodone = NULL;
+		xfs_buf_item_done(bp);
+		xfs_iflush_done(bp);
+		ASSERT(list_empty(&bp->b_li_list));
 	} else {
 		xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
 		xfs_buf_item_relse(bp);
···
 		return 0;
 	}

-	bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
+	bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 	bip->bli_buf = bp;
···
 }

 /*
- * This is called when the buf log item is no longer needed.  It should
- * free the buf log item associated with the given buffer and clear
- * the buffer's pointer to the buf log item.  If there are no more
- * items in the list, clear the b_iodone field of the buffer (see
- * xfs_buf_attach_iodone() below).
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
  */
 void
 xfs_buf_item_relse(
···
 	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

 	bp->b_log_item = NULL;
-	if (list_empty(&bp->b_li_list))
-		bp->b_iodone = NULL;
-
 	xfs_buf_rele(bp);
 	xfs_buf_item_free(bip);
 }

-
 /*
- * Add the given log item with its callback to the list of callbacks
- * to be called when the buffer's I/O completes.  If it is not set
- * already, set the buffer's b_iodone() routine to be
- * xfs_buf_iodone_callbacks() and link the log item into the list of
- * items rooted at b_li_list.
+ * Decide if we're going to retry the write after a failure, and prepare
+ * the buffer for retrying the write.
  */
-void
-xfs_buf_attach_iodone(
-	struct xfs_buf		*bp,
-	void			(*cb)(struct xfs_buf *, struct xfs_log_item *),
-	struct xfs_log_item	*lip)
-{
-	ASSERT(xfs_buf_islocked(bp));
-
-	lip->li_cb = cb;
-	list_add_tail(&lip->li_bio_list, &bp->b_li_list);
-
-	ASSERT(bp->b_iodone == NULL ||
-	       bp->b_iodone == xfs_buf_iodone_callbacks);
-	bp->b_iodone = xfs_buf_iodone_callbacks;
-}
-
-/*
- * We can have many callbacks on a buffer. Running the callbacks individually
- * can cause a lot of contention on the AIL lock, so we allow for a single
- * callback to be able to scan the remaining items in bp->b_li_list for other
- * items of the same type and callback to be processed in the first call.
- *
- * As a result, the loop walking the callback list below will also modify the
- * list. it removes the first item from the list and then runs the callback.
- * The loop then restarts from the new first item int the list. This allows the
- * callback to scan and modify the list attached to the buffer and we don't
- * have to care about maintaining a next item pointer.
- */
-STATIC void
-xfs_buf_do_callbacks(
-	struct xfs_buf		*bp)
-{
-	struct xfs_buf_log_item *blip = bp->b_log_item;
-	struct xfs_log_item	*lip;
-
-	/* If there is a buf_log_item attached, run its callback */
-	if (blip) {
-		lip = &blip->bli_item;
-		lip->li_cb(bp, lip);
-	}
-
-	while (!list_empty(&bp->b_li_list)) {
-		lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
-				       li_bio_list);
-
-		/*
-		 * Remove the item from the list, so we don't have any
-		 * confusion if the item is added to another buf.
-		 * Don't touch the log item after calling its
-		 * callback, because it could have freed itself.
-		 */
-		list_del_init(&lip->li_bio_list);
-		lip->li_cb(bp, lip);
-	}
-}
-
-/*
- * Invoke the error state callback for each log item affected by the failed I/O.
- *
- * If a metadata buffer write fails with a non-permanent error, the buffer is
- * eventually resubmitted and so the completion callbacks are not run. The error
- * state may need to be propagated to the log items attached to the buffer,
- * however, so the next AIL push of the item knows hot to handle it correctly.
- */
-STATIC void
-xfs_buf_do_callbacks_fail(
-	struct xfs_buf		*bp)
-{
-	struct xfs_log_item	*lip;
-	struct xfs_ail		*ailp;
-
-	/*
-	 * Buffer log item errors are handled directly by xfs_buf_item_push()
-	 * and xfs_buf_iodone_callback_error, and they have no IO error
-	 * callbacks. Check only for items in b_li_list.
-	 */
-	if (list_empty(&bp->b_li_list))
-		return;
-
-	lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
-			li_bio_list);
-	ailp = lip->li_ailp;
-	spin_lock(&ailp->ail_lock);
-	list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-		if (lip->li_ops->iop_error)
-			lip->li_ops->iop_error(lip, bp);
-	}
-	spin_unlock(&ailp->ail_lock);
-}
-
 static bool
-xfs_buf_iodone_callback_error(
+xfs_buf_ioerror_fail_without_retry(
 	struct xfs_buf		*bp)
 {
-	struct xfs_buf_log_item	*bip = bp->b_log_item;
-	struct xfs_log_item	*lip;
-	struct xfs_mount	*mp;
+	struct xfs_mount	*mp = bp->b_mount;
 	static ulong		lasttime;
 	static xfs_buftarg_t	*lasttarg;
-	struct xfs_error_cfg	*cfg;
-
-	/*
-	 * The failed buffer might not have a buf_log_item attached or the
-	 * log_item list might be empty. Get the mp from the available
-	 * xfs_log_item
-	 */
-	lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item,
-				       li_bio_list);
-	mp = lip ? lip->li_mountp : bip->bli_item.li_mountp;

 	/*
 	 * If we've already decided to shutdown the filesystem because of
 	 * I/O errors, there's no point in giving this a retry.
 	 */
 	if (XFS_FORCED_SHUTDOWN(mp))
-		goto out_stale;
+		return true;

 	if (bp->b_target != lasttarg ||
 	    time_after(jiffies, (lasttime + 5*HZ))) {
···
 	/* synchronous writes will have callers process the error */
 	if (!(bp->b_flags & XBF_ASYNC))
-		goto out_stale;
-
-	trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-	ASSERT(bp->b_iodone != NULL);
-
-	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
-
-	/*
-	 * If the write was asynchronous then no one will be looking for the
-	 * error.  If this is the first failure of this type, clear the error
-	 * state and write the buffer out again. This means we always retry an
-	 * async write failure at least once, but we also need to set the buffer
-	 * up to behave correctly now for repeated failures.
-	 */
-	if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
-	     bp->b_last_error != bp->b_error) {
-		bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
-		bp->b_last_error = bp->b_error;
-		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
-		    !bp->b_first_retry_time)
-			bp->b_first_retry_time = jiffies;
-
-		xfs_buf_ioerror(bp, 0);
-		xfs_buf_submit(bp);
 		return true;
-	}
+	return false;
+}

-	/*
-	 * Repeated failure on an async write. Take action according to the
-	 * error configuration we have been set up to use.
-	 */
+static bool
+xfs_buf_ioerror_retry(
+	struct xfs_buf		*bp,
+	struct xfs_error_cfg	*cfg)
+{
+	if ((bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) &&
+	    bp->b_last_error == bp->b_error)
+		return false;
+
+	bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
+	bp->b_last_error = bp->b_error;
+	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+	    !bp->b_first_retry_time)
+		bp->b_first_retry_time = jiffies;
+	return true;
+}
+
+/*
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
+ */
+static bool
+xfs_buf_ioerror_permanent(
+	struct xfs_buf		*bp,
+	struct xfs_error_cfg	*cfg)
+{
+	struct xfs_mount	*mp = bp->b_mount;

 	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
 	    ++bp->b_retries > cfg->max_retries)
-		goto permanent_error;
+		return true;
 	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
 	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
-		goto permanent_error;
+		return true;

 	/* At unmount we may treat errors differently */
 	if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
-		goto permanent_error;
+		return true;

-	/*
-	 * Still a transient error, run IO completion failure callbacks and let
-	 * the higher layers retry the buffer.
-	 */
-	xfs_buf_do_callbacks_fail(bp);
-	xfs_buf_ioerror(bp, 0);
-	xfs_buf_relse(bp);
-	return true;
+	return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Multi-state return value:
+ *
+ * XBF_IOERROR_FINISH: clear IO error retry state and run callback completions
+ * XBF_IOERROR_DONE: resubmitted immediately, do not run any completions
+ * XBF_IOERROR_FAIL: transient error, run failure callback completions and then
+ * release the buffer
+ */
+enum {
+	XBF_IOERROR_FINISH,
+	XBF_IOERROR_DONE,
+	XBF_IOERROR_FAIL,
+};
+
+static int
+xfs_buf_iodone_error(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_error_cfg	*cfg;
+
+	if (xfs_buf_ioerror_fail_without_retry(bp))
+		goto out_stale;
+
+	trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
+	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+	if (xfs_buf_ioerror_retry(bp, cfg)) {
+		xfs_buf_ioerror(bp, 0);
+		xfs_buf_submit(bp);
+		return XBF_IOERROR_DONE;
+	}

 	/*
 	 * Permanent error - we need to trigger a shutdown if we haven't already
 	 * to indicate that inconsistency will result from this action.
 	 */
-permanent_error:
-	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+	if (xfs_buf_ioerror_permanent(bp, cfg)) {
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		goto out_stale;
+	}
+
+	/* Still considered a transient error. Caller will schedule retries. */
+	return XBF_IOERROR_FAIL;
+
 out_stale:
 	xfs_buf_stale(bp);
 	bp->b_flags |= XBF_DONE;
 	trace_xfs_buf_error_relse(bp, _RET_IP_);
-	return false;
+	return XBF_IOERROR_FINISH;
 }

-/*
- * This is the iodone() function for buffers which have had callbacks attached
- * to them by xfs_buf_attach_iodone().  We need to iterate the items on the
- * callback list, mark the buffer as having no more callbacks and then push the
- * buffer through IO completion processing.
- */
-void
-xfs_buf_iodone_callbacks(
+static void
+xfs_buf_item_done(
 	struct xfs_buf		*bp)
 {
-	/*
-	 * If there is an error, process it. Some errors require us
-	 * to run callbacks after failure processing is done so we
-	 * detect that and take appropriate action.
-	 */
-	if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+	struct xfs_buf_log_item	*bip = bp->b_log_item;
+
+	if (!bip)
 		return;
-
-	/*
-	 * Successful IO or permanent error. Either way, we can clear the
-	 * retry state here in preparation for the next error that may occur.
-	 */
-	bp->b_last_error = 0;
-	bp->b_retries = 0;
-	bp->b_first_retry_time = 0;
-
-	xfs_buf_do_callbacks(bp);
-	bp->b_log_item = NULL;
-	list_del_init(&bp->b_li_list);
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp);
-}
-
-/*
- * This is the iodone() function for buffers which have been
- * logged.  It is called when they are eventually flushed out.
- * It should remove the buf item from the AIL, and free the buf item.
- * It is called by xfs_buf_iodone_callbacks() above which will take
- * care of cleaning up the buffer itself.
- */
-void
-xfs_buf_iodone(
-	struct xfs_buf		*bp,
-	struct xfs_log_item	*lip)
-{
-	ASSERT(BUF_ITEM(lip)->bli_buf == bp);
-
-	xfs_buf_rele(bp);

 	/*
 	 * If we are forcibly shutting down, this may well be off the AIL
···
 	 *
 	 * Either way, AIL is useless if we're forcing a shutdown.
 	 */
-	xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE);
-	xfs_buf_item_free(BUF_ITEM(lip));
+	xfs_trans_ail_delete(&bip->bli_item, SHUTDOWN_CORRUPT_INCORE);
+	bp->b_log_item = NULL;
+	xfs_buf_item_free(bip);
+	xfs_buf_rele(bp);
+}
+
+static inline void
+xfs_buf_clear_ioerror_retry_state(
+	struct xfs_buf		*bp)
+{
+	bp->b_last_error = 0;
+	bp->b_retries = 0;
+	bp->b_first_retry_time = 0;
+}
+
+/*
+ * Inode buffer iodone callback function.
+ */
+void
+xfs_buf_inode_iodone(
+	struct xfs_buf		*bp)
+{
+	if (bp->b_error) {
+		struct xfs_log_item *lip;
+		int ret = xfs_buf_iodone_error(bp);
+
+		if (ret == XBF_IOERROR_FINISH)
+			goto finish_iodone;
+		if (ret == XBF_IOERROR_DONE)
+			return;
+		ASSERT(ret == XBF_IOERROR_FAIL);
+		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+			set_bit(XFS_LI_FAILED, &lip->li_flags);
+		}
+		xfs_buf_ioerror(bp, 0);
+		xfs_buf_relse(bp);
+		return;
+	}
+
+finish_iodone:
+	xfs_buf_clear_ioerror_retry_state(bp);
+	xfs_buf_item_done(bp);
+	xfs_iflush_done(bp);
+	xfs_buf_ioend_finish(bp);
+}
+
+/*
+ * Dquot buffer iodone callback function.
+ */
+void
+xfs_buf_dquot_iodone(
+	struct xfs_buf		*bp)
+{
+	if (bp->b_error) {
+		struct xfs_log_item *lip;
+		int ret = xfs_buf_iodone_error(bp);
+
+		if (ret == XBF_IOERROR_FINISH)
+			goto finish_iodone;
+		if (ret == XBF_IOERROR_DONE)
+			return;
+		ASSERT(ret == XBF_IOERROR_FAIL);
+		spin_lock(&bp->b_mount->m_ail->ail_lock);
+		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+			xfs_set_li_failed(lip, bp);
+		}
+		spin_unlock(&bp->b_mount->m_ail->ail_lock);
+		xfs_buf_ioerror(bp, 0);
+		xfs_buf_relse(bp);
+		return;
+	}
+
+finish_iodone:
+	xfs_buf_clear_ioerror_retry_state(bp);
+	/* a newly allocated dquot buffer might have a log item attached */
+	xfs_buf_item_done(bp);
+	xfs_dquot_done(bp);
+	xfs_buf_ioend_finish(bp);
+}
+
+/*
+ * Dirty buffer iodone callback function.
+ *
+ * Note that for things like remote attribute buffers, there may not be a buffer
+ * log item here, so processing the buffer log item must remain optional.
+ */
+void
+xfs_buf_iodone(
+	struct xfs_buf		*bp)
+{
+	if (bp->b_error) {
+		int ret = xfs_buf_iodone_error(bp);
+
+		if (ret == XBF_IOERROR_FINISH)
+			goto finish_iodone;
+		if (ret == XBF_IOERROR_DONE)
+			return;
+		ASSERT(ret == XBF_IOERROR_FAIL);
+		ASSERT(list_empty(&bp->b_li_list));
+		xfs_buf_ioerror(bp, 0);
+		xfs_buf_relse(bp);
+		return;
+	}
+
+finish_iodone:
+	xfs_buf_clear_ioerror_retry_state(bp);
+	xfs_buf_item_done(bp);
+	xfs_buf_ioend_finish(bp);
 }
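The refactor above splits the old monolithic error callback into small predicates; the one that decides when a repeatedly failing async write becomes a permanent failure (`xfs_buf_ioerror_permanent()`) is essentially a pure function of the error configuration and retry state. A standalone model of that decision, using simplified stand-ins for `struct xfs_error_cfg` and `XFS_ERR_RETRY_FOREVER` (the names and units here are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stdbool.h>

#define RETRY_FOREVER (-1)

struct err_cfg {
	int	max_retries;	/* RETRY_FOREVER or a retry cap */
	long	retry_timeout;	/* RETRY_FOREVER or a window length */
};

/*
 * Models the shape of xfs_buf_ioerror_permanent(): a repeated async write
 * failure becomes permanent once either the retry count or the retry time
 * window is exhausted. Each call counts as one trip around the handler.
 */
static bool ioerror_permanent(const struct err_cfg *cfg, int *retries,
			      long now, long first_retry_time)
{
	if (cfg->max_retries != RETRY_FOREVER && ++(*retries) > cfg->max_retries)
		return true;
	if (cfg->retry_timeout != RETRY_FOREVER &&
	    now > first_retry_time + cfg->retry_timeout)
		return true;
	return false;
}
```

The three-way `XBF_IOERROR_*` return value then lets each typed iodone handler (inode, dquot, dirty buffer) decide separately how to propagate a transient failure to its attached log items.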
fs/xfs/xfs_buf_item.h (+3 -5)
···
 bool	xfs_buf_item_put(struct xfs_buf_log_item *);
 void	xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
 bool	xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
-void	xfs_buf_attach_iodone(struct xfs_buf *,
-			      void(*)(struct xfs_buf *, struct xfs_log_item *),
-			      struct xfs_log_item *);
-void	xfs_buf_iodone_callbacks(struct xfs_buf *);
-void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+void	xfs_buf_inode_iodone(struct xfs_buf *);
+void	xfs_buf_dquot_iodone(struct xfs_buf *);
+void	xfs_buf_iodone(struct xfs_buf *);
 bool	xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);

 extern kmem_zone_t	*xfs_buf_item_zone;
fs/xfs/xfs_buf_item_recover.c (+6 -8)
···
 	if (bp->b_ops) {
 		struct xfs_buf_log_item *bip;

-		ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
-		bp->b_iodone = xlog_recover_iodone;
+		bp->b_flags |= _XBF_LOGRECOVERY;
 		xfs_buf_item_init(bp, mp);
 		bip = bp->b_log_item;
 		bip->bli_item.li_lsn = current_lsn;
···
 				item->ri_buf[i].i_len, __func__);
 			goto next;
 		}
-		fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
-				       -1, 0);
+		fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
 		if (fa) {
 			xfs_alert(mp,
 	"dquot corrupt at %pS trying to replay into block 0x%llx",
···
 	type = 0;
 	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
-		type |= XFS_DQ_USER;
+		type |= XFS_DQTYPE_USER;
 	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
-		type |= XFS_DQ_PROJ;
+		type |= XFS_DQTYPE_PROJ;
 	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
-		type |= XFS_DQ_GROUP;
+		type |= XFS_DQTYPE_GROUP;
 	/*
 	 * This type of quotas was turned off, so ignore this buffer
 	 */
···
 		error = xfs_bwrite(bp);
 	} else {
 		ASSERT(bp->b_mount == mp);
-		bp->b_iodone = xlog_recover_iodone;
+		bp->b_flags |= _XBF_LOGRECOVERY;
 		xfs_buf_delwri_queue(bp, buffer_list);
 	}
fs/xfs/xfs_dquot.c (+231 -184)
···
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_error.h"

 /*
  * Lock order:
···
  */
 void
 xfs_qm_adjust_dqlimits(
-	struct xfs_mount	*mp,
 	struct xfs_dquot	*dq)
 {
+	struct xfs_mount	*mp = dq->q_mount;
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
-	struct xfs_disk_dquot	*d = &dq->q_core;
 	struct xfs_def_quota	*defq;
 	int			prealloc = 0;

-	ASSERT(d->d_id);
+	ASSERT(dq->q_id);
 	defq = xfs_get_defquota(q, xfs_dquot_type(dq));

-	if (defq->bsoftlimit && !d->d_blk_softlimit) {
-		d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
+	if (!dq->q_blk.softlimit) {
+		dq->q_blk.softlimit = defq->blk.soft;
 		prealloc = 1;
 	}
-	if (defq->bhardlimit && !d->d_blk_hardlimit) {
-		d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
+	if (!dq->q_blk.hardlimit) {
+		dq->q_blk.hardlimit = defq->blk.hard;
 		prealloc = 1;
 	}
-	if (defq->isoftlimit && !d->d_ino_softlimit)
-		d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
-	if (defq->ihardlimit && !d->d_ino_hardlimit)
-		d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
-	if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
-		d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
-	if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
-		d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
+	if (!dq->q_ino.softlimit)
+		dq->q_ino.softlimit = defq->ino.soft;
+	if (!dq->q_ino.hardlimit)
+		dq->q_ino.hardlimit = defq->ino.hard;
+	if (!dq->q_rtb.softlimit)
+		dq->q_rtb.softlimit = defq->rtb.soft;
+	if (!dq->q_rtb.hardlimit)
+		dq->q_rtb.hardlimit = defq->rtb.hard;

 	if (prealloc)
 		xfs_dquot_set_prealloc_limits(dq);
+}
+
+/*
+ * Determine if this quota counter is over either limit and set the quota
+ * timers as appropriate.
+ */
+static inline void
+xfs_qm_adjust_res_timer(
+	struct xfs_dquot_res	*res,
+	struct xfs_quota_limits	*qlim)
+{
+	ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit);
+
+	if ((res->softlimit && res->count > res->softlimit) ||
+	    (res->hardlimit && res->count > res->hardlimit)) {
+		if (res->timer == 0)
+			res->timer = ktime_get_real_seconds() + qlim->time;
+	} else {
+		if (res->timer == 0)
+			res->warnings = 0;
+		else
+			res->timer = 0;
+	}
 }

 /*
···
  */
 void
 xfs_qm_adjust_dqtimers(
-	struct xfs_mount	*mp,
 	struct xfs_dquot	*dq)
 {
+	struct xfs_mount	*mp = dq->q_mount;
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct xfs_disk_dquot	*d = &dq->q_core;
 	struct xfs_def_quota	*defq;

-	ASSERT(d->d_id);
+	ASSERT(dq->q_id);
 	defq = xfs_get_defquota(qi, xfs_dquot_type(dq));

-#ifdef DEBUG
-	if (d->d_blk_hardlimit)
-		ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
-		       be64_to_cpu(d->d_blk_hardlimit));
-	if (d->d_ino_hardlimit)
-		ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
-		       be64_to_cpu(d->d_ino_hardlimit));
-	if (d->d_rtb_hardlimit)
-		ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
-		       be64_to_cpu(d->d_rtb_hardlimit));
-#endif
-
-	if (!d->d_btimer) {
-		if ((d->d_blk_softlimit &&
-		     (be64_to_cpu(d->d_bcount) >
-		      be64_to_cpu(d->d_blk_softlimit))) ||
-		    (d->d_blk_hardlimit &&
-		     (be64_to_cpu(d->d_bcount) >
-		      be64_to_cpu(d->d_blk_hardlimit)))) {
-			d->d_btimer = cpu_to_be32(ktime_get_real_seconds() +
-					defq->btimelimit);
-		} else {
-			d->d_bwarns = 0;
-		}
-	} else {
-		if ((!d->d_blk_softlimit ||
-		     (be64_to_cpu(d->d_bcount) <=
-		      be64_to_cpu(d->d_blk_softlimit))) &&
-		    (!d->d_blk_hardlimit ||
-		     (be64_to_cpu(d->d_bcount) <=
-		      be64_to_cpu(d->d_blk_hardlimit)))) {
-			d->d_btimer = 0;
-		}
-	}
-
-	if (!d->d_itimer) {
-		if ((d->d_ino_softlimit &&
-		     (be64_to_cpu(d->d_icount) >
-		      be64_to_cpu(d->d_ino_softlimit))) ||
-		    (d->d_ino_hardlimit &&
-		     (be64_to_cpu(d->d_icount) >
-		      be64_to_cpu(d->d_ino_hardlimit)))) {
-			d->d_itimer = cpu_to_be32(ktime_get_real_seconds() +
-					defq->itimelimit);
-		} else {
-			d->d_iwarns = 0;
-		}
-	} else {
-		if ((!d->d_ino_softlimit ||
-		     (be64_to_cpu(d->d_icount) <=
-		      be64_to_cpu(d->d_ino_softlimit))) &&
-		    (!d->d_ino_hardlimit ||
-		     (be64_to_cpu(d->d_icount) <=
-		      be64_to_cpu(d->d_ino_hardlimit)))) {
-			d->d_itimer = 0;
-		}
-	}
-
-	if (!d->d_rtbtimer) {
-		if ((d->d_rtb_softlimit &&
-		     (be64_to_cpu(d->d_rtbcount) >
-		      be64_to_cpu(d->d_rtb_softlimit))) ||
-		    (d->d_rtb_hardlimit &&
-		     (be64_to_cpu(d->d_rtbcount) >
-		      be64_to_cpu(d->d_rtb_hardlimit)))) {
-			d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() +
-					defq->rtbtimelimit);
-		} else {
-			d->d_rtbwarns = 0;
-		}
-	} else {
-		if ((!d->d_rtb_softlimit ||
-		     (be64_to_cpu(d->d_rtbcount) <=
-		      be64_to_cpu(d->d_rtb_softlimit))) &&
-		    (!d->d_rtb_hardlimit ||
-		     (be64_to_cpu(d->d_rtbcount) <=
-		      be64_to_cpu(d->d_rtb_hardlimit)))) {
-			d->d_rtbtimer = 0;
-		}
-	}
+	xfs_qm_adjust_res_timer(&dq->q_blk, &defq->blk);
+	xfs_qm_adjust_res_timer(&dq->q_ino, &defq->ino);
+	xfs_qm_adjust_res_timer(&dq->q_rtb, &defq->rtb);
 }

 /*
···
 	struct xfs_trans	*tp,
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct xfs_buf		*bp)
 {
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
···
 	ASSERT(tp);
 	ASSERT(xfs_buf_islocked(bp));

+	switch (type) {
+	case XFS_DQTYPE_USER:
+		qflag = XFS_UQUOTA_CHKD;
+		blftype = XFS_BLF_UDQUOT_BUF;
+		break;
+	case XFS_DQTYPE_PROJ:
+ qflag = XFS_PQUOTA_CHKD; 236 + blftype = XFS_BLF_PDQUOT_BUF; 237 + break; 238 + case XFS_DQTYPE_GROUP: 239 + qflag = XFS_GQUOTA_CHKD; 240 + blftype = XFS_BLF_GDQUOT_BUF; 241 + break; 242 + default: 243 + ASSERT(0); 244 + return; 245 + } 246 + 174 247 d = bp->b_addr; 175 248 176 249 /* ··· 200 237 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 201 238 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 202 239 d->dd_diskdq.d_id = cpu_to_be32(curid); 203 - d->dd_diskdq.d_flags = type; 240 + d->dd_diskdq.d_type = type; 204 241 if (xfs_sb_version_hascrc(&mp->m_sb)) { 205 242 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); 206 243 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), 207 244 XFS_DQUOT_CRC_OFF); 208 245 } 209 - } 210 - 211 - if (type & XFS_DQ_USER) { 212 - qflag = XFS_UQUOTA_CHKD; 213 - blftype = XFS_BLF_UDQUOT_BUF; 214 - } else if (type & XFS_DQ_PROJ) { 215 - qflag = XFS_PQUOTA_CHKD; 216 - blftype = XFS_BLF_PDQUOT_BUF; 217 - } else { 218 - qflag = XFS_GQUOTA_CHKD; 219 - blftype = XFS_BLF_GDQUOT_BUF; 220 246 } 221 247 222 248 xfs_trans_dquot_buf(tp, bp, blftype); ··· 242 290 { 243 291 uint64_t space; 244 292 245 - dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); 246 - dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); 293 + dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; 294 + dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; 247 295 if (!dqp->q_prealloc_lo_wmark) { 248 296 dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; 249 297 do_div(dqp->q_prealloc_lo_wmark, 100); ··· 273 321 struct xfs_trans *tp = *tpp; 274 322 struct xfs_mount *mp = tp->t_mountp; 275 323 struct xfs_buf *bp; 276 - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); 324 + xfs_dqtype_t qtype = xfs_dquot_type(dqp); 325 + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); 277 326 int nmaps = 1; 278 327 int error; 279 328 280 329 trace_xfs_dqalloc(dqp); 281 330 282 331 xfs_ilock(quotip, XFS_ILOCK_EXCL); 283 - if 
(!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { 332 + if (!xfs_this_quota_on(dqp->q_mount, qtype)) { 284 333 /* 285 334 * Return if this type of quotas is turned off while we didn't 286 335 * have an inode lock ··· 318 365 * Make a chunk of dquots out of this buffer and log 319 366 * the entire thing. 320 367 */ 321 - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), 322 - dqp->dq_flags & XFS_DQ_ALLTYPES, bp); 368 + xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); 323 369 xfs_buf_set_ref(bp, XFS_DQUOT_REF); 324 370 325 371 /* ··· 365 413 { 366 414 struct xfs_bmbt_irec map; 367 415 struct xfs_buf *bp; 368 - struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); 416 + xfs_dqtype_t qtype = xfs_dquot_type(dqp); 417 + struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); 369 418 uint lock_mode; 370 419 int nmaps = 1; 371 420 int error; 372 421 373 422 lock_mode = xfs_ilock_data_map_shared(quotip); 374 - if (!xfs_this_quota_on(mp, dqp->dq_flags)) { 423 + if (!xfs_this_quota_on(mp, qtype)) { 375 424 /* 376 425 * Return if this type of quotas is turned off while we 377 426 * didn't have the quota inode lock. ··· 424 471 xfs_dquot_alloc( 425 472 struct xfs_mount *mp, 426 473 xfs_dqid_t id, 427 - uint type) 474 + xfs_dqtype_t type) 428 475 { 429 476 struct xfs_dquot *dqp; 430 477 431 - dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0); 478 + dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL); 432 479 433 - dqp->dq_flags = type; 434 - dqp->q_core.d_id = cpu_to_be32(id); 480 + dqp->q_type = type; 481 + dqp->q_id = id; 435 482 dqp->q_mount = mp; 436 483 INIT_LIST_HEAD(&dqp->q_lru); 437 484 mutex_init(&dqp->q_qlock); ··· 456 503 * quotas. 
457 504 */ 458 505 switch (type) { 459 - case XFS_DQ_USER: 506 + case XFS_DQTYPE_USER: 460 507 /* uses the default lock class */ 461 508 break; 462 - case XFS_DQ_GROUP: 509 + case XFS_DQTYPE_GROUP: 463 510 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); 464 511 break; 465 - case XFS_DQ_PROJ: 512 + case XFS_DQTYPE_PROJ: 466 513 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); 467 514 break; 468 515 default: ··· 477 524 } 478 525 479 526 /* Copy the in-core quota fields in from the on-disk buffer. */ 480 - STATIC void 527 + STATIC int 481 528 xfs_dquot_from_disk( 482 529 struct xfs_dquot *dqp, 483 530 struct xfs_buf *bp) 484 531 { 485 532 struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; 486 533 534 + /* 535 + * Ensure that we got the type and ID we were looking for. 536 + * Everything else was checked by the dquot buffer verifier. 537 + */ 538 + if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || 539 + be32_to_cpu(ddqp->d_id) != dqp->q_id) { 540 + xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, 541 + "Metadata corruption detected at %pS, quota %u", 542 + __this_address, dqp->q_id); 543 + xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); 544 + return -EFSCORRUPTED; 545 + } 546 + 487 547 /* copy everything from disk dquot to the incore dquot */ 488 - memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot)); 548 + dqp->q_type = ddqp->d_type; 549 + dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); 550 + dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); 551 + dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); 552 + dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit); 553 + dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); 554 + dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); 555 + 556 + dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount); 557 + dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); 558 + dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); 559 + 560 + 
dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); 561 + dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); 562 + dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); 563 + 564 + dqp->q_blk.timer = be32_to_cpu(ddqp->d_btimer); 565 + dqp->q_ino.timer = be32_to_cpu(ddqp->d_itimer); 566 + dqp->q_rtb.timer = be32_to_cpu(ddqp->d_rtbtimer); 489 567 490 568 /* 491 569 * Reservation counters are defined as reservation plus current usage 492 570 * to avoid having to add every time. 493 571 */ 494 - dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 495 - dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 496 - dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); 572 + dqp->q_blk.reserved = dqp->q_blk.count; 573 + dqp->q_ino.reserved = dqp->q_ino.count; 574 + dqp->q_rtb.reserved = dqp->q_rtb.count; 497 575 498 576 /* initialize the dquot speculative prealloc thresholds */ 499 577 xfs_dquot_set_prealloc_limits(dqp); 578 + return 0; 579 + } 580 + 581 + /* Copy the in-core quota fields into the on-disk buffer. 
*/ 582 + void 583 + xfs_dquot_to_disk( 584 + struct xfs_disk_dquot *ddqp, 585 + struct xfs_dquot *dqp) 586 + { 587 + ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 588 + ddqp->d_version = XFS_DQUOT_VERSION; 589 + ddqp->d_type = dqp->q_type; 590 + ddqp->d_id = cpu_to_be32(dqp->q_id); 591 + ddqp->d_pad0 = 0; 592 + ddqp->d_pad = 0; 593 + 594 + ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); 595 + ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); 596 + ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); 597 + ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); 598 + ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); 599 + ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); 600 + 601 + ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); 602 + ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); 603 + ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); 604 + 605 + ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); 606 + ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); 607 + ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); 608 + 609 + ddqp->d_btimer = cpu_to_be32(dqp->q_blk.timer); 610 + ddqp->d_itimer = cpu_to_be32(dqp->q_ino.timer); 611 + ddqp->d_rtbtimer = cpu_to_be32(dqp->q_rtb.timer); 500 612 } 501 613 502 614 /* Allocate and initialize the dquot buffer for this in-core dquot. */ ··· 610 592 xfs_qm_dqread( 611 593 struct xfs_mount *mp, 612 594 xfs_dqid_t id, 613 - uint type, 595 + xfs_dqtype_t type, 614 596 bool can_alloc, 615 597 struct xfs_dquot **dqpp) 616 598 { ··· 635 617 * further. 
636 618 */ 637 619 ASSERT(xfs_buf_islocked(bp)); 638 - xfs_dquot_from_disk(dqp, bp); 639 - 620 + error = xfs_dquot_from_disk(dqp, bp); 640 621 xfs_buf_relse(bp); 622 + if (error) 623 + goto err; 624 + 641 625 *dqpp = dqp; 642 626 return error; 643 627 ··· 658 638 static int 659 639 xfs_dq_get_next_id( 660 640 struct xfs_mount *mp, 661 - uint type, 641 + xfs_dqtype_t type, 662 642 xfs_dqid_t *id) 663 643 { 664 644 struct xfs_inode *quotip = xfs_quota_inode(mp, type); ··· 726 706 } 727 707 728 708 xfs_dqlock(dqp); 729 - if (dqp->dq_flags & XFS_DQ_FREEING) { 709 + if (dqp->q_flags & XFS_DQFLAG_FREEING) { 730 710 xfs_dqunlock(dqp); 731 711 mutex_unlock(&qi->qi_tree_lock); 732 712 trace_xfs_dqget_freeing(dqp); ··· 782 762 static int 783 763 xfs_qm_dqget_checks( 784 764 struct xfs_mount *mp, 785 - uint type) 765 + xfs_dqtype_t type) 786 766 { 787 767 if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) 788 768 return -ESRCH; 789 769 790 770 switch (type) { 791 - case XFS_DQ_USER: 771 + case XFS_DQTYPE_USER: 792 772 if (!XFS_IS_UQUOTA_ON(mp)) 793 773 return -ESRCH; 794 774 return 0; 795 - case XFS_DQ_GROUP: 775 + case XFS_DQTYPE_GROUP: 796 776 if (!XFS_IS_GQUOTA_ON(mp)) 797 777 return -ESRCH; 798 778 return 0; 799 - case XFS_DQ_PROJ: 779 + case XFS_DQTYPE_PROJ: 800 780 if (!XFS_IS_PQUOTA_ON(mp)) 801 781 return -ESRCH; 802 782 return 0; ··· 814 794 xfs_qm_dqget( 815 795 struct xfs_mount *mp, 816 796 xfs_dqid_t id, 817 - uint type, 797 + xfs_dqtype_t type, 818 798 bool can_alloc, 819 799 struct xfs_dquot **O_dqpp) 820 800 { ··· 864 844 xfs_qm_dqget_uncached( 865 845 struct xfs_mount *mp, 866 846 xfs_dqid_t id, 867 - uint type, 847 + xfs_dqtype_t type, 868 848 struct xfs_dquot **dqpp) 869 849 { 870 850 int error; ··· 880 860 xfs_dqid_t 881 861 xfs_qm_id_for_quotatype( 882 862 struct xfs_inode *ip, 883 - uint type) 863 + xfs_dqtype_t type) 884 864 { 885 865 switch (type) { 886 - case XFS_DQ_USER: 866 + case XFS_DQTYPE_USER: 887 867 return i_uid_read(VFS_I(ip)); 888 - case 
XFS_DQ_GROUP: 868 + case XFS_DQTYPE_GROUP: 889 869 return i_gid_read(VFS_I(ip)); 890 - case XFS_DQ_PROJ: 870 + case XFS_DQTYPE_PROJ: 891 871 return ip->i_d.di_projid; 892 872 } 893 873 ASSERT(0); ··· 902 882 int 903 883 xfs_qm_dqget_inode( 904 884 struct xfs_inode *ip, 905 - uint type, 885 + xfs_dqtype_t type, 906 886 bool can_alloc, 907 887 struct xfs_dquot **O_dqpp) 908 888 { ··· 988 968 xfs_qm_dqget_next( 989 969 struct xfs_mount *mp, 990 970 xfs_dqid_t id, 991 - uint type, 971 + xfs_dqtype_t type, 992 972 struct xfs_dquot **dqpp) 993 973 { 994 974 struct xfs_dquot *dqp; ··· 1068 1048 * from the AIL if it has not been re-logged, and unlocking the dquot's 1069 1049 * flush lock. This behavior is very similar to that of inodes.. 1070 1050 */ 1071 - STATIC void 1051 + static void 1072 1052 xfs_qm_dqflush_done( 1073 - struct xfs_buf *bp, 1074 1053 struct xfs_log_item *lip) 1075 1054 { 1076 1055 struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; ··· 1090 1071 test_bit(XFS_LI_FAILED, &lip->li_flags))) { 1091 1072 1092 1073 spin_lock(&ailp->ail_lock); 1074 + xfs_clear_li_failed(lip); 1093 1075 if (lip->li_lsn == qip->qli_flush_lsn) { 1094 1076 /* xfs_ail_update_finish() drops the AIL lock */ 1095 1077 tail_lsn = xfs_ail_delete_one(ailp, lip); 1096 1078 xfs_ail_update_finish(ailp, tail_lsn); 1097 1079 } else { 1098 - /* 1099 - * Clear the failed state since we are about to drop the 1100 - * flush lock 1101 - */ 1102 - xfs_clear_li_failed(lip); 1103 1080 spin_unlock(&ailp->ail_lock); 1104 1081 } 1105 1082 } ··· 1104 1089 * Release the dq's flush lock since we're done with it. 
1105 1090 */ 1106 1091 xfs_dqfunlock(dqp); 1092 + } 1093 + 1094 + void 1095 + xfs_dquot_done( 1096 + struct xfs_buf *bp) 1097 + { 1098 + struct xfs_log_item *lip, *n; 1099 + 1100 + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 1101 + list_del_init(&lip->li_bio_list); 1102 + xfs_qm_dqflush_done(lip); 1103 + } 1104 + } 1105 + 1106 + /* Check incore dquot for errors before we flush. */ 1107 + static xfs_failaddr_t 1108 + xfs_qm_dqflush_check( 1109 + struct xfs_dquot *dqp) 1110 + { 1111 + xfs_dqtype_t type = xfs_dquot_type(dqp); 1112 + 1113 + if (type != XFS_DQTYPE_USER && 1114 + type != XFS_DQTYPE_GROUP && 1115 + type != XFS_DQTYPE_PROJ) 1116 + return __this_address; 1117 + 1118 + if (dqp->q_id == 0) 1119 + return NULL; 1120 + 1121 + if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && 1122 + !dqp->q_blk.timer) 1123 + return __this_address; 1124 + 1125 + if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && 1126 + !dqp->q_ino.timer) 1127 + return __this_address; 1128 + 1129 + if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && 1130 + !dqp->q_rtb.timer) 1131 + return __this_address; 1132 + 1133 + return NULL; 1107 1134 } 1108 1135 1109 1136 /* ··· 1164 1107 struct xfs_mount *mp = dqp->q_mount; 1165 1108 struct xfs_log_item *lip = &dqp->q_logitem.qli_item; 1166 1109 struct xfs_buf *bp; 1167 - struct xfs_dqblk *dqb; 1168 - struct xfs_disk_dquot *ddqp; 1110 + struct xfs_dqblk *dqblk; 1169 1111 xfs_failaddr_t fa; 1170 1112 int error; 1171 1113 ··· 1188 1132 if (error) 1189 1133 goto out_abort; 1190 1134 1191 - /* 1192 - * Calculate the location of the dquot inside the buffer. 
1193 - */ 1194 - dqb = bp->b_addr + dqp->q_bufoffset; 1195 - ddqp = &dqb->dd_diskdq; 1196 - 1197 - /* sanity check the in-core structure before we flush */ 1198 - fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id), 1199 - 0); 1135 + fa = xfs_qm_dqflush_check(dqp); 1200 1136 if (fa) { 1201 1137 xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", 1202 - be32_to_cpu(dqp->q_core.d_id), fa); 1138 + dqp->q_id, fa); 1203 1139 xfs_buf_relse(bp); 1204 1140 error = -EFSCORRUPTED; 1205 1141 goto out_abort; 1206 1142 } 1207 1143 1208 - /* This is the only portion of data that needs to persist */ 1209 - memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot)); 1144 + /* Flush the incore dquot to the ondisk buffer. */ 1145 + dqblk = bp->b_addr + dqp->q_bufoffset; 1146 + xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); 1210 1147 1211 1148 /* 1212 1149 * Clear the dirty field and remember the flush lsn for later use. 1213 1150 */ 1214 - dqp->dq_flags &= ~XFS_DQ_DIRTY; 1151 + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; 1215 1152 1216 1153 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1217 1154 &dqp->q_logitem.qli_item.li_lsn); ··· 1219 1170 * of a dquot without an up-to-date CRC getting to disk. 1220 1171 */ 1221 1172 if (xfs_sb_version_hascrc(&mp->m_sb)) { 1222 - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); 1223 - xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), 1173 + dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); 1174 + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), 1224 1175 XFS_DQUOT_CRC_OFF); 1225 1176 } 1226 1177 1227 1178 /* 1228 - * Attach an iodone routine so that we can remove this dquot from the 1229 - * AIL and release the flush lock once the dquot is synced to disk. 1179 + * Attach the dquot to the buffer so that we can remove this dquot from 1180 + * the AIL and release the flush lock once the dquot is synced to disk. 
1230 1181 */ 1231 - xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, 1232 - &dqp->q_logitem.qli_item); 1182 + bp->b_flags |= _XBF_DQUOTS; 1183 + list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); 1233 1184 1234 1185 /* 1235 1186 * If the buffer is pinned then push on the log so we won't ··· 1245 1196 return 0; 1246 1197 1247 1198 out_abort: 1248 - dqp->dq_flags &= ~XFS_DQ_DIRTY; 1199 + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; 1249 1200 xfs_trans_ail_delete(lip, 0); 1250 1201 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1251 1202 out_unlock: ··· 1266 1217 { 1267 1218 if (d1 && d2) { 1268 1219 ASSERT(d1 != d2); 1269 - if (be32_to_cpu(d1->q_core.d_id) > 1270 - be32_to_cpu(d2->q_core.d_id)) { 1220 + if (d1->q_id > d2->q_id) { 1271 1221 mutex_lock(&d2->q_qlock); 1272 1222 mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); 1273 1223 } else { ··· 1318 1270 int 1319 1271 xfs_qm_dqiterate( 1320 1272 struct xfs_mount *mp, 1321 - uint dqtype, 1273 + xfs_dqtype_t type, 1322 1274 xfs_qm_dqiterate_fn iter_fn, 1323 1275 void *priv) 1324 1276 { ··· 1327 1279 int error; 1328 1280 1329 1281 do { 1330 - error = xfs_qm_dqget_next(mp, id, dqtype, &dq); 1282 + error = xfs_qm_dqget_next(mp, id, type, &dq); 1331 1283 if (error == -ENOENT) 1332 1284 return 0; 1333 1285 if (error) 1334 1286 return error; 1335 1287 1336 - error = iter_fn(dq, dqtype, priv); 1337 - id = be32_to_cpu(dq->q_core.d_id); 1288 + error = iter_fn(dq, type, priv); 1289 + id = dq->q_id; 1338 1290 xfs_qm_dqput(dq); 1339 - id++; 1340 1291 } while (error == 0 && id != 0); 1341 1292 1342 1293 return error;
+89 -40
fs/xfs/xfs_dquot.h
··· 27 27 XFS_QLOWSP_MAX 28 28 }; 29 29 30 + struct xfs_dquot_res { 31 + /* Total resources allocated and reserved. */ 32 + xfs_qcnt_t reserved; 33 + 34 + /* Total resources allocated. */ 35 + xfs_qcnt_t count; 36 + 37 + /* Absolute and preferred limits. */ 38 + xfs_qcnt_t hardlimit; 39 + xfs_qcnt_t softlimit; 40 + 41 + /* 42 + * For root dquots, this is the default grace period, in seconds. 43 + * Otherwise, this is when the quota grace period expires, 44 + * in seconds since the Unix epoch. 45 + */ 46 + time64_t timer; 47 + 48 + /* 49 + * For root dquots, this is the maximum number of warnings that will 50 + * be issued for this quota type. Otherwise, this is the number of 51 + * warnings issued against this quota. Note that none of this is 52 + * implemented. 53 + */ 54 + xfs_qwarncnt_t warnings; 55 + }; 56 + 30 57 /* 31 58 * The incore dquot structure 32 59 */ 33 60 struct xfs_dquot { 34 - uint dq_flags; 35 61 struct list_head q_lru; 36 62 struct xfs_mount *q_mount; 63 + xfs_dqtype_t q_type; 64 + uint16_t q_flags; 65 + xfs_dqid_t q_id; 37 66 uint q_nrefs; 38 - xfs_daddr_t q_blkno; 39 67 int q_bufoffset; 68 + xfs_daddr_t q_blkno; 40 69 xfs_fileoff_t q_fileoffset; 41 70 42 - struct xfs_disk_dquot q_core; 71 + struct xfs_dquot_res q_blk; /* regular blocks */ 72 + struct xfs_dquot_res q_ino; /* inodes */ 73 + struct xfs_dquot_res q_rtb; /* realtime blocks */ 74 + 43 75 struct xfs_dq_logitem q_logitem; 44 - /* total regular nblks used+reserved */ 45 - xfs_qcnt_t q_res_bcount; 46 - /* total inos allocd+reserved */ 47 - xfs_qcnt_t q_res_icount; 48 - /* total realtime blks used+reserved */ 49 - xfs_qcnt_t q_res_rtbcount; 76 + 50 77 xfs_qcnt_t q_prealloc_lo_wmark; 51 78 xfs_qcnt_t q_prealloc_hi_wmark; 52 79 int64_t q_low_space[XFS_QLOWSP_MAX]; ··· 128 101 mutex_unlock(&dqp->q_qlock); 129 102 } 130 103 131 - static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) 104 + static inline int 105 + xfs_dquot_type(const struct xfs_dquot *dqp) 132 106 { 133 - switch 
(type & XFS_DQ_ALLTYPES) { 134 - case XFS_DQ_USER: 107 + return dqp->q_type & XFS_DQTYPE_REC_MASK; 108 + } 109 + 110 + static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type) 111 + { 112 + switch (type) { 113 + case XFS_DQTYPE_USER: 135 114 return XFS_IS_UQUOTA_ON(mp); 136 - case XFS_DQ_GROUP: 115 + case XFS_DQTYPE_GROUP: 137 116 return XFS_IS_GQUOTA_ON(mp); 138 - case XFS_DQ_PROJ: 117 + case XFS_DQTYPE_PROJ: 139 118 return XFS_IS_PQUOTA_ON(mp); 140 119 default: 141 120 return 0; 142 121 } 143 122 } 144 123 145 - static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type) 124 + static inline struct xfs_dquot *xfs_inode_dquot( 125 + struct xfs_inode *ip, 126 + xfs_dqtype_t type) 146 127 { 147 - switch (type & XFS_DQ_ALLTYPES) { 148 - case XFS_DQ_USER: 128 + switch (type) { 129 + case XFS_DQTYPE_USER: 149 130 return ip->i_udquot; 150 - case XFS_DQ_GROUP: 131 + case XFS_DQTYPE_GROUP: 151 132 return ip->i_gdquot; 152 - case XFS_DQ_PROJ: 133 + case XFS_DQTYPE_PROJ: 153 134 return ip->i_pdquot; 154 135 default: 155 136 return NULL; 156 137 } 138 + } 139 + 140 + /* Decide if the dquot's limits are actually being enforced. 
*/ 141 + static inline bool 142 + xfs_dquot_is_enforced( 143 + const struct xfs_dquot *dqp) 144 + { 145 + switch (xfs_dquot_type(dqp)) { 146 + case XFS_DQTYPE_USER: 147 + return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount); 148 + case XFS_DQTYPE_GROUP: 149 + return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount); 150 + case XFS_DQTYPE_PROJ: 151 + return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount); 152 + } 153 + ASSERT(0); 154 + return false; 157 155 } 158 156 159 157 /* ··· 189 137 { 190 138 int64_t freesp; 191 139 192 - freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; 140 + freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; 193 141 if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) 194 142 return true; 195 143 196 144 return false; 197 145 } 198 146 147 + void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); 148 + 199 149 #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 200 - #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 201 - #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 202 - #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 203 - #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) 150 + #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) 204 151 205 152 void xfs_qm_dqdestroy(struct xfs_dquot *dqp); 206 153 int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); 207 154 void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); 208 - void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, 209 - struct xfs_dquot *d); 210 - void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, 211 - struct xfs_dquot *d); 212 - xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); 155 + void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); 156 + void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); 157 + xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, 158 + xfs_dqtype_t type); 213 159 int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, 214 - uint type, bool can_alloc, 215 - struct 
xfs_dquot **dqpp); 216 - int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, 217 - bool can_alloc, 218 - struct xfs_dquot **dqpp); 160 + xfs_dqtype_t type, bool can_alloc, 161 + struct xfs_dquot **dqpp); 162 + int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type, 163 + bool can_alloc, struct xfs_dquot **dqpp); 219 164 int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, 220 - uint type, struct xfs_dquot **dqpp); 165 + xfs_dqtype_t type, struct xfs_dquot **dqpp); 221 166 int xfs_qm_dqget_uncached(struct xfs_mount *mp, 222 - xfs_dqid_t id, uint type, 223 - struct xfs_dquot **dqpp); 167 + xfs_dqid_t id, xfs_dqtype_t type, 168 + struct xfs_dquot **dqpp); 224 169 void xfs_qm_dqput(struct xfs_dquot *dqp); 225 170 226 171 void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); ··· 232 183 return dqp; 233 184 } 234 185 235 - typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype, 236 - void *priv); 237 - int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype, 186 + typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, 187 + xfs_dqtype_t type, void *priv); 188 + int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, 238 189 xfs_qm_dqiterate_fn iter_fn, void *priv); 239 190 240 191 #endif /* __XFS_DQUOT_H__ */
+5 -21
fs/xfs/xfs_dquot_item.c
··· 45 45 struct xfs_log_item *lip, 46 46 struct xfs_log_vec *lv) 47 47 { 48 + struct xfs_disk_dquot ddq; 48 49 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); 49 50 struct xfs_log_iovec *vecp = NULL; 50 51 struct xfs_dq_logformat *qlf; ··· 53 52 qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); 54 53 qlf->qlf_type = XFS_LI_DQUOT; 55 54 qlf->qlf_size = 2; 56 - qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); 55 + qlf->qlf_id = qlip->qli_dquot->q_id; 57 56 qlf->qlf_blkno = qlip->qli_dquot->q_blkno; 58 57 qlf->qlf_len = 1; 59 58 qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; 60 59 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); 61 60 62 - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, 63 - &qlip->qli_dquot->q_core, 61 + xfs_dquot_to_disk(&ddq, qlip->qli_dquot); 62 + 63 + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq, 64 64 sizeof(struct xfs_disk_dquot)); 65 65 } 66 66 ··· 113 111 */ 114 112 xfs_log_force(dqp->q_mount, 0); 115 113 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 116 - } 117 - 118 - /* 119 - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer 120 - * have been failed during writeback 121 - * 122 - * this informs the AIL that the dquot is already flush locked on the next push, 123 - * and acquires a hold on the buffer to ensure that it isn't reclaimed before 124 - * dirty data makes it to disk. 125 - */ 126 - STATIC void 127 - xfs_dquot_item_error( 128 - struct xfs_log_item *lip, 129 - struct xfs_buf *bp) 130 - { 131 - ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush)); 132 - xfs_set_li_failed(lip, bp); 133 114 } 134 115 135 116 STATIC uint ··· 201 216 .iop_release = xfs_qm_dquot_logitem_release, 202 217 .iop_committing = xfs_qm_dquot_logitem_committing, 203 218 .iop_push = xfs_qm_dquot_logitem_push, 204 - .iop_error = xfs_dquot_item_error 205 219 }; 206 220 207 221 /*
+7 -7
fs/xfs/xfs_dquot_item_recover.c
··· 39 39 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) 40 40 return; 41 41 42 - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 42 + type = recddq->d_type & XFS_DQTYPE_REC_MASK; 43 43 ASSERT(type); 44 44 if (log->l_quotaoffs_flag & type) 45 45 return; ··· 91 91 /* 92 92 * This type of quotas was turned off, so ignore this record. 93 93 */ 94 - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 94 + type = recddq->d_type & XFS_DQTYPE_REC_MASK; 95 95 ASSERT(type); 96 96 if (log->l_quotaoffs_flag & type) 97 97 return 0; ··· 108 108 */ 109 109 dq_f = item->ri_buf[0].i_addr; 110 110 ASSERT(dq_f); 111 - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); 111 + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id); 112 112 if (fa) { 113 113 xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", 114 114 dq_f->qlf_id, fa); ··· 153 153 154 154 ASSERT(dq_f->qlf_size == 2); 155 155 ASSERT(bp->b_mount == mp); 156 - bp->b_iodone = xlog_recover_iodone; 156 + bp->b_flags |= _XBF_LOGRECOVERY; 157 157 xfs_buf_delwri_queue(bp, buffer_list); 158 158 159 159 out_release: ··· 185 185 * group/project quotaoff or both. 186 186 */ 187 187 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 188 - log->l_quotaoffs_flag |= XFS_DQ_USER; 188 + log->l_quotaoffs_flag |= XFS_DQTYPE_USER; 189 189 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 190 - log->l_quotaoffs_flag |= XFS_DQ_PROJ; 190 + log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ; 191 191 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 192 - log->l_quotaoffs_flag |= XFS_DQ_GROUP; 192 + log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP; 193 193 194 194 return 0; 195 195 }
+4 -2
fs/xfs/xfs_extfree_item.c
··· 161 161 ((nextents - 1) * sizeof(xfs_extent_t))); 162 162 efip = kmem_zalloc(size, 0); 163 163 } else { 164 - efip = kmem_zone_zalloc(xfs_efi_zone, 0); 164 + efip = kmem_cache_zalloc(xfs_efi_zone, 165 + GFP_KERNEL | __GFP_NOFAIL); 165 166 } 166 167 167 168 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); ··· 333 332 (nextents - 1) * sizeof(struct xfs_extent), 334 333 0); 335 334 } else { 336 - efdp = kmem_zone_zalloc(xfs_efd_zone, 0); 335 + efdp = kmem_cache_zalloc(xfs_efd_zone, 336 + GFP_KERNEL | __GFP_NOFAIL); 337 337 } 338 338 339 339 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+22 -6
fs/xfs/xfs_file.c
···
 {
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode_log_item *iip = ip->i_itemp;
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error = 0;
 	int			log_flushed = 0;
···
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	if (xfs_ipincount(ip)) {
 		if (!datasync ||
-		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
-			lsn = ip->i_itemp->ili_last_lsn;
+		    (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+			lsn = iip->ili_last_lsn;
 	}
 
 	if (lsn) {
 		error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
-		ip->i_itemp->ili_fsync_fields = 0;
+		spin_lock(&iip->ili_lock);
+		iip->ili_fsync_fields = 0;
+		spin_unlock(&iip->ili_lock);
 	}
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
···
 	/* Prepare and then clone file data. */
 	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
 			&len, remap_flags);
-	if (ret < 0 || len == 0)
+	if (ret || len == 0)
 		return ret;
 
 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
···
 	if (mp->m_flags & XFS_MOUNT_WSYNC)
 		xfs_log_force_inode(dest);
 out_unlock:
-	xfs_reflink_remap_unlock(file_in, file_out);
+	xfs_iunlock2_io_mmap(src, dest);
 	if (ret)
 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
 	return remapped > 0 ? remapped : ret;
···
 		return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
+static void
+xfs_filemap_map_pages(
+	struct vm_fault		*vmf,
+	pgoff_t			start_pgoff,
+	pgoff_t			end_pgoff)
+{
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
+
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+}
+
 static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.huge_fault	= xfs_filemap_huge_fault,
-	.map_pages	= filemap_map_pages,
+	.map_pages	= xfs_filemap_map_pages,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
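The xfs_file.c hunk above moves the clearing of `ili_fsync_fields` under the new `ili_lock` spinlock, so fsync completion and the logging paths can no longer race on the dirty-field mask. A minimal userspace sketch of that pattern (a pthread mutex standing in for the spinlock; `log_item`, `mark_dirty` and `clear_after_fsync` are hypothetical names, not kernel API):

```c
#include <assert.h>
#include <pthread.h>

/* Userspace analogue of the ili_fsync_fields handling: the dirty-field
 * mask is only read or modified while a dedicated lock (standing in for
 * iip->ili_lock) is held, so a concurrent logger never observes a
 * half-updated mask. */
struct log_item {
	pthread_mutex_t	lock;		/* stands in for ili_lock */
	unsigned int	fsync_fields;	/* stands in for ili_fsync_fields */
};

static void mark_dirty(struct log_item *iip, unsigned int fields)
{
	pthread_mutex_lock(&iip->lock);
	iip->fsync_fields |= fields;
	pthread_mutex_unlock(&iip->lock);
}

/* Called once the log force has completed, mirroring the hunk above. */
static void clear_after_fsync(struct log_item *iip)
{
	pthread_mutex_lock(&iip->lock);
	iip->fsync_fields = 0;
	pthread_mutex_unlock(&iip->lock);
}
```

In the kernel the same discipline also covers `ili_fields`/`ili_last_fields`, as the later xfs_inode.c hunks show.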
fs/xfs/xfs_icache.c (+112 -264)
···
 	struct xfs_inode	*ip;
 
 	/*
-	 * if this didn't occur in transactions, we could use
-	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-	 * code up to do this anyway.
+	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
+	 * and return NULL here on ENOMEM.
 	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, 0);
-	if (!ip)
-		return NULL;
+	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+
 	if (inode_init_always(mp->m_super, VFS_I(ip))) {
 		kmem_cache_free(xfs_inode_zone, ip);
 		return NULL;
···
 {
 	/* asserts to verify all state is correct here */
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
 	XFS_STATS_DEC(ip->i_mount, vn_active);
 
 	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
···
 }
 
 /*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
+ * Queue background inode reclaim work if there are reclaimable inodes and there
+ * isn't reclaim work already scheduled or in progress.
 */
 static void
 xfs_reclaim_work_queue(
···
 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
 	rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_reclaim_work);
-
-	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_reclaim_work_queue(mp);
 }
 
 static void
···
 }
 
 /*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
+ * Look up an inode by number in the given file system. The inode is looked up
+ * in the cache held in each AG. If the inode is found in the cache, initialise
+ * the vfs inode if necessary.
  *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
+ * If it is not in core, read it in from the file system's device, add it to the
+ * cache and initialise the vfs inode.
  *
  * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *		 for xfs_ilock() for a list of valid values.
+ * Inode lookup is only done during metadata operations and not as part of the
+ * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
  */
 int
 xfs_iget(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp)
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		ino,
+	uint			flags,
+	uint			lock_flags,
+	struct xfs_inode	**ipp)
 {
-	xfs_inode_t	*ip;
-	int		error;
-	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
+	struct xfs_inode	*ip;
+	struct xfs_perag	*pag;
+	xfs_agino_t		agino;
+	int			error;
 
-	/*
-	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
-	 * doesn't get freed while it's being referenced during a
-	 * radix tree traversal here.  It assumes this function
-	 * aqcuires only the ILOCK (and therefore it has no need to
-	 * involve the IOLOCK in this synchronization).
-	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
 
 	/* reject inode numbers outside existing AGs */
···
 
 	ASSERT(rcu_read_lock_held());
 
-	/*
-	 * check for stale RCU freed inode
-	 *
-	 * If the inode has been reallocated, it doesn't matter if it's not in
-	 * the AG we are walking - we are walking for writeback, so if it
-	 * passes all the "valid inode" checks and is dirty, then we'll write
-	 * it back anyway.  If it has been reallocated and still being
-	 * initialised, the XFS_INEW check below will catch it.
-	 */
+	/* Check for stale RCU freed inode */
 	spin_lock(&ip->i_flags_lock);
 	if (!ip->i_ino)
 		goto out_unlock_noent;
···
 
 /*
  * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
+ *
+ * We have found this inode via a lookup under RCU, so the inode may have
+ * already been freed, or it may be in the process of being recycled by
+ * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
+ * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
+ * will not be set. Hence we need to check for both these flag conditions to
+ * avoid inodes that are no longer reclaim candidates.
+ *
+ * Note: checking for other state flags here, under the i_flags_lock or not, is
+ * racy and should be avoided. Those races should be resolved only after we have
+ * ensured that we are able to reclaim this inode and the world can see that we
+ * are going to reclaim it.
+ *
+ * Return true if we grabbed it, false otherwise.
  */
-STATIC int
+static bool
 xfs_reclaim_inode_grab(
-	struct xfs_inode	*ip,
-	int			flags)
+	struct xfs_inode	*ip)
 {
 	ASSERT(rcu_read_lock_held());
 
-	/* quick check for stale RCU freed inode */
-	if (!ip->i_ino)
-		return 1;
-
-	/*
-	 * If we are asked for non-blocking operation, do unlocked checks to
-	 * see if the inode already is being flushed or in reclaim to avoid
-	 * lock traffic.
-	 */
-	if ((flags & SYNC_TRYLOCK) &&
-	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
-		return 1;
-
-	/*
-	 * The radix tree lock here protects a thread in xfs_iget from racing
-	 * with us starting reclaim on the inode.  Once we have the
-	 * XFS_IRECLAIM flag set it will not touch us.
-	 *
-	 * Due to RCU lookup, we may find inodes that have been freed and only
-	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-	 * aren't candidates for reclaim at all, so we must check the
-	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-	 */
 	spin_lock(&ip->i_flags_lock);
 	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
 	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
 		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
-		return 1;
+		return false;
 	}
 	__xfs_iflags_set(ip, XFS_IRECLAIM);
 	spin_unlock(&ip->i_flags_lock);
-	return 0;
+	return true;
 }
 
 /*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
+ * Inode reclaim is non-blocking, so the default action if progress cannot be
+ * made is to "requeue" the inode for reclaim by unlocking it and clearing the
+ * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
+ * blocking anymore and hence we can wait for the inode to be able to reclaim
+ * it.
  *
- *	inode state	     iflush ret		required action
- *      ---------------      ----------         ---------------
- *	bad			-		reclaim
- *	shutdown		EIO		unpin and reclaim
- *	clean, unpinned		0		reclaim
- *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		requeue
- *	stale, pinned		EAGAIN		requeue
- *	dirty, async		-		requeue
- *	dirty, sync		0		reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting.  For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- *	bad		=> reclaim
- *	shutdown	=> unpin and reclaim
- *	pinned, async	=> requeue
- *	pinned, sync	=> unpin
- *	stale		=> reclaim
- *	clean		=> reclaim
- *	dirty, async	=> requeue
- *	dirty, sync	=> flush, wait and reclaim
+ * We do no IO here - if callers require inodes to be cleaned they must push the
+ * AIL first to trigger writeback of dirty inodes. This enables writeback to be
+ * done in the background in a non-blocking manner, and enables memory reclaim
+ * to make progress without blocking.
  */
-STATIC int
+static void
 xfs_reclaim_inode(
 	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			sync_mode)
+	struct xfs_perag	*pag)
 {
-	struct xfs_buf		*bp = NULL;
 	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
-	int			error;
 
-restart:
-	error = 0;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (!xfs_iflock_nowait(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out;
-		xfs_iflock(ip);
-	}
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+		goto out;
+	if (!xfs_iflock_nowait(ip))
+		goto out_iunlock;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		xfs_iunpin_wait(ip);
···
 		xfs_iflush_abort(ip);
 		goto reclaim;
 	}
-	if (xfs_ipincount(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out_ifunlock;
-		xfs_iunpin_wait(ip);
-	}
-	if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		goto reclaim;
-	}
-
-	/*
-	 * Never flush out dirty data during non-blocking reclaim, as it would
-	 * just contend with AIL pushing trying to do the same job.
-	 */
-	if (!(sync_mode & SYNC_WAIT))
+	if (xfs_ipincount(ip))
+		goto out_ifunlock;
+	if (!xfs_inode_clean(ip))
 		goto out_ifunlock;
 
-	/*
-	 * Now we have an inode that needs flushing.
-	 *
-	 * Note that xfs_iflush will never block on the inode buffer lock, as
-	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
-	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
-	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
-	 * result in an ABBA deadlock with xfs_ifree_cluster().
-	 *
-	 * As xfs_ifree_cluser() must gather all inodes that are active in the
-	 * cache to mark them stale, if we hit this case we don't actually want
-	 * to do IO here - we want the inode marked stale so we can simply
-	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
-	 * inode, back off and try again.  Hopefully the next pass through will
-	 * see the stale flag set on the inode.
-	 */
-	error = xfs_iflush(ip, &bp);
-	if (error == -EAGAIN) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		/* backoff longer than in xfs_ifree_cluster */
-		delay(2);
-		goto restart;
-	}
-
-	if (!error) {
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-	}
-
+	xfs_ifunlock(ip);
 reclaim:
 	ASSERT(!xfs_isiflocked(ip));
···
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	ASSERT(xfs_inode_clean(ip));
 
 	__xfs_inode_free(ip);
-	return error;
+	return;
 
 out_ifunlock:
 	xfs_ifunlock(ip);
+out_iunlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
 	xfs_iflags_clear(ip, XFS_IRECLAIM);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * We could return -EAGAIN here to make reclaim rescan the inode tree in
-	 * a short while. However, this just burns CPU time scanning the tree
-	 * waiting for IO to complete and the reclaim work never goes back to
-	 * the idle state. Instead, return 0 to let the next scheduled
-	 * background reclaim attempt to reclaim the inode again.
-	 */
-	return 0;
 }
 
 /*
···
  * corrupted, we still want to try to reclaim all the inodes. If we don't,
  * then a shut down during filesystem unmount reclaim walk leak all the
  * unreclaimed inodes.
+ *
+ * Returns non-zero if any AGs or inodes were skipped in the reclaim pass
+ * so that callers that want to block until all dirty inodes are written back
+ * and reclaimed can sanely loop.
  */
-STATIC int
+static void
 xfs_reclaim_inodes_ag(
 	struct xfs_mount	*mp,
-	int			flags,
 	int			*nr_to_scan)
 {
 	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-	int			trylock = flags & SYNC_TRYLOCK;
-	int			skipped;
+	xfs_agnumber_t		ag = 0;
 
-restart:
-	ag = 0;
-	skipped = 0;
 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
 		unsigned long	first_index = 0;
 		int		done = 0;
···
 
 		ag = pag->pag_agno + 1;
 
-		if (trylock) {
-			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-				skipped++;
-				xfs_perag_put(pag);
-				continue;
-			}
-			first_index = pag->pag_ici_reclaim_cursor;
-		} else
-			mutex_lock(&pag->pag_ici_reclaim_lock);
-
+		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
 		do {
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int	i;
···
 			for (i = 0; i < nr_found; i++) {
 				struct xfs_inode *ip = batch[i];
 
-				if (done || xfs_reclaim_inode_grab(ip, flags))
+				if (done || !xfs_reclaim_inode_grab(ip))
 					batch[i] = NULL;
 
 				/*
···
 			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
-				if (!batch[i])
-					continue;
-				error = xfs_reclaim_inode(batch[i], pag, flags);
-				if (error && last_error != -EFSCORRUPTED)
-					last_error = error;
+				if (batch[i])
+					xfs_reclaim_inode(batch[i], pag);
 			}
 
 			*nr_to_scan -= XFS_LOOKUP_BATCH;
-
 			cond_resched();
-
 		} while (nr_found && !done && *nr_to_scan > 0);
 
-		if (trylock && !done)
-			pag->pag_ici_reclaim_cursor = first_index;
-		else
-			pag->pag_ici_reclaim_cursor = 0;
-		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		if (done)
+			first_index = 0;
+		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
 		xfs_perag_put(pag);
 	}
-
-	/*
-	 * if we skipped any AG, and we still have scan count remaining, do
-	 * another pass this time using blocking reclaim semantics (i.e
-	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
-	 * ensure that when we get more reclaimers than AGs we block rather
-	 * than spin trying to execute reclaim.
-	 */
-	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-		trylock = 0;
-		goto restart;
-	}
-	return last_error;
 }
 
-int
+void
 xfs_reclaim_inodes(
-	xfs_mount_t	*mp,
-	int		mode)
+	struct xfs_mount	*mp)
 {
 	int		nr_to_scan = INT_MAX;
 
-	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		xfs_ail_push_all_sync(mp->m_ail);
+		xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+	};
 }
 
 /*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
+ * The shrinker infrastructure determines how many inodes we should scan for
+ * reclaim. We want as many clean inodes ready to reclaim as possible, so we
+ * push the AIL here. We also want to proactively free up memory if we can to
+ * minimise the amount of work memory reclaim has to do so we kick the
+ * background reclaim if it isn't already scheduled.
  */
 long
 xfs_reclaim_inodes_nr(
···
 	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
-	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+	return 0;
 }
 
 /*
···
 	return true;
 }
 
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+	int		nr_to_scan = INT_MAX;
+
+	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+	xfs_reclaim_work_queue(mp);
+}
+
 STATIC int
 xfs_inode_free_eofblocks(
 	struct xfs_inode	*ip,
···
 	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
 
 	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
-		dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+		dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
 		if (dq && xfs_dquot_lowsp(dq)) {
 			eofb.eof_uid = VFS_I(ip)->i_uid;
 			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
···
 	}
 
 	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
-		dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+		dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
 		if (dq && xfs_dquot_lowsp(dq)) {
 			eofb.eof_gid = VFS_I(ip)->i_gid;
 			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
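The core of the new non-blocking reclaim above is `xfs_reclaim_inode_grab()`: an inode is only taken for reclaim if it is marked reclaimable and not already claimed, and claiming it is a flag set done under the flags lock. A small userspace sketch of that test-and-claim logic (flag names and `reclaim_grab` are illustrative, not kernel API; the `i_flags_lock` spinlock is elided here, so a real version would hold it across the test and the set):

```c
#include <assert.h>
#include <stdbool.h>

/* Userspace sketch of the xfs_reclaim_inode_grab() decision: grab only if
 * RECLAIMABLE is set and RECLAIM is not, and set RECLAIM on success so a
 * racing reclaimer sees the inode as already claimed. */
#define F_RECLAIMABLE	0x1	/* stands in for XFS_IRECLAIMABLE */
#define F_RECLAIM	0x2	/* stands in for XFS_IRECLAIM */

static bool reclaim_grab(unsigned int *flags)
{
	if (!(*flags & F_RECLAIMABLE) || (*flags & F_RECLAIM))
		return false;		/* not a reclaim candidate */
	*flags |= F_RECLAIM;
	return true;
}
```

Returning `bool` rather than the old 0/non-zero convention is what lets the batch loop above read `if (done || !xfs_reclaim_inode_grab(ip))` naturally.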
fs/xfs/xfs_icache.h (+1 -4)
···
 	__u64		eof_min_file_size;
 };
 
-#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
-#define SYNC_TRYLOCK		0x0002	/* only try to lock inodes */
-
 /*
  * tags for inode radix tree
  */
···
 
 void xfs_reclaim_worker(struct work_struct *work);
 
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+void xfs_reclaim_inodes(struct xfs_mount *mp);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
fs/xfs/xfs_icreate_item.c (+1 -1)
···
 {
 	struct xfs_icreate_item	*icp;
 
-	icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
+	icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);
 
 	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
 			  &xfs_icreate_item_ops);
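Several hunks in this series make the same conversion: `kmem_zone_zalloc(zone, 0)` becomes `kmem_cache_zalloc(zone, GFP_KERNEL | __GFP_NOFAIL)`, i.e. a zeroed allocation that is not allowed to fail (the kernel allocator blocks and retries internally rather than returning NULL). A hedged userspace analogue of those semantics (`zalloc_nofail` is a hypothetical name; in userspace we retry `calloc` in a loop, which is only a rough stand-in for how `__GFP_NOFAIL` behaves):

```c
#include <assert.h>
#include <stdlib.h>

/* Userspace analogue of kmem_cache_zalloc(zone, GFP_KERNEL | __GFP_NOFAIL):
 * return zeroed memory and never return NULL, retrying on failure instead
 * of propagating it to the caller. */
static void *zalloc_nofail(size_t size)
{
	void *p;

	while ((p = calloc(1, size)) == NULL)
		;	/* __GFP_NOFAIL semantics: keep trying, never fail */
	return p;
}
```

This is why the callers above could drop their `if (!ip) return NULL;` style checks: with `__GFP_NOFAIL` the allocation is guaranteed to succeed.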
fs/xfs/xfs_inode.c (+322 -382)
···
  */
 #define	XFS_ITRUNC_MAX_EXTENTS	2
 
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
 STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
···
 		return error;
 	}
 
+	/*
+	 * We do not hold the inode locked across the entire rolling transaction
+	 * here. We only need to hold it for the first transaction that
+	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+	 * here breaks the relationship between cluster buffer invalidation and
+	 * stale inode invalidation on cluster buffer item journal commit
+	 * completion, and can result in leaving dirty stale inodes hanging
+	 * around in memory.
+	 *
+	 * We have no need for serialising this inode operation against other
+	 * operations - we freed the inode and hence reallocation is required
+	 * and that will serialise on reallocating the space the deferops need
+	 * to free. Hence we can unlock the inode on the first commit of
+	 * the transaction rather than roll it right through the deferops. This
+	 * avoids relogging the XFS_ISTALE inode.
+	 *
+	 * We check that xfs_ifree() hasn't grown an internal transaction roll
+	 * by asserting that the inode is still locked when it returns.
+	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	error = xfs_ifree(tp, ip);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	if (error) {
 		/*
 		 * If we fail to free the inode, shut down.  The cancel
···
 			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
 		xfs_trans_cancel(tp);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		return error;
 	}
···
 		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
 			__func__, error);
 
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return 0;
 }
···
 	xfs_dinode_calc_crc(mp, dip);
 	xfs_trans_inode_buf(tp, ibp);
 	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-	xfs_inobp_check(mp, ibp);
 }
 
 /* Set an in-core inode's unlinked pointer and return the old value. */
···
 	}
 
 	if (next_agino != NULLAGINO) {
-		struct xfs_perag	*pag;
 		xfs_agino_t		old_agino;
 
 		/*
···
 		 * agino has been unlinked, add a backref from the next inode
 		 * back to agino.
 		 */
-		pag = xfs_perag_get(mp, agno);
-		error = xfs_iunlink_add_backref(pag, agino, next_agino);
-		xfs_perag_put(pag);
+		error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
 		if (error)
 			return error;
 	}
···
 	struct xfs_buf		*agibp;
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
-	struct xfs_perag	*pag = NULL;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agino_t		next_agino;
···
 	 * this inode's backref to point from the next inode.
 	 */
 	if (next_agino != NULLAGINO) {
-		pag = xfs_perag_get(mp, agno);
-		error = xfs_iunlink_change_backref(pag, next_agino,
+		error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
 				NULLAGINO);
 		if (error)
-			goto out;
+			return error;
 	}
 
-	if (head_agino == agino) {
-		/* Point the head of the list to the next unlinked inode. */
-		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
-				next_agino);
-		if (error)
-			goto out;
-	} else {
+	if (head_agino != agino) {
 		struct xfs_imap	imap;
 		xfs_agino_t	prev_agino;
-
-		if (!pag)
-			pag = xfs_perag_get(mp, agno);
 
 		/* We need to search the list for the inode being freed. */
 		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
 				&prev_agino, &imap, &last_dip, &last_ibp,
-				pag);
+				agibp->b_pag);
 		if (error)
-			goto out;
+			return error;
 
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
···
 		 * change_backref takes care of deleting the backref if
 		 * next_agino is NULLAGINO.
 		 */
-		error = xfs_iunlink_change_backref(pag, agino, next_agino);
-		if (error)
-			goto out;
+		return xfs_iunlink_change_backref(agibp->b_pag, agino,
+				next_agino);
 	}
 
-out:
-	if (pag)
-		xfs_perag_put(pag);
-	return error;
+	/* Point the head of the list to the next unlinked inode. */
+	return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+			next_agino);
 }
 
 /*
- * Look up the inode number specified and mark it stale if it is found. If it is
- * dirty, return the inode so it can be attached to the cluster buffer so it can
- * be processed appropriately when the cluster free transaction completes.
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
  */
-static struct xfs_inode *
-xfs_ifree_get_one_inode(
-	struct xfs_perag	*pag,
+static void
+xfs_ifree_mark_inode_stale(
+	struct xfs_buf		*bp,
 	struct xfs_inode	*free_ip,
 	xfs_ino_t		inum)
 {
-	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_perag	*pag = bp->b_pag;
+	struct xfs_inode_log_item *iip;
 	struct xfs_inode	*ip;
 
 retry:
···
 	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
 
 	/* Inode not in memory, nothing to do */
-	if (!ip)
-		goto out_rcu_unlock;
+	if (!ip) {
+		rcu_read_unlock();
+		return;
+	}
 
 	/*
 	 * because this is an RCU protected lookup, we could find a recently
···
 	spin_lock(&ip->i_flags_lock);
 	if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
 		spin_unlock(&ip->i_flags_lock);
-		goto out_rcu_unlock;
+		rcu_read_unlock();
+		return;
 	}
-	spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
···
 	 */
 	if (ip != free_ip) {
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+			spin_unlock(&ip->i_flags_lock);
 			rcu_read_unlock();
 			delay(1);
 			goto retry;
 		}
-
-		/*
-		 * Check the inode number again in case we're racing with
-		 * freeing in xfs_reclaim_inode(). See the comments in that
-		 * function for more information as to why the initial check is
-		 * not sufficient.
-		 */
-		if (ip->i_ino != inum) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			goto out_rcu_unlock;
-		}
 	}
+	ip->i_flags |= XFS_ISTALE;
+	spin_unlock(&ip->i_flags_lock);
 	rcu_read_unlock();
-
-	xfs_iflock(ip);
-	xfs_iflags_set(ip, XFS_ISTALE);
 
 	/*
-	 * We don't need to attach clean inodes or those only with unlogged
-	 * changes (which we throw away, anyway).
+	 * If we can't get the flush lock, the inode is already attached. All
+	 * we needed to do here is mark the inode stale so buffer IO completion
+	 * will remove it from the AIL.
 	 */
-	if (!ip->i_itemp || xfs_inode_clean(ip)) {
-		ASSERT(ip != free_ip);
-		xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		goto out_no_inode;
+	iip = ip->i_itemp;
+	if (!xfs_iflock_nowait(ip)) {
+		ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+		ASSERT(iip->ili_last_fields);
+		goto out_iunlock;
 	}
-	return ip;
 
-out_rcu_unlock:
-	rcu_read_unlock();
-out_no_inode:
-	return NULL;
+	/*
+	 * Inodes not attached to the buffer can be released immediately.
+	 * Everything else has to go through xfs_iflush_abort() on journal
+	 * commit as the flock synchronises removal of the inode from the
+	 * cluster buffer against inode reclaim.
+	 */
+	if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
+		xfs_ifunlock(ip);
+		goto out_iunlock;
+	}
+
+	/* we have a dirty inode in memory that has not yet been flushed. */
+	spin_lock(&iip->ili_lock);
+	iip->ili_last_fields = iip->ili_fields;
+	iip->ili_fields = 0;
+	iip->ili_fsync_fields = 0;
+	spin_unlock(&iip->ili_lock);
+	ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+	if (ip != free_ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 }
 
 /*
···
  */
 STATIC int
 xfs_ifree_cluster(
-	xfs_inode_t		*free_ip,
-	xfs_trans_t		*tp,
+	struct xfs_inode	*free_ip,
+	struct xfs_trans	*tp,
 	struct xfs_icluster	*xic)
 {
-	xfs_mount_t		*mp = free_ip->i_mount;
+	struct xfs_mount	*mp = free_ip->i_mount;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	struct xfs_buf		*bp;
+	xfs_daddr_t		blkno;
+	xfs_ino_t		inum = xic->first_ino;
 	int			nbufs;
 	int			i, j;
 	int			ioffset;
-	xfs_daddr_t		blkno;
-	xfs_buf_t		*bp;
-	xfs_inode_t		*ip;
-	struct xfs_inode_log_item *iip;
-	struct xfs_log_item	*lip;
-	struct xfs_perag	*pag;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	xfs_ino_t		inum;
 	int			error;
 
-	inum = xic->first_ino;
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
 
 	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
···
 		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
 				mp->m_bsize * igeo->blocks_per_cluster,
 				XBF_UNMAPPED, &bp);
-		if (error) {
-			xfs_perag_put(pag);
+		if (error)
 			return error;
-		}
 
 		/*
 		 * This buffer may not have been correctly initialised as we
···
 		bp->b_ops = &xfs_inode_buf_ops;
 
 		/*
-		 * Walk the inodes already attached to the buffer and mark them
-		 * stale. These will all have the flush locks held, so an
-		 * in-memory inode walk can't lock them. By marking them all
-		 * stale first, we will not attempt to lock them in the loop
-		 * below as the XFS_ISTALE flag will be set.
+		 * Now we need to set all the cached clean inodes as XFS_ISTALE,
+		 * too. This requires lookups, and will skip inodes that we've
+		 * already marked XFS_ISTALE.
 		 */
-		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-			if (lip->li_type == XFS_LI_INODE) {
-				iip = (struct xfs_inode_log_item *)lip;
-				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = xfs_istale_done;
-				xfs_trans_ail_copy_lsn(mp->m_ail,
-							&iip->ili_flush_lsn,
-							&iip->ili_item.li_lsn);
-				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-			}
-		}
-
-
-		/*
-		 * For each inode in memory attempt to add it to the inode
-		 * buffer and set it up for being staled on buffer IO
-		 * completion.  This is safe as we've locked out tail pushing
-		 * and flushing by locking the buffer.
-		 *
-		 * We have already marked every inode that was part of a
-		 * transaction stale above, which means there is no point in
-		 * even trying to lock them.
-		 */
-		for (i = 0; i < igeo->inodes_per_cluster; i++) {
-			ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
-			if (!ip)
-				continue;
-
-			iip = ip->i_itemp;
-			iip->ili_last_fields = iip->ili_fields;
-			iip->ili_fields = 0;
-			iip->ili_fsync_fields = 0;
-			iip->ili_logged = 1;
-			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-						&iip->ili_item.li_lsn);
-
-			xfs_buf_attach_iodone(bp, xfs_istale_done,
-						&iip->ili_item);
-
-			if (ip != free_ip)
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
+		for (i = 0; i < igeo->inodes_per_cluster; i++)
+			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
 
 		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
-
-	xfs_perag_put(pag);
 	return 0;
 }
···
 {
 	int			error;
 	struct xfs_icluster	xic = { 0 };
+	struct xfs_inode_log_item *iip = ip->i_itemp;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(VFS_I(ip)->i_nlink == 0);
···
 	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
 
 	/* Don't attempt to replay owner changes for a deleted inode */
-	ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+	spin_lock(&iip->ili_lock);
+	iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+	spin_unlock(&iip->ili_lock);
 
 	/*
 	 * Bump the generation count so no one will be confused
···
 	return error;
 }
 
-STATIC int
-xfs_iflush_cluster(
-	struct xfs_inode	*ip,
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	unsigned long		first_index, mask;
-	int			cilist_size;
-	struct xfs_inode	**cilist;
-	struct xfs_inode	*cip;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	int			error = 0;
-	int
By marking them all 2661 - * stale first, we will not attempt to lock them in the loop 2662 - * below as the XFS_ISTALE flag will be set. 2654 + * Now we need to set all the cached clean inodes as XFS_ISTALE, 2655 + * too. This requires lookups, and will skip inodes that we've 2656 + * already marked XFS_ISTALE. 2663 2657 */ 2664 - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 2665 - if (lip->li_type == XFS_LI_INODE) { 2666 - iip = (struct xfs_inode_log_item *)lip; 2667 - ASSERT(iip->ili_logged == 1); 2668 - lip->li_cb = xfs_istale_done; 2669 - xfs_trans_ail_copy_lsn(mp->m_ail, 2670 - &iip->ili_flush_lsn, 2671 - &iip->ili_item.li_lsn); 2672 - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 2673 - } 2674 - } 2675 - 2676 - 2677 - /* 2678 - * For each inode in memory attempt to add it to the inode 2679 - * buffer and set it up for being staled on buffer IO 2680 - * completion. This is safe as we've locked out tail pushing 2681 - * and flushing by locking the buffer. 2682 - * 2683 - * We have already marked every inode that was part of a 2684 - * transaction stale above, which means there is no point in 2685 - * even trying to lock them. 
2686 - */ 2687 - for (i = 0; i < igeo->inodes_per_cluster; i++) { 2688 - ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); 2689 - if (!ip) 2690 - continue; 2691 - 2692 - iip = ip->i_itemp; 2693 - iip->ili_last_fields = iip->ili_fields; 2694 - iip->ili_fields = 0; 2695 - iip->ili_fsync_fields = 0; 2696 - iip->ili_logged = 1; 2697 - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2698 - &iip->ili_item.li_lsn); 2699 - 2700 - xfs_buf_attach_iodone(bp, xfs_istale_done, 2701 - &iip->ili_item); 2702 - 2703 - if (ip != free_ip) 2704 - xfs_iunlock(ip, XFS_ILOCK_EXCL); 2705 - } 2658 + for (i = 0; i < igeo->inodes_per_cluster; i++) 2659 + xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); 2706 2660 2707 2661 xfs_trans_stale_inode_buf(tp, bp); 2708 2662 xfs_trans_binval(tp, bp); 2709 2663 } 2710 - 2711 - xfs_perag_put(pag); 2712 2664 return 0; 2713 2665 } 2714 2666 ··· 2685 2725 { 2686 2726 int error; 2687 2727 struct xfs_icluster xic = { 0 }; 2728 + struct xfs_inode_log_item *iip = ip->i_itemp; 2688 2729 2689 2730 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2690 2731 ASSERT(VFS_I(ip)->i_nlink == 0); ··· 2723 2762 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2724 2763 2725 2764 /* Don't attempt to replay owner changes for a deleted inode */ 2726 - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); 2765 + spin_lock(&iip->ili_lock); 2766 + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2767 + spin_unlock(&iip->ili_lock); 2727 2768 2728 2769 /* 2729 2770 * Bump the generation count so no one will be confused ··· 3432 3469 return error; 3433 3470 } 3434 3471 3435 - STATIC int 3436 - xfs_iflush_cluster( 3437 - struct xfs_inode *ip, 3438 - struct xfs_buf *bp) 3439 - { 3440 - struct xfs_mount *mp = ip->i_mount; 3441 - struct xfs_perag *pag; 3442 - unsigned long first_index, mask; 3443 - int cilist_size; 3444 - struct xfs_inode **cilist; 3445 - struct xfs_inode *cip; 3446 - struct xfs_ino_geometry *igeo = M_IGEO(mp); 3447 - int error = 0; 3448 - int 
nr_found; 3449 - int clcount = 0; 3450 - int i; 3451 - 3452 - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 3453 - 3454 - cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); 3455 - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); 3456 - if (!cilist) 3457 - goto out_put; 3458 - 3459 - mask = ~(igeo->inodes_per_cluster - 1); 3460 - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 3461 - rcu_read_lock(); 3462 - /* really need a gang lookup range call here */ 3463 - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, 3464 - first_index, igeo->inodes_per_cluster); 3465 - if (nr_found == 0) 3466 - goto out_free; 3467 - 3468 - for (i = 0; i < nr_found; i++) { 3469 - cip = cilist[i]; 3470 - if (cip == ip) 3471 - continue; 3472 - 3473 - /* 3474 - * because this is an RCU protected lookup, we could find a 3475 - * recently freed or even reallocated inode during the lookup. 3476 - * We need to check under the i_flags_lock for a valid inode 3477 - * here. Skip it if it is not valid or the wrong inode. 3478 - */ 3479 - spin_lock(&cip->i_flags_lock); 3480 - if (!cip->i_ino || 3481 - __xfs_iflags_test(cip, XFS_ISTALE)) { 3482 - spin_unlock(&cip->i_flags_lock); 3483 - continue; 3484 - } 3485 - 3486 - /* 3487 - * Once we fall off the end of the cluster, no point checking 3488 - * any more inodes in the list because they will also all be 3489 - * outside the cluster. 3490 - */ 3491 - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { 3492 - spin_unlock(&cip->i_flags_lock); 3493 - break; 3494 - } 3495 - spin_unlock(&cip->i_flags_lock); 3496 - 3497 - /* 3498 - * Do an un-protected check to see if the inode is dirty and 3499 - * is a candidate for flushing. These checks will be repeated 3500 - * later after the appropriate locks are acquired. 3501 - */ 3502 - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) 3503 - continue; 3504 - 3505 - /* 3506 - * Try to get locks. 
If any are unavailable or it is pinned, 3507 - * then this inode cannot be flushed and is skipped. 3508 - */ 3509 - 3510 - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) 3511 - continue; 3512 - if (!xfs_iflock_nowait(cip)) { 3513 - xfs_iunlock(cip, XFS_ILOCK_SHARED); 3514 - continue; 3515 - } 3516 - if (xfs_ipincount(cip)) { 3517 - xfs_ifunlock(cip); 3518 - xfs_iunlock(cip, XFS_ILOCK_SHARED); 3519 - continue; 3520 - } 3521 - 3522 - 3523 - /* 3524 - * Check the inode number again, just to be certain we are not 3525 - * racing with freeing in xfs_reclaim_inode(). See the comments 3526 - * in that function for more information as to why the initial 3527 - * check is not sufficient. 3528 - */ 3529 - if (!cip->i_ino) { 3530 - xfs_ifunlock(cip); 3531 - xfs_iunlock(cip, XFS_ILOCK_SHARED); 3532 - continue; 3533 - } 3534 - 3535 - /* 3536 - * arriving here means that this inode can be flushed. First 3537 - * re-check that it's dirty before flushing. 3538 - */ 3539 - if (!xfs_inode_clean(cip)) { 3540 - error = xfs_iflush_int(cip, bp); 3541 - if (error) { 3542 - xfs_iunlock(cip, XFS_ILOCK_SHARED); 3543 - goto out_free; 3544 - } 3545 - clcount++; 3546 - } else { 3547 - xfs_ifunlock(cip); 3548 - } 3549 - xfs_iunlock(cip, XFS_ILOCK_SHARED); 3550 - } 3551 - 3552 - if (clcount) { 3553 - XFS_STATS_INC(mp, xs_icluster_flushcnt); 3554 - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3555 - } 3556 - 3557 - out_free: 3558 - rcu_read_unlock(); 3559 - kmem_free(cilist); 3560 - out_put: 3561 - xfs_perag_put(pag); 3562 - return error; 3563 - } 3564 - 3565 - /* 3566 - * Flush dirty inode metadata into the backing buffer. 3567 - * 3568 - * The caller must have the inode lock and the inode flush lock held. The 3569 - * inode lock will still be held upon return to the caller, and the inode 3570 - * flush lock will be released after the inode has reached the disk. 3571 - * 3572 - * The caller must write out the buffer returned in *bpp and release it. 
3573 - */ 3574 - int 3472 + static int 3575 3473 xfs_iflush( 3576 - struct xfs_inode *ip, 3577 - struct xfs_buf **bpp) 3578 - { 3579 - struct xfs_mount *mp = ip->i_mount; 3580 - struct xfs_buf *bp = NULL; 3581 - struct xfs_dinode *dip; 3582 - int error; 3583 - 3584 - XFS_STATS_INC(mp, xs_iflush_count); 3585 - 3586 - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3587 - ASSERT(xfs_isiflocked(ip)); 3588 - ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3589 - ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3590 - 3591 - *bpp = NULL; 3592 - 3593 - xfs_iunpin_wait(ip); 3594 - 3595 - /* 3596 - * For stale inodes we cannot rely on the backing buffer remaining 3597 - * stale in cache for the remaining life of the stale inode and so 3598 - * xfs_imap_to_bp() below may give us a buffer that no longer contains 3599 - * inodes below. We have to check this after ensuring the inode is 3600 - * unpinned so that it is safe to reclaim the stale inode after the 3601 - * flush call. 3602 - */ 3603 - if (xfs_iflags_test(ip, XFS_ISTALE)) { 3604 - xfs_ifunlock(ip); 3605 - return 0; 3606 - } 3607 - 3608 - /* 3609 - * Get the buffer containing the on-disk inode. We are doing a try-lock 3610 - * operation here, so we may get an EAGAIN error. In that case, return 3611 - * leaving the inode dirty. 3612 - * 3613 - * If we get any other error, we effectively have a corruption situation 3614 - * and we cannot flush the inode. Abort the flush and shut down. 3615 - */ 3616 - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); 3617 - if (error == -EAGAIN) { 3618 - xfs_ifunlock(ip); 3619 - return error; 3620 - } 3621 - if (error) 3622 - goto abort; 3623 - 3624 - /* 3625 - * If the buffer is pinned then push on the log now so we won't 3626 - * get stuck waiting in the write for too long. 
3627 - */ 3628 - if (xfs_buf_ispinned(bp)) 3629 - xfs_log_force(mp, 0); 3630 - 3631 - /* 3632 - * Flush the provided inode then attempt to gather others from the 3633 - * cluster into the write. 3634 - * 3635 - * Note: Once we attempt to flush an inode, we must run buffer 3636 - * completion callbacks on any failure. If this fails, simulate an I/O 3637 - * failure on the buffer and shut down. 3638 - */ 3639 - error = xfs_iflush_int(ip, bp); 3640 - if (!error) 3641 - error = xfs_iflush_cluster(ip, bp); 3642 - if (error) { 3643 - bp->b_flags |= XBF_ASYNC; 3644 - xfs_buf_ioend_fail(bp); 3645 - goto shutdown; 3646 - } 3647 - 3648 - *bpp = bp; 3649 - return 0; 3650 - 3651 - abort: 3652 - xfs_iflush_abort(ip); 3653 - shutdown: 3654 - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3655 - return error; 3656 - } 3657 - 3658 - STATIC int 3659 - xfs_iflush_int( 3660 3474 struct xfs_inode *ip, 3661 3475 struct xfs_buf *bp) 3662 3476 { ··· 3446 3706 ASSERT(xfs_isiflocked(ip)); 3447 3707 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3448 3708 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3449 - ASSERT(iip != NULL && iip->ili_fields != 0); 3709 + ASSERT(iip->ili_item.li_buf == bp); 3450 3710 3451 3711 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3452 3712 ··· 3541 3801 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3542 3802 if (XFS_IFORK_Q(ip)) 3543 3803 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3544 - xfs_inobp_check(mp, bp); 3545 3804 3546 3805 /* 3547 3806 * We've recorded everything logged in the inode, so we'd like to clear ··· 3557 3818 * know that the information those bits represent is permanently on 3558 3819 * disk. As long as the flush completes before the inode is logged 3559 3820 * again, then both ili_fields and ili_last_fields will be cleared. 
3560 - * 3561 - * We can play with the ili_fields bits here, because the inode lock 3562 - * must be held exclusively in order to set bits there and the flush 3563 - * lock protects the ili_last_fields bits. Set ili_logged so the flush 3564 - * done routine can tell whether or not to look in the AIL. Also, store 3565 - * the current LSN of the inode so that we can tell whether the item has 3566 - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we 3567 - * need the AIL lock, because it is a 64 bit value that cannot be read 3568 - * atomically. 3569 3821 */ 3570 3822 error = 0; 3571 3823 flush_out: 3824 + spin_lock(&iip->ili_lock); 3572 3825 iip->ili_last_fields = iip->ili_fields; 3573 3826 iip->ili_fields = 0; 3574 3827 iip->ili_fsync_fields = 0; 3575 - iip->ili_logged = 1; 3828 + spin_unlock(&iip->ili_lock); 3576 3829 3830 + /* 3831 + * Store the current LSN of the inode so that we can tell whether the 3832 + * item has moved in the AIL from xfs_iflush_done(). 3833 + */ 3577 3834 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3578 3835 &iip->ili_item.li_lsn); 3579 3836 3580 - /* 3581 - * Attach the inode item callback to the buffer whether the flush 3582 - * succeeded or not. If not, the caller will shut down and fail I/O 3583 - * completion on the buffer to remove the inode from the AIL and release 3584 - * the flush lock. 3585 - */ 3586 - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 3587 - 3588 3837 /* generate the checksum. */ 3589 3838 xfs_dinode_calc_crc(mp, dip); 3590 - 3591 - ASSERT(!list_empty(&bp->b_li_list)); 3592 - ASSERT(bp->b_iodone != NULL); 3593 3839 return error; 3840 + } 3841 + 3842 + /* 3843 + * Non-blocking flush of dirty inode metadata into the backing buffer. 3844 + * 3845 + * The caller must have a reference to the inode and hold the cluster buffer 3846 + * locked. 
The function will walk across all the inodes on the cluster buffer it 3847 + * can find and lock without blocking, and flush them to the cluster buffer. 3848 + * 3849 + * On successful flushing of at least one inode, the caller must write out the 3850 + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3851 + * the caller needs to release the buffer. On failure, the filesystem will be 3852 + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3853 + * will be returned. 3854 + */ 3855 + int 3856 + xfs_iflush_cluster( 3857 + struct xfs_buf *bp) 3858 + { 3859 + struct xfs_mount *mp = bp->b_mount; 3860 + struct xfs_log_item *lip, *n; 3861 + struct xfs_inode *ip; 3862 + struct xfs_inode_log_item *iip; 3863 + int clcount = 0; 3864 + int error = 0; 3865 + 3866 + /* 3867 + * We must use the safe variant here as on shutdown xfs_iflush_abort() 3868 + * can remove itself from the list. 3869 + */ 3870 + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 3871 + iip = (struct xfs_inode_log_item *)lip; 3872 + ip = iip->ili_inode; 3873 + 3874 + /* 3875 + * Quick and dirty check to avoid locks if possible. 3876 + */ 3877 + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) 3878 + continue; 3879 + if (xfs_ipincount(ip)) 3880 + continue; 3881 + 3882 + /* 3883 + * The inode is still attached to the buffer, which means it is 3884 + * dirty but reclaim might try to grab it. Check carefully for 3885 + * that, and grab the ilock while still holding the i_flags_lock 3886 + * to guarantee reclaim will not be able to reclaim this inode 3887 + * once we drop the i_flags_lock. 
3888 + */ 3889 + spin_lock(&ip->i_flags_lock); 3890 + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3891 + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) { 3892 + spin_unlock(&ip->i_flags_lock); 3893 + continue; 3894 + } 3895 + 3896 + /* 3897 + * ILOCK will pin the inode against reclaim and prevent 3898 + * concurrent transactions modifying the inode while we are 3899 + * flushing the inode. 3900 + */ 3901 + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3902 + spin_unlock(&ip->i_flags_lock); 3903 + continue; 3904 + } 3905 + spin_unlock(&ip->i_flags_lock); 3906 + 3907 + /* 3908 + * Skip inodes that are already flush locked as they have 3909 + * already been written to the buffer. 3910 + */ 3911 + if (!xfs_iflock_nowait(ip)) { 3912 + xfs_iunlock(ip, XFS_ILOCK_SHARED); 3913 + continue; 3914 + } 3915 + 3916 + /* 3917 + * Abort flushing this inode if we are shut down because the 3918 + * inode may not currently be in the AIL. This can occur when 3919 + * log I/O failure unpins the inode without inserting into the 3920 + * AIL, leaving a dirty/unpinned inode attached to the buffer 3921 + * that otherwise looks like it should be flushed. 
3922 + */ 3923 + if (XFS_FORCED_SHUTDOWN(mp)) { 3924 + xfs_iunpin_wait(ip); 3925 + /* xfs_iflush_abort() drops the flush lock */ 3926 + xfs_iflush_abort(ip); 3927 + xfs_iunlock(ip, XFS_ILOCK_SHARED); 3928 + error = -EIO; 3929 + continue; 3930 + } 3931 + 3932 + /* don't block waiting on a log force to unpin dirty inodes */ 3933 + if (xfs_ipincount(ip)) { 3934 + xfs_ifunlock(ip); 3935 + xfs_iunlock(ip, XFS_ILOCK_SHARED); 3936 + continue; 3937 + } 3938 + 3939 + if (!xfs_inode_clean(ip)) 3940 + error = xfs_iflush(ip, bp); 3941 + else 3942 + xfs_ifunlock(ip); 3943 + xfs_iunlock(ip, XFS_ILOCK_SHARED); 3944 + if (error) 3945 + break; 3946 + clcount++; 3947 + } 3948 + 3949 + if (error) { 3950 + bp->b_flags |= XBF_ASYNC; 3951 + xfs_buf_ioend_fail(bp); 3952 + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3953 + return error; 3954 + } 3955 + 3956 + if (!clcount) 3957 + return -EAGAIN; 3958 + 3959 + XFS_STATS_INC(mp, xs_icluster_flushcnt); 3960 + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3961 + return 0; 3962 + 3594 3963 } 3595 3964 3596 3965 /* Release an inode. */ ··· 3727 3880 if (!lsn) 3728 3881 return 0; 3729 3882 return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); 3883 + } 3884 + 3885 + /* 3886 + * Grab the exclusive iolock for a data copy from src to dest, making sure to 3887 + * abide vfs locking order (lowest pointer value goes first) and breaking the 3888 + * layout leases before proceeding. The loop is needed because we cannot call 3889 + * the blocking break_layout() with the iolocks held, and therefore have to 3890 + * back out both locks. 3891 + */ 3892 + static int 3893 + xfs_iolock_two_inodes_and_break_layout( 3894 + struct inode *src, 3895 + struct inode *dest) 3896 + { 3897 + int error; 3898 + 3899 + if (src > dest) 3900 + swap(src, dest); 3901 + 3902 + retry: 3903 + /* Wait to break both inodes' layouts before we start locking. 
*/ 3904 + error = break_layout(src, true); 3905 + if (error) 3906 + return error; 3907 + if (src != dest) { 3908 + error = break_layout(dest, true); 3909 + if (error) 3910 + return error; 3911 + } 3912 + 3913 + /* Lock one inode and make sure nobody got in and leased it. */ 3914 + inode_lock(src); 3915 + error = break_layout(src, false); 3916 + if (error) { 3917 + inode_unlock(src); 3918 + if (error == -EWOULDBLOCK) 3919 + goto retry; 3920 + return error; 3921 + } 3922 + 3923 + if (src == dest) 3924 + return 0; 3925 + 3926 + /* Lock the other inode and make sure nobody got in and leased it. */ 3927 + inode_lock_nested(dest, I_MUTEX_NONDIR2); 3928 + error = break_layout(dest, false); 3929 + if (error) { 3930 + inode_unlock(src); 3931 + inode_unlock(dest); 3932 + if (error == -EWOULDBLOCK) 3933 + goto retry; 3934 + return error; 3935 + } 3936 + 3937 + return 0; 3938 + } 3939 + 3940 + /* 3941 + * Lock two inodes so that userspace cannot initiate I/O via file syscalls or 3942 + * mmap activity. 3943 + */ 3944 + int 3945 + xfs_ilock2_io_mmap( 3946 + struct xfs_inode *ip1, 3947 + struct xfs_inode *ip2) 3948 + { 3949 + int ret; 3950 + 3951 + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); 3952 + if (ret) 3953 + return ret; 3954 + if (ip1 == ip2) 3955 + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); 3956 + else 3957 + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, 3958 + ip2, XFS_MMAPLOCK_EXCL); 3959 + return 0; 3960 + } 3961 + 3962 + /* Unlock both inodes to allow IO and mmap activity. */ 3963 + void 3964 + xfs_iunlock2_io_mmap( 3965 + struct xfs_inode *ip1, 3966 + struct xfs_inode *ip2) 3967 + { 3968 + bool same_inode = (ip1 == ip2); 3969 + 3970 + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3971 + if (!same_inode) 3972 + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3973 + inode_unlock(VFS_I(ip2)); 3974 + if (!same_inode) 3975 + inode_unlock(VFS_I(ip1)); 3730 3976 }
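The hunk above adds xfs_iolock_two_inodes_and_break_layout(), whose deadlock-avoidance trick is to sort the two inodes by pointer value before locking so that every caller acquires the pair in the same global order. A minimal sketch of that ordering rule, with the inode and its lock replaced by a hypothetical stand-in struct (no real threading; the point is the address-ordering invariant):

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in for an inode; the lock is modelled as a flag. */
struct node {
	bool locked;
};

/*
 * Acquire two locks in a globally consistent order: lowest address
 * first, mirroring the swap(src, dest) step in
 * xfs_iolock_two_inodes_and_break_layout(). Because every caller sorts
 * the pair the same way, two tasks can never take the locks in opposite
 * orders, which rules out ABBA deadlock. Returns the node locked first.
 */
static struct node *lock_two(struct node *a, struct node *b)
{
	if (a > b) {
		struct node *t = a;
		a = b;
		b = t;
	}
	assert(!a->locked);
	a->locked = true;
	if (b != a) {		/* same-inode case only locks once */
		assert(!b->locked);
		b->locked = true;
	}
	return a;
}

static void unlock_two(struct node *a, struct node *b)
{
	a->locked = false;
	b->locked = false;
}
```

Note how the same-inode case (src == dest) degenerates to a single lock, just as the real function skips the second break_layout() and inode_lock_nested() when src == dest.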
+4 -1
fs/xfs/xfs_inode.h
···
426 426		void xfs_iunpin_wait(xfs_inode_t *);
427 427		#define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
428 428	
429     -	int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
    429 +	int xfs_iflush_cluster(struct xfs_buf *);
430 430		void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
431 431				struct xfs_inode *ip1, uint ip1_mode);
432 432	
···
498 498		void xfs_iunlink_destroy(struct xfs_perag *pag);
499 499	
500 500		void xfs_end_io(struct work_struct *work);
    501 +
    502 +	int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
    503 +	void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
501 504	
502 505		#endif /* __XFS_INODE_H__ */
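The xfs_iflush_cluster() interface declared here replaces the old per-inode xfs_iflush(): it walks every inode attached to the cluster buffer, try-locks each one, skips anything contended, pinned, or clean, and flushes the rest, returning -EAGAIN if nothing was flushed. A toy model of that non-blocking scan, with the inode and its ILOCK replaced by a hypothetical stand-in struct (the real code also handles pin counts, shutdown, and reclaim races):

```c
#include <assert.h>
#include <stdbool.h>

/* Hypothetical stand-in for an in-memory inode on a cluster buffer. */
struct fake_inode {
	bool ilock_held;	/* models ILOCK contention from another task */
	bool dirty;
	bool flushed;
};

static bool fake_trylock(struct fake_inode *ip)
{
	if (ip->ilock_held)
		return false;
	ip->ilock_held = true;
	return true;
}

/*
 * Sketch of the scan in xfs_iflush_cluster(): try-lock every inode on
 * the buffer, skip anything contended or clean, flush the rest, and
 * report how many were flushed. A real caller maps a zero count to
 * -EAGAIN and releases the buffer.
 */
static int flush_cluster(struct fake_inode *ips, int n)
{
	int clcount = 0;

	for (int i = 0; i < n; i++) {
		if (!fake_trylock(&ips[i]))
			continue;	/* never block on a busy inode */
		if (ips[i].dirty) {
			ips[i].dirty = false;
			ips[i].flushed = true;
			clcount++;
		}
		ips[i].ilock_held = false;
	}
	return clcount;
}
```

Skipping contended inodes rather than waiting is what lets the AIL push run this from xfs_inode_item_push() without stalling: a skipped inode stays dirty and is simply picked up by a later push.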
+158 -158
fs/xfs/xfs_inode_item.c
··· 439 439 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; 440 440 441 441 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 442 + ASSERT(lip->li_buf); 442 443 443 444 trace_xfs_inode_pin(ip, _RET_IP_); 444 445 atomic_inc(&ip->i_pincount); ··· 451 450 * item which was previously pinned with a call to xfs_inode_item_pin(). 452 451 * 453 452 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. 453 + * 454 + * Note that unpin can race with inode cluster buffer freeing marking the buffer 455 + * stale. In that case, flush completions are run from the buffer unpin call, 456 + * which may happen before the inode is unpinned. If we lose the race, there 457 + * will be no buffer attached to the log item, but the inode will be marked 458 + * XFS_ISTALE. 454 459 */ 455 460 STATIC void 456 461 xfs_inode_item_unpin( ··· 466 459 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; 467 460 468 461 trace_xfs_inode_unpin(ip, _RET_IP_); 462 + ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); 469 463 ASSERT(atomic_read(&ip->i_pincount) > 0); 470 464 if (atomic_dec_and_test(&ip->i_pincount)) 471 465 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); 472 - } 473 - 474 - /* 475 - * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer 476 - * have been failed during writeback 477 - * 478 - * This informs the AIL that the inode is already flush locked on the next push, 479 - * and acquires a hold on the buffer to ensure that it isn't reclaimed before 480 - * dirty data makes it to disk. 
481 - */ 482 - STATIC void 483 - xfs_inode_item_error( 484 - struct xfs_log_item *lip, 485 - struct xfs_buf *bp) 486 - { 487 - ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); 488 - xfs_set_li_failed(lip, bp); 489 466 } 490 467 491 468 STATIC uint ··· 485 494 uint rval = XFS_ITEM_SUCCESS; 486 495 int error; 487 496 488 - if (xfs_ipincount(ip) > 0) 497 + ASSERT(iip->ili_item.li_buf); 498 + 499 + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || 500 + (ip->i_flags & XFS_ISTALE)) 489 501 return XFS_ITEM_PINNED; 490 502 491 - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) 503 + /* If the inode is already flush locked, we're already flushing. */ 504 + if (xfs_isiflocked(ip)) 505 + return XFS_ITEM_FLUSHING; 506 + 507 + if (!xfs_buf_trylock(bp)) 492 508 return XFS_ITEM_LOCKED; 493 - 494 - /* 495 - * Re-check the pincount now that we stabilized the value by 496 - * taking the ilock. 497 - */ 498 - if (xfs_ipincount(ip) > 0) { 499 - rval = XFS_ITEM_PINNED; 500 - goto out_unlock; 501 - } 502 - 503 - /* 504 - * Stale inode items should force out the iclog. 505 - */ 506 - if (ip->i_flags & XFS_ISTALE) { 507 - rval = XFS_ITEM_PINNED; 508 - goto out_unlock; 509 - } 510 - 511 - /* 512 - * Someone else is already flushing the inode. Nothing we can do 513 - * here but wait for the flush to finish and remove the item from 514 - * the AIL. 515 - */ 516 - if (!xfs_iflock_nowait(ip)) { 517 - rval = XFS_ITEM_FLUSHING; 518 - goto out_unlock; 519 - } 520 - 521 - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); 522 - ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); 523 509 524 510 spin_unlock(&lip->li_ailp->ail_lock); 525 511 526 - error = xfs_iflush(ip, &bp); 512 + /* 513 + * We need to hold a reference for flushing the cluster buffer as it may 514 + * fail the buffer without IO submission. 
In which case, we better get a 515 + * reference for that completion because otherwise we don't get a 516 + * reference for IO until we queue the buffer for delwri submission. 517 + */ 518 + xfs_buf_hold(bp); 519 + error = xfs_iflush_cluster(bp); 527 520 if (!error) { 528 521 if (!xfs_buf_delwri_queue(bp, buffer_list)) 529 522 rval = XFS_ITEM_FLUSHING; 530 523 xfs_buf_relse(bp); 531 - } else if (error == -EAGAIN) 524 + } else { 525 + /* 526 + * Release the buffer if we were unable to flush anything. On 527 + * any other error, the buffer has already been released. 528 + */ 529 + if (error == -EAGAIN) 530 + xfs_buf_relse(bp); 532 531 rval = XFS_ITEM_LOCKED; 532 + } 533 533 534 534 spin_lock(&lip->li_ailp->ail_lock); 535 - out_unlock: 536 - xfs_iunlock(ip, XFS_ILOCK_SHARED); 537 535 return rval; 538 536 } 539 537 ··· 601 621 .iop_committed = xfs_inode_item_committed, 602 622 .iop_push = xfs_inode_item_push, 603 623 .iop_committing = xfs_inode_item_committing, 604 - .iop_error = xfs_inode_item_error 605 624 }; 606 625 607 626 ··· 615 636 struct xfs_inode_log_item *iip; 616 637 617 638 ASSERT(ip->i_itemp == NULL); 618 - iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0); 639 + iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone, 640 + GFP_KERNEL | __GFP_NOFAIL); 619 641 620 642 iip->ili_inode = ip; 643 + spin_lock_init(&iip->ili_lock); 621 644 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 622 645 &xfs_inode_item_ops); 623 646 } ··· 629 648 */ 630 649 void 631 650 xfs_inode_item_destroy( 632 - xfs_inode_t *ip) 651 + struct xfs_inode *ip) 633 652 { 634 - kmem_free(ip->i_itemp->ili_item.li_lv_shadow); 635 - kmem_cache_free(xfs_ili_zone, ip->i_itemp); 653 + struct xfs_inode_log_item *iip = ip->i_itemp; 654 + 655 + ASSERT(iip->ili_item.li_buf == NULL); 656 + 657 + ip->i_itemp = NULL; 658 + kmem_free(iip->ili_item.li_lv_shadow); 659 + kmem_cache_free(xfs_ili_zone, iip); 636 660 } 637 661 638 662 639 663 /* 640 - * This is the inode flushing I/O completion routine. 
It is called 641 - * from interrupt level when the buffer containing the inode is 642 - * flushed to disk. It is responsible for removing the inode item 643 - * from the AIL if it has not been re-logged, and unlocking the inode's 644 - * flush lock. 645 - * 646 - * To reduce AIL lock traffic as much as possible, we scan the buffer log item 647 - * list for other inodes that will run this function. We remove them from the 648 - * buffer list so we can process all the inode IO completions in one AIL lock 649 - * traversal. 664 + * We only want to pull the item from the AIL if it is actually there 665 + * and its location in the log has not changed since we started the 666 + * flush. Thus, we only bother if the inode's lsn has not changed. 667 + */ 668 + static void 669 + xfs_iflush_ail_updates( 670 + struct xfs_ail *ailp, 671 + struct list_head *list) 672 + { 673 + struct xfs_log_item *lip; 674 + xfs_lsn_t tail_lsn = 0; 675 + 676 + /* this is an opencoded batch version of xfs_trans_ail_delete */ 677 + spin_lock(&ailp->ail_lock); 678 + list_for_each_entry(lip, list, li_bio_list) { 679 + xfs_lsn_t lsn; 680 + 681 + clear_bit(XFS_LI_FAILED, &lip->li_flags); 682 + if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) 683 + continue; 684 + 685 + lsn = xfs_ail_delete_one(ailp, lip); 686 + if (!tail_lsn && lsn) 687 + tail_lsn = lsn; 688 + } 689 + xfs_ail_update_finish(ailp, tail_lsn); 690 + } 691 + 692 + /* 693 + * Walk the list of inodes that have completed their IOs. If they are clean 694 + * remove them from the list and dissociate them from the buffer. Buffers that 695 + * are still dirty remain linked to the buffer and on the list. Caller must 696 + * handle them appropriately. 
697 + */ 698 + static void 699 + xfs_iflush_finish( 700 + struct xfs_buf *bp, 701 + struct list_head *list) 702 + { 703 + struct xfs_log_item *lip, *n; 704 + 705 + list_for_each_entry_safe(lip, n, list, li_bio_list) { 706 + struct xfs_inode_log_item *iip = INODE_ITEM(lip); 707 + bool drop_buffer = false; 708 + 709 + spin_lock(&iip->ili_lock); 710 + 711 + /* 712 + * Remove the reference to the cluster buffer if the inode is 713 + * clean in memory and drop the buffer reference once we've 714 + * dropped the locks we hold. 715 + */ 716 + ASSERT(iip->ili_item.li_buf == bp); 717 + if (!iip->ili_fields) { 718 + iip->ili_item.li_buf = NULL; 719 + list_del_init(&lip->li_bio_list); 720 + drop_buffer = true; 721 + } 722 + iip->ili_last_fields = 0; 723 + iip->ili_flush_lsn = 0; 724 + spin_unlock(&iip->ili_lock); 725 + xfs_ifunlock(iip->ili_inode); 726 + if (drop_buffer) 727 + xfs_buf_rele(bp); 728 + } 729 + } 730 + 731 + /* 732 + * Inode buffer IO completion routine. It is responsible for removing inodes 733 + * attached to the buffer from the AIL if they have not been re-logged, as well 734 + * as completing the flush and unlocking the inode. 650 735 */ 651 736 void 652 737 xfs_iflush_done( 653 - struct xfs_buf *bp, 654 - struct xfs_log_item *lip) 738 + struct xfs_buf *bp) 655 739 { 656 - struct xfs_inode_log_item *iip; 657 - struct xfs_log_item *blip, *n; 658 - struct xfs_ail *ailp = lip->li_ailp; 659 - int need_ail = 0; 660 - LIST_HEAD(tmp); 740 + struct xfs_log_item *lip, *n; 741 + LIST_HEAD(flushed_inodes); 742 + LIST_HEAD(ail_updates); 661 743 662 744 /* 663 - * Scan the buffer IO completions for other inodes being completed and 664 - * attach them to the current inode log item. 745 + * Pull the attached inodes from the buffer one at a time and take the 746 + * appropriate action on them. 
665 747 */ 748 + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 749 + struct xfs_inode_log_item *iip = INODE_ITEM(lip); 666 750 667 - list_add_tail(&lip->li_bio_list, &tmp); 668 - 669 - list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) { 670 - if (lip->li_cb != xfs_iflush_done) 751 + if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) { 752 + xfs_iflush_abort(iip->ili_inode); 753 + continue; 754 + } 755 + if (!iip->ili_last_fields) 671 756 continue; 672 757 673 - list_move_tail(&blip->li_bio_list, &tmp); 674 - /* 675 - * while we have the item, do the unlocked check for needing 676 - * the AIL lock. 677 - */ 678 - iip = INODE_ITEM(blip); 679 - if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || 680 - test_bit(XFS_LI_FAILED, &blip->li_flags)) 681 - need_ail++; 758 + /* Do an unlocked check for needing the AIL lock. */ 759 + if (iip->ili_flush_lsn == lip->li_lsn || 760 + test_bit(XFS_LI_FAILED, &lip->li_flags)) 761 + list_move_tail(&lip->li_bio_list, &ail_updates); 762 + else 763 + list_move_tail(&lip->li_bio_list, &flushed_inodes); 682 764 } 683 765 684 - /* make sure we capture the state of the initial inode. */ 685 - iip = INODE_ITEM(lip); 686 - if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || 687 - test_bit(XFS_LI_FAILED, &lip->li_flags)) 688 - need_ail++; 689 - 690 - /* 691 - * We only want to pull the item from the AIL if it is 692 - * actually there and its location in the log has not 693 - * changed since we started the flush. Thus, we only bother 694 - * if the ili_logged flag is set and the inode's lsn has not 695 - * changed. First we check the lsn outside 696 - * the lock since it's cheaper, and then we recheck while 697 - * holding the lock before removing the inode from the AIL. 
-	 */
-	if (need_ail) {
-		xfs_lsn_t	tail_lsn = 0;
-
-		/* this is an opencoded batch version of xfs_trans_ail_delete */
-		spin_lock(&ailp->ail_lock);
-		list_for_each_entry(blip, &tmp, li_bio_list) {
-			if (INODE_ITEM(blip)->ili_logged &&
-			    blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
-				/*
-				 * xfs_ail_update_finish() only cares about the
-				 * lsn of the first tail item removed, any
-				 * others will be at the same or higher lsn so
-				 * we just ignore them.
-				 */
-				xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
-				if (!tail_lsn && lsn)
-					tail_lsn = lsn;
-			} else {
-				xfs_clear_li_failed(blip);
-			}
-		}
-		xfs_ail_update_finish(ailp, tail_lsn);
+	if (!list_empty(&ail_updates)) {
+		xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
+		list_splice_tail(&ail_updates, &flushed_inodes);
 	}
 
-	/*
-	 * clean up and unlock the flush lock now we are done. We can clear the
-	 * ili_last_fields bits now that we know that the data corresponding to
-	 * them is safely on disk.
-	 */
-	list_for_each_entry_safe(blip, n, &tmp, li_bio_list) {
-		list_del_init(&blip->li_bio_list);
-		iip = INODE_ITEM(blip);
-		iip->ili_logged = 0;
-		iip->ili_last_fields = 0;
-		xfs_ifunlock(iip->ili_inode);
-	}
-	list_del(&tmp);
+	xfs_iflush_finish(bp, &flushed_inodes);
+	if (!list_empty(&flushed_inodes))
+		list_splice_tail(&flushed_inodes, &bp->b_li_list);
 }
 
 /*
···
  */
 void
 xfs_iflush_abort(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip)
 {
-	struct xfs_inode_log_item *iip = ip->i_itemp;
+	struct xfs_inode_log_item *iip = ip->i_itemp;
+	struct xfs_buf		*bp = NULL;
 
 	if (iip) {
-		xfs_trans_ail_delete(&iip->ili_item, 0);
-		iip->ili_logged = 0;
 		/*
-		 * Clear the ili_last_fields bits now that we know that the
-		 * data corresponding to them is safely on disk.
+		 * Clear the failed bit before removing the item from the AIL so
+		 * xfs_trans_ail_delete() doesn't try to clear and release the
+		 * buffer attached to the log item before we are done with it.
 		 */
-		iip->ili_last_fields = 0;
+		clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+		xfs_trans_ail_delete(&iip->ili_item, 0);
+
 		/*
 		 * Clear the inode logging fields so no more flushes are
 		 * attempted.
 		 */
+		spin_lock(&iip->ili_lock);
+		iip->ili_last_fields = 0;
 		iip->ili_fields = 0;
 		iip->ili_fsync_fields = 0;
+		iip->ili_flush_lsn = 0;
+		bp = iip->ili_item.li_buf;
+		iip->ili_item.li_buf = NULL;
+		list_del_init(&iip->ili_item.li_bio_list);
+		spin_unlock(&iip->ili_lock);
 	}
-	/*
-	 * Release the inode's flush lock since we're done with it.
-	 */
 	xfs_ifunlock(ip);
-}
-
-void
-xfs_istale_done(
-	struct xfs_buf		*bp,
-	struct xfs_log_item	*lip)
-{
-	xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
+	if (bp)
+		xfs_buf_rele(bp);
 }
 
 /*
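The flush-completion rewrite above replaces the open-coded AIL batch delete with a partition step: completed items are first split into a list that needs AIL updates and a list that only needs its failed state cleared, and each list is then processed in one pass. A minimal userspace sketch of that partition pattern, with invented item fields (an item "needs an AIL update" when the LSN it was flushed at matches the LSN it sits at in the AIL):

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical stand-in for a log item; not the kernel's structure. */
struct item {
	long		li_lsn;		/* LSN the item sits at in the AIL */
	long		flush_lsn;	/* LSN recorded at the last flush */
	struct item	*next;
};

/*
 * Split a chain of flushed items into two chains, mirroring how the new
 * xfs_iflush_done() separates "ail_updates" (items to delete from the AIL)
 * from items that only need their failed state cleared. Items are pushed
 * onto the front of their destination chain.
 */
static void partition(struct item *head, struct item **ail, struct item **flushed)
{
	*ail = *flushed = NULL;
	while (head) {
		struct item *next = head->next;
		struct item **dest =
			(head->li_lsn == head->flush_lsn) ? ail : flushed;
		head->next = *dest;
		*dest = head;
		head = next;
	}
}
```

Handling each class in bulk keeps the AIL lock hold time bounded: the lock only needs to be taken once for the whole "ail" chain rather than per item.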
+17 -7
fs/xfs/xfs_inode_item.h
···
 struct xfs_inode_log_item {
 	struct xfs_log_item	ili_item;	   /* common portion */
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
-	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
-	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_lock_flags;	   /* lock flags */
-	unsigned short		ili_logged;	   /* flushed logged data */
+	unsigned short		ili_lock_flags;	   /* inode lock flags */
+	/*
+	 * The ili_lock protects the interactions between the dirty state and
+	 * the flush state of the inode log item. This allows us to do atomic
+	 * modifications of multiple state fields without having to hold a
+	 * specific inode lock to serialise them.
+	 *
+	 * We need atomic changes between inode dirtying, inode flushing and
+	 * inode completion, but these all hold different combinations of
+	 * ILOCK and iflock and hence we need some other method of serialising
+	 * updates to the flush state.
+	 */
+	spinlock_t		ili_lock;	   /* flush state lock */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	unsigned int		ili_fields;	   /* fields to be logged */
 	unsigned int		ili_fsync_fields;  /* logged since last fsync */
+	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
+	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
 };
 
-static inline int xfs_inode_clean(xfs_inode_t *ip)
+static inline int xfs_inode_clean(struct xfs_inode *ip)
 {
 	return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
 }
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_iflush_done(struct xfs_buf *);
 extern void xfs_iflush_abort(struct xfs_inode *);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
 		struct xfs_inode_log_format *);
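The new `ili_lock` in the hunk above is a small dedicated lock that guards only the flush-state fields, so dirtying, flushing and completion paths can update several fields atomically without holding any of the inode's main locks. A toy userspace model of that pattern, built on a C11 `atomic_flag` spinlock (all names here are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stdatomic.h>

/* Hypothetical flush state: the tiny lock covers only these fields. */
struct flush_state {
	atomic_flag	lock;
	unsigned int	last_fields;	/* fields when flushed */
	unsigned int	fields;		/* fields to be logged */
	unsigned int	fsync_fields;	/* logged since last fsync */
};

static void fs_lock(struct flush_state *fs)
{
	while (atomic_flag_test_and_set(&fs->lock))
		;	/* spin */
}

static void fs_unlock(struct flush_state *fs)
{
	atomic_flag_clear(&fs->lock);
}

/*
 * Abort path: all three fields change together or not at all, as observed
 * by any reader that also takes the lock.
 */
static void flush_abort(struct flush_state *fs)
{
	fs_lock(fs);
	fs->last_fields = 0;
	fs->fields = 0;
	fs->fsync_fields = 0;
	fs_unlock(fs);
}

static int flush_state_clean(struct flush_state *fs)
{
	fs_lock(fs);
	int clean = !fs->last_fields && !fs->fields && !fs->fsync_fields;
	fs_unlock(fs);
	return clean;
}
```

The point of the design is in the struct comment of the hunk: the paths that touch this state hold different combinations of ILOCK and iflock, so none of the existing locks can serialise it; a field-scoped lock can.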
+1 -1
fs/xfs/xfs_inode_item_recover.c
···
 	xfs_dinode_calc_crc(log->l_mp, dip);
 
 	ASSERT(bp->b_mount == mp);
-	bp->b_iodone = xlog_recover_iodone;
+	bp->b_flags |= _XBF_LOGRECOVERY;
 	xfs_buf_delwri_queue(bp, buffer_list);
 
 out_release:
+11 -3
fs/xfs/xfs_ioctl.c
···
 		xflags |= FS_XFLAG_NODUMP;
 	else
 		xflags &= ~FS_XFLAG_NODUMP;
+	if (flags & FS_DAX_FL)
+		xflags |= FS_XFLAG_DAX;
+	else
+		xflags &= ~FS_XFLAG_DAX;
 
 	return xflags;
 }
 
 STATIC unsigned int
 xfs_di2lxflags(
-	uint16_t	di_flags)
+	uint16_t	di_flags,
+	uint64_t	di_flags2)
 {
 	unsigned int	flags = 0;
···
 		flags |= FS_NOATIME_FL;
 	if (di_flags & XFS_DIFLAG_NODUMP)
 		flags |= FS_NODUMP_FL;
+	if (di_flags2 & XFS_DIFLAG2_DAX) {
+		flags |= FS_DAX_FL;
+	}
 	return flags;
 }
···
 {
 	unsigned int		flags;
 
-	flags = xfs_di2lxflags(ip->i_d.di_flags);
+	flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2);
 	if (copy_to_user(arg, &flags, sizeof(flags)))
 		return -EFAULT;
 	return 0;
···
 	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
 		      FS_NOATIME_FL | FS_NODUMP_FL | \
-		      FS_SYNC_FL))
+		      FS_SYNC_FL | FS_DAX_FL))
 		return -EOPNOTSUPP;
 
 	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
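The xfs_ioctl.c hunk wires the DAX flag into both directions of the flag translation between the generic `FS_*_FL` ioctl space and the XFS `FS_XFLAG_*` space. A self-contained round-trip sketch of that mapping; the constant values below are stand-ins for the demo, not the real uapi values:

```c
#include <assert.h>

/* Demo constants standing in for FS_DAX_FL and FS_XFLAG_DAX. */
#define DEMO_FS_DAX_FL		0x02000000u
#define DEMO_FS_XFLAG_DAX	0x00008000u

/* Merge a generic-FL word into an xflags word, like xfs_merge_ioc_xflags():
 * the bit must be both settable and clearable. */
static unsigned int linux_to_xflags(unsigned int flags, unsigned int xflags)
{
	if (flags & DEMO_FS_DAX_FL)
		xflags |= DEMO_FS_XFLAG_DAX;
	else
		xflags &= ~DEMO_FS_XFLAG_DAX;
	return xflags;
}

/* Reverse direction, like xfs_di2lxflags(): report the bit back out. */
static unsigned int xflags_to_linux(unsigned int xflags)
{
	unsigned int flags = 0;

	if (xflags & DEMO_FS_XFLAG_DAX)
		flags |= DEMO_FS_DAX_FL;
	return flags;
}
```

The patch also has to add `FS_DAX_FL` to the SETFLAGS allow-mask; forgetting that step would make the new bit visible via GETFLAGS but rejected with `-EOPNOTSUPP` on the way back in.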
+21 -21
fs/xfs/xfs_iomap.c
···
 
 STATIC bool
 xfs_quota_need_throttle(
-	struct xfs_inode *ip,
-	int type,
-	xfs_fsblock_t alloc_blocks)
+	struct xfs_inode	*ip,
+	xfs_dqtype_t		type,
+	xfs_fsblock_t		alloc_blocks)
 {
-	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
 
 	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
 		return false;
···
 		return false;
 
 	/* under the lo watermark, no throttle */
-	if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+	if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
 		return false;
 
 	return true;
···
 STATIC void
 xfs_quota_calc_throttle(
-	struct xfs_inode *ip,
-	int type,
-	xfs_fsblock_t *qblocks,
-	int *qshift,
-	int64_t *qfreesp)
+	struct xfs_inode	*ip,
+	xfs_dqtype_t		type,
+	xfs_fsblock_t		*qblocks,
+	int			*qshift,
+	int64_t			*qfreesp)
 {
-	int64_t freesp;
-	int shift = 0;
-	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
+	int64_t			freesp;
+	int			shift = 0;
 
 	/* no dq, or over hi wmark, squash the prealloc completely */
-	if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+	if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
 		*qblocks = 0;
 		*qfreesp = 0;
 		return;
 	}
 
-	freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+	freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
 	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
 		shift = 2;
 		if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
···
 	 * Check each quota to cap the prealloc size, provide a shift value to
 	 * throttle with and adjust amount of available space.
 	 */
-	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
 					&freesp);
-	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
 					&freesp);
-	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
 					&freesp);
 
 	/*
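The throttle calculation above scales speculative preallocation down as a dquot approaches its high prealloc watermark: the shift grows as free quota space falls below a ladder of low-space thresholds, and the prealloc is later reduced by `>> shift`. A standalone sketch of that banding logic, using the 5%/3%/1% bands XFS uses (this is an illustration, not the kernel code, which precomputes the bands in `q_low_space[]`):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Compute a throttle shift from the free space remaining under a quota
 * watermark: no throttle above 5% free, then 2 extra bits of right-shift
 * for each band crossed (5%, 3%, 1%).
 */
static int throttle_shift(int64_t freesp, int64_t total)
{
	int shift = 0;

	if (freesp < total * 5 / 100) {
		shift = 2;
		if (freesp < total * 3 / 100)
			shift += 2;
		if (freesp < total * 1 / 100)
			shift += 2;
	}
	return shift;
}
```

A shift of 6 divides the preallocation by 64, so the throttle ramps down steeply as the quota nears exhaustion instead of cutting off all at once.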
-4
fs/xfs/xfs_linux.h
···
 #define xfs_cowb_secs		xfs_params.cowb_timer.val
 
 #define current_cpu()		(raw_smp_processor_id())
-#define current_pid()		(current->pid)
-#define current_test_flags(f)	(current->flags & (f))
 #define current_set_flags_nested(sp, f)		\
 		(*(sp) = current->flags, current->flags |= (f))
-#define current_clear_flags_nested(sp, f)	\
-		(*(sp) = current->flags, current->flags &= ~(f))
 #define current_restore_flags_nested(sp, f)	\
 		(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
 
+3 -6
fs/xfs/xfs_log.c
···
 	XFS_STATS_INC(mp, xs_try_logspace);
 
 	ASSERT(*ticp == NULL);
-	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
+	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
 	*ticp = tic;
 
 	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
···
 	int			unit_bytes,
 	int			cnt,
 	char			client,
-	bool			permanent,
-	xfs_km_flags_t		alloc_flags)
+	bool			permanent)
 {
 	struct xlog_ticket	*tic;
 	int			unit_res;
 
-	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
-	if (!tic)
-		return NULL;
+	tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
 
 	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
 
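The ticket-allocation hunk above trades a fallible allocation plus caller-side NULL checks for a no-fail allocation (`__GFP_NOFAIL`), which lets every caller and the `alloc_flags` parameter disappear. A userspace analogue of that simplification, with invented names:

```c
#include <assert.h>
#include <stdlib.h>

/*
 * No-fail zeroing allocator: either returns zeroed memory or never
 * returns. With this contract, callers need no error path.
 */
static void *zalloc_nofail(size_t size)
{
	void *p = calloc(1, size);

	if (!p)
		abort();	/* honour the no-fail contract */
	return p;
}

/* Hypothetical ticket, shaped loosely after xlog_ticket's counters. */
struct ticket {
	int	t_cnt;
	int	t_unit_res;
};

static struct ticket *ticket_alloc(int cnt, int unit_res)
{
	/* no NULL check needed: allocation cannot fail from our view */
	struct ticket *tic = zalloc_nofail(sizeof(*tic));

	tic->t_cnt = cnt;
	tic->t_unit_res = unit_res;
	return tic;
}
```

In the kernel the same contract is what makes removing the `if (!tic) return NULL;` branch safe: the allocator blocks and retries rather than failing.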
+1 -2
fs/xfs/xfs_log_cil.c
···
 {
 	struct xlog_ticket *tic;
 
-	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
-				KM_NOFS);
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
 
 	/*
 	 * set the current reservation to zero so we know to steal the basic
+1 -3
fs/xfs/xfs_log_priv.h
···
 		int		unit_bytes,
 		int		count,
 		char		client,
-		bool		permanent,
-		xfs_km_flags_t	alloc_flags);
-
+		bool		permanent);
 
 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
+2 -3
fs/xfs/xfs_log_recover.c
···
 	if (bp->b_log_item)
 		xfs_buf_item_relse(bp);
 	ASSERT(bp->b_log_item == NULL);
-
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp);
+	bp->b_flags &= ~_XBF_LOGRECOVERY;
+	xfs_buf_ioend_finish(bp);
 }
 
 /*
+5 -10
fs/xfs/xfs_mount.c
···
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
-		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
···
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		spin_lock_init(&pag->pag_ici_lock);
-		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		if (xfs_buf_hash_init(pag))
 			goto out_free_pag;
···
 out_hash_destroy:
 	xfs_buf_hash_destroy(pag);
 out_free_pag:
-	mutex_destroy(&pag->pag_ici_reclaim_lock);
 	kmem_free(pag);
 out_unwind_new_pags:
 	/* unwind any prior newly initialized pags */
···
 			break;
 		xfs_buf_hash_destroy(pag);
 		xfs_iunlink_destroy(pag);
-		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		kmem_free(pag);
 	}
 	return error;
···
 	 * quota inodes.
 	 */
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
+	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
 out_log_dealloc:
 	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
···
 	xfs_ail_push_all_sync(mp->m_ail);
 
 	/*
-	 * And reclaim all inodes. At this point there should be no dirty
-	 * inodes and none should be pinned or locked, but use synchronous
-	 * reclaim just to be sure. We can stop background inode reclaim
-	 * here as well if it is still running.
+	 * Reclaim all inodes. At this point there should be no dirty inodes and
+	 * none should be pinned or locked. Stop background inode reclaim here
+	 * if it is still running.
 	 */
 	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
+	xfs_reclaim_inodes(mp);
 	xfs_health_unmount(mp);
 
 	xfs_qm_unmount(mp);
-1
fs/xfs/xfs_mount.h
···
 	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
-	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
 	/* buffer cache index */
+89 -100
fs/xfs/xfs_qm.c
···
 STATIC int
 xfs_qm_dquot_walk(
 	struct xfs_mount	*mp,
-	int			type,
+	xfs_dqtype_t		type,
 	int			(*execute)(struct xfs_dquot *dqp, void *data),
 	void			*data)
 {
···
 	for (i = 0; i < nr_found; i++) {
 		struct xfs_dquot *dqp = batch[i];
 
-		next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
+		next_index = dqp->q_id + 1;
 
 		error = execute(batch[i], data);
 		if (error == -EAGAIN) {
···
 	int			error = -EAGAIN;
 
 	xfs_dqlock(dqp);
-	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+	if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0)
 		goto out_unlock;
 
-	dqp->dq_flags |= XFS_DQ_FREEING;
+	dqp->q_flags |= XFS_DQFLAG_FREEING;
 
 	xfs_dqflock(dqp);
···
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 	} else if (error == -EAGAIN) {
+		dqp->q_flags &= ~XFS_DQFLAG_FREEING;
 		goto out_unlock;
 	}
 	xfs_dqflock(dqp);
···
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
 
-	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
-			  be32_to_cpu(dqp->q_core.d_id));
+	radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
 	qi->qi_dquots--;
 
 	/*
···
 	uint			flags)
 {
 	if (flags & XFS_QMOPT_UQUOTA)
-		xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+		xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
 	if (flags & XFS_QMOPT_GQUOTA)
-		xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
+		xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
 	if (flags & XFS_QMOPT_PQUOTA)
-		xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
+		xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
 }
 
 /*
···
 xfs_qm_dqattach_one(
 	struct xfs_inode	*ip,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	bool			doalloc,
 	struct xfs_dquot	**IO_idqpp)
 {
···
 
 	if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
 		error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)),
-				XFS_DQ_USER, doalloc, &ip->i_udquot);
+				XFS_DQTYPE_USER, doalloc, &ip->i_udquot);
 		if (error)
 			goto done;
 		ASSERT(ip->i_udquot);
···
 
 	if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
 		error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)),
-				XFS_DQ_GROUP, doalloc, &ip->i_gdquot);
+				XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot);
 		if (error)
 			goto done;
 		ASSERT(ip->i_gdquot);
 	}
 
 	if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
-		error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
+		error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ,
 				doalloc, &ip->i_pdquot);
 		if (error)
 			goto done;
···
 	/*
 	 * Prevent lookups now that we are past the point of no return.
 	 */
-	dqp->dq_flags |= XFS_DQ_FREEING;
+	dqp->q_flags |= XFS_DQFLAG_FREEING;
 	xfs_dqunlock(dqp);
 
 	ASSERT(dqp->q_nrefs == 0);
···
 STATIC void
 xfs_qm_set_defquota(
 	struct xfs_mount	*mp,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct xfs_quotainfo	*qinf)
 {
 	struct xfs_dquot	*dqp;
 	struct xfs_def_quota	*defq;
-	struct xfs_disk_dquot	*ddqp;
 	int			error;
 
 	error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
 	if (error)
 		return;
 
-	ddqp = &dqp->q_core;
 	defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp));
 
 	/*
 	 * Timers and warnings have been already set, let's just set the
 	 * default limits for this quota type
 	 */
-	defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
-	defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
-	defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
-	defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
-	defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
-	defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+	defq->blk.hard = dqp->q_blk.hardlimit;
+	defq->blk.soft = dqp->q_blk.softlimit;
+	defq->ino.hard = dqp->q_ino.hardlimit;
+	defq->ino.soft = dqp->q_ino.softlimit;
+	defq->rtb.hard = dqp->q_rtb.hardlimit;
+	defq->rtb.soft = dqp->q_rtb.softlimit;
 	xfs_qm_dqdestroy(dqp);
 }
 
···
 static void
 xfs_qm_init_timelimits(
 	struct xfs_mount	*mp,
-	uint			type)
+	xfs_dqtype_t		type)
 {
 	struct xfs_quotainfo	*qinf = mp->m_quotainfo;
 	struct xfs_def_quota	*defq;
-	struct xfs_disk_dquot	*ddqp;
 	struct xfs_dquot	*dqp;
 	int			error;
 
 	defq = xfs_get_defquota(qinf, type);
 
-	defq->btimelimit = XFS_QM_BTIMELIMIT;
-	defq->itimelimit = XFS_QM_ITIMELIMIT;
-	defq->rtbtimelimit = XFS_QM_RTBTIMELIMIT;
-	defq->bwarnlimit = XFS_QM_BWARNLIMIT;
-	defq->iwarnlimit = XFS_QM_IWARNLIMIT;
-	defq->rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+	defq->blk.time = XFS_QM_BTIMELIMIT;
+	defq->ino.time = XFS_QM_ITIMELIMIT;
+	defq->rtb.time = XFS_QM_RTBTIMELIMIT;
+	defq->blk.warn = XFS_QM_BWARNLIMIT;
+	defq->ino.warn = XFS_QM_IWARNLIMIT;
+	defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
 
 	/*
 	 * We try to get the limits from the superuser's limits fields.
···
 	if (error)
 		return;
 
-	ddqp = &dqp->q_core;
-
 	/*
 	 * The warnings and timers set the grace period given to
 	 * a user or group before he or she can not perform any
 	 * more writing. If it is zero, a default is used.
 	 */
-	if (ddqp->d_btimer)
-		defq->btimelimit = be32_to_cpu(ddqp->d_btimer);
-	if (ddqp->d_itimer)
-		defq->itimelimit = be32_to_cpu(ddqp->d_itimer);
-	if (ddqp->d_rtbtimer)
-		defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
-	if (ddqp->d_bwarns)
-		defq->bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
-	if (ddqp->d_iwarns)
-		defq->iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
-	if (ddqp->d_rtbwarns)
-		defq->rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+	if (dqp->q_blk.timer)
+		defq->blk.time = dqp->q_blk.timer;
+	if (dqp->q_ino.timer)
+		defq->ino.time = dqp->q_ino.timer;
+	if (dqp->q_rtb.timer)
+		defq->rtb.time = dqp->q_rtb.timer;
+	if (dqp->q_blk.warnings)
+		defq->blk.warn = dqp->q_blk.warnings;
+	if (dqp->q_ino.warnings)
+		defq->ino.warn = dqp->q_ino.warnings;
+	if (dqp->q_rtb.warnings)
+		defq->rtb.warn = dqp->q_rtb.warnings;
 
 	xfs_qm_dqdestroy(dqp);
 }
···
 
 	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
 
-	xfs_qm_init_timelimits(mp, XFS_DQ_USER);
-	xfs_qm_init_timelimits(mp, XFS_DQ_GROUP);
-	xfs_qm_init_timelimits(mp, XFS_DQ_PROJ);
+	xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER);
+	xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP);
+	xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
 
 	if (XFS_IS_UQUOTA_RUNNING(mp))
-		xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
+		xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf);
 	if (XFS_IS_GQUOTA_RUNNING(mp))
-		xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
+		xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf);
 	if (XFS_IS_PQUOTA_RUNNING(mp))
-		xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
+		xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
 
 	qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
 	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
···
 
 STATIC void
 xfs_qm_reset_dqcounts(
-	xfs_mount_t		*mp,
-	xfs_buf_t		*bp,
-	xfs_dqid_t		id,
-	uint			type)
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	xfs_dqid_t		id,
+	xfs_dqtype_t		type)
 {
 	struct xfs_dqblk	*dqb;
 	int			j;
-	xfs_failaddr_t		fa;
 
 	trace_xfs_reset_dqcounts(bp, _RET_IP_);
 
···
 		 * find uninitialised dquot blks. See comment in
 		 * xfs_dquot_verify.
 		 */
-		fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
-		if (fa)
+		if (xfs_dqblk_verify(mp, &dqb[j], id + j) ||
+		    (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type)
 			xfs_dqblk_repair(mp, &dqb[j], id + j, type);
 
 		/*
 		 * Reset type in case we are reusing group quota file for
 		 * project quotas or vice versa
 		 */
-		ddq->d_flags = type;
+		ddq->d_type = type;
 		ddq->d_bcount = 0;
 		ddq->d_icount = 0;
 		ddq->d_rtbcount = 0;
···
 	xfs_dqid_t		firstid,
 	xfs_fsblock_t		bno,
 	xfs_filblks_t		blkcnt,
-	uint			flags,
+	xfs_dqtype_t		type,
 	struct list_head	*buffer_list)
 {
 	struct xfs_buf		*bp;
-	int			error;
-	int			type;
+	int			error = 0;
 
 	ASSERT(blkcnt > 0);
-	type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
-		(flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
-	error = 0;
 
 	/*
 	 * Blkcnt arg can be a very big number, and might even be
···
 xfs_qm_reset_dqcounts_buf(
 	struct xfs_mount	*mp,
 	struct xfs_inode	*qip,
-	uint			flags,
+	xfs_dqtype_t		type,
 	struct list_head	*buffer_list)
 {
 	struct xfs_bmbt_irec	*map;
···
 			error = xfs_qm_reset_dqcounts_all(mp, firstid,
 						map[i].br_startblock,
 						map[i].br_blockcount,
-						flags, buffer_list);
+						type, buffer_list);
 			if (error)
 				goto out;
 	}
···
 STATIC int
 xfs_qm_quotacheck_dqadjust(
 	struct xfs_inode	*ip,
-	uint			type,
+	xfs_dqtype_t		type,
 	xfs_qcnt_t		nblks,
 	xfs_qcnt_t		rtblks)
 {
···
 	 * Adjust the inode count and the block count to reflect this inode's
 	 * resource usage.
 	 */
-	be64_add_cpu(&dqp->q_core.d_icount, 1);
-	dqp->q_res_icount++;
+	dqp->q_ino.count++;
+	dqp->q_ino.reserved++;
 	if (nblks) {
-		be64_add_cpu(&dqp->q_core.d_bcount, nblks);
-		dqp->q_res_bcount += nblks;
+		dqp->q_blk.count += nblks;
+		dqp->q_blk.reserved += nblks;
 	}
 	if (rtblks) {
-		be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
-		dqp->q_res_rtbcount += rtblks;
+		dqp->q_rtb.count += rtblks;
+		dqp->q_rtb.reserved += rtblks;
 	}
 
 	/*
···
 	 *
 	 * There are no timers for the default values set in the root dquot.
 	 */
-	if (dqp->q_core.d_id) {
-		xfs_qm_adjust_dqlimits(mp, dqp);
-		xfs_qm_adjust_dqtimers(mp, dqp);
+	if (dqp->q_id) {
+		xfs_qm_adjust_dqlimits(dqp);
+		xfs_qm_adjust_dqtimers(dqp);
 	}
 
-	dqp->dq_flags |= XFS_DQ_DIRTY;
+	dqp->q_flags |= XFS_DQFLAG_DIRTY;
 	xfs_qm_dqput(dqp);
 	return 0;
 }
···
 	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
 	 */
 	if (XFS_IS_UQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks,
 				rtblks);
 		if (error)
 			goto error0;
 	}
 
 	if (XFS_IS_GQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks,
 				rtblks);
 		if (error)
 			goto error0;
 	}
 
 	if (XFS_IS_PQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks,
 				rtblks);
 		if (error)
 			goto error0;
···
 	int			error = 0;
 
 	xfs_dqlock(dqp);
-	if (dqp->dq_flags & XFS_DQ_FREEING)
+	if (dqp->q_flags & XFS_DQFLAG_FREEING)
 		goto out_unlock;
 	if (!XFS_DQ_IS_DIRTY(dqp))
 		goto out_unlock;
···
 	 * We don't log our changes till later.
 	 */
 	if (uip) {
-		error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
+		error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER,
 					 &buffer_list);
 		if (error)
 			goto error_return;
···
 	}
 
 	if (gip) {
-		error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
+		error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP,
 					 &buffer_list);
 		if (error)
 			goto error_return;
···
 	}
 
 	if (pip) {
-		error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
+		error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ,
 					 &buffer_list);
 		if (error)
 			goto error_return;
···
 	 * down to disk buffers if everything was updated successfully.
 	 */
 	if (XFS_IS_UQUOTA_ON(mp)) {
-		error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+		error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one,
 					  &buffer_list);
 	}
 	if (XFS_IS_GQUOTA_ON(mp)) {
-		error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+		error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one,
 					   &buffer_list);
 		if (!error)
 			error = error2;
 	}
 	if (XFS_IS_PQUOTA_ON(mp)) {
-		error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+		error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one,
 					   &buffer_list);
 		if (!error)
 			error = error2;
···
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
 
 	mutex_lock(&qi->qi_tree_lock);
-	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
-			  be32_to_cpu(dqp->q_core.d_id));
+	radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
 
 	qi->qi_dquots--;
 	mutex_unlock(&qi->qi_tree_lock);
···
 		 */
 		xfs_iunlock(ip, lockflags);
 		error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
-				XFS_DQ_USER, true, &uq);
+				XFS_DQTYPE_USER, true, &uq);
 		if (error) {
 			ASSERT(error != -ENOENT);
 			return error;
···
 		if (!gid_eq(inode->i_gid, gid)) {
 			xfs_iunlock(ip, lockflags);
 			error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
-					XFS_DQ_GROUP, true, &gq);
+					XFS_DQTYPE_GROUP, true, &gq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				goto error_rele;
···
 	if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
 		if (ip->i_d.di_projid != prid) {
 			xfs_iunlock(ip, lockflags);
-			error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
-					true, &pq);
+			error = xfs_qm_dqget(mp, (xfs_dqid_t)prid,
+					XFS_DQTYPE_PROJ, true, &pq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				goto error_rele;
···
 			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
 	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-	    i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) {
+	    i_uid_read(VFS_I(ip)) != udqp->q_id) {
 		udq_delblks = udqp;
 		/*
 		 * If there are delayed allocation blocks, then we have to
···
 		}
 	}
 	if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
-	    i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) {
+	    i_gid_read(VFS_I(ip)) != gdqp->q_id) {
 		gdq_delblks = gdqp;
 		if (delblks) {
 			ASSERT(ip->i_gdquot);
···
 	}
 
 	if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
-	    ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) {
+	    ip->i_d.di_projid != pdqp->q_id) {
 		pdq_delblks = pdqp;
 		if (delblks) {
 			ASSERT(ip->i_pdquot);
···
 
 	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
 		ASSERT(ip->i_udquot == NULL);
-		ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id));
+		ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
 
 		ip->i_udquot = xfs_qm_dqhold(udqp);
 		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
 	if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
 		ASSERT(ip->i_gdquot == NULL);
-		ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id));
+		ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
 
 		ip->i_gdquot = xfs_qm_dqhold(gdqp);
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
 	if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
 		ASSERT(ip->i_pdquot == NULL);
-		ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id));
+		ASSERT(ip->i_d.di_projid == pdqp->q_id);
 
 		ip->i_pdquot = xfs_qm_dqhold(pdqp);
 		xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
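A recurring move in the xfs_qm.c hunks is dropping arithmetic on big-endian on-disk fields (`be64_add_cpu(&dqp->q_core.d_bcount, ...)`, `be32_to_cpu(...d_id)`) in favour of incore counters kept in CPU byte order, converting only at the disk boundary. A portable userspace sketch of that boundary-conversion idea, with invented names:

```c
#include <assert.h>
#include <stdint.h>

/* Decode a big-endian 64-bit value from raw disk bytes, portably. */
static uint64_t be64_decode(const unsigned char b[8])
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | b[i];
	return v;
}

/* Hypothetical incore resource counter pair, shaped after the new
 * q_blk/q_ino/q_rtb fields (count + reserved, both native-endian). */
struct incore_res {
	uint64_t	count;
	uint64_t	reserved;
};

static void res_from_disk(struct incore_res *res, const unsigned char disk[8])
{
	res->count = be64_decode(disk);	/* one conversion at read time... */
	res->reserved = res->count;
}

static void res_adjust(struct incore_res *res, uint64_t nblks)
{
	res->count += nblks;		/* ...then plain native arithmetic */
	res->reserved += nblks;
}
```

Doing the byte-swap once at read/write time instead of on every counter update both simplifies the hot paths and removes a whole class of missed-conversion bugs.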
+43 -61
fs/xfs/xfs_qm.h
···
 #define XFS_DQITER_MAP_SIZE	10
 
 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-	!dqp->q_core.d_blk_hardlimit && \
-	!dqp->q_core.d_blk_softlimit && \
-	!dqp->q_core.d_rtb_hardlimit && \
-	!dqp->q_core.d_rtb_softlimit && \
-	!dqp->q_core.d_ino_hardlimit && \
-	!dqp->q_core.d_ino_softlimit && \
-	!dqp->q_core.d_bcount && \
-	!dqp->q_core.d_rtbcount && \
-	!dqp->q_core.d_icount)
+	!dqp->q_blk.hardlimit && \
+	!dqp->q_blk.softlimit && \
+	!dqp->q_rtb.hardlimit && \
+	!dqp->q_rtb.softlimit && \
+	!dqp->q_ino.hardlimit && \
+	!dqp->q_ino.softlimit && \
+	!dqp->q_blk.count && \
+	!dqp->q_rtb.count && \
+	!dqp->q_ino.count)
 
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB	(xfs_filblks_t)1
+struct xfs_quota_limits {
+	xfs_qcnt_t		hard;	/* default hard limit */
+	xfs_qcnt_t		soft;	/* default soft limit */
+	time64_t		time;	/* limit for timers */
+	xfs_qwarncnt_t		warn;	/* limit for warnings */
+};
 
 /* Defaults for each quota type: time limits, warn limits, usage limits */
 struct xfs_def_quota {
-	time64_t	btimelimit;	/* limit for blks timer */
-	time64_t	itimelimit;	/* limit for inodes timer */
-	time64_t	rtbtimelimit;	/* limit for rt blks timer */
-	xfs_qwarncnt_t	bwarnlimit;	/* limit for blks warnings */
-	xfs_qwarncnt_t	iwarnlimit;	/* limit for inodes warnings */
-	xfs_qwarncnt_t	rtbwarnlimit;	/* limit for rt blks warnings */
-	xfs_qcnt_t	bhardlimit;	/* default data blk hard limit */
-	xfs_qcnt_t	bsoftlimit;	/* default data blk soft limit */
-	xfs_qcnt_t	ihardlimit;	/* default inode count hard limit */
-	xfs_qcnt_t	isoftlimit;	/* default inode count soft limit */
-	xfs_qcnt_t	rtbhardlimit;	/* default realtime blk hard limit */
-	xfs_qcnt_t	rtbsoftlimit;	/* default realtime blk soft limit */
+	struct xfs_quota_limits	blk;
+	struct xfs_quota_limits	ino;
+	struct xfs_quota_limits	rtb;
 };
 
 /*
···
 static inline struct radix_tree_root *
 xfs_dquot_tree(
 	struct xfs_quotainfo	*qi,
-	int			type)
+	xfs_dqtype_t		type)
 {
 	switch (type) {
-	case XFS_DQ_USER:
+	case XFS_DQTYPE_USER:
 		return &qi->qi_uquota_tree;
-	case XFS_DQ_GROUP:
+	case XFS_DQTYPE_GROUP:
 		return &qi->qi_gquota_tree;
-	case XFS_DQ_PROJ:
+	case XFS_DQTYPE_PROJ:
 		return &qi->qi_pquota_tree;
 	default:
 		ASSERT(0);
···
 }
 
 static inline struct xfs_inode *
-xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
+xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
 {
-	switch (dq_flags & XFS_DQ_ALLTYPES) {
-	case XFS_DQ_USER:
+	switch (type) {
+	case XFS_DQTYPE_USER:
 		return mp->m_quotainfo->qi_uquotaip;
-	case XFS_DQ_GROUP:
+	case XFS_DQTYPE_GROUP:
 		return mp->m_quotainfo->qi_gquotaip;
-	case XFS_DQ_PROJ:
+	case XFS_DQTYPE_PROJ:
 		return mp->m_quotainfo->qi_pquotaip;
 	default:
 		ASSERT(0);
 	}
 	return NULL;
-}
-
-static inline int
-xfs_dquot_type(struct xfs_dquot *dqp)
-{
-	if (XFS_QM_ISUDQ(dqp))
-		return XFS_DQ_USER;
-	if (XFS_QM_ISGDQ(dqp))
-		return XFS_DQ_GROUP;
-	ASSERT(XFS_QM_ISPDQ(dqp));
-	return XFS_DQ_PROJ;
 }
 
 extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
···
 
 /* quota ops */
 extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
-					uint, struct qc_dqblk *);
-extern int xfs_qm_scall_getquota_next(struct xfs_mount *,
-					xfs_dqid_t *, uint, struct qc_dqblk *);
-extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
-					struct qc_dqblk *);
+extern int xfs_qm_scall_getquota(struct xfs_mount *mp,
+					xfs_dqid_t id,
+					xfs_dqtype_t type,
+					struct qc_dqblk *dst);
+extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp,
+					xfs_dqid_t *id,
+					xfs_dqtype_t type,
+					struct qc_dqblk *dst);
+extern int xfs_qm_scall_setqlim(struct xfs_mount *mp,
+					xfs_dqid_t id,
+					xfs_dqtype_t type,
+					struct qc_dqblk *newlim);
 extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
 extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
 
 static inline struct xfs_def_quota *
-xfs_get_defquota(struct xfs_quotainfo *qi, int type)
+xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
 {
 	switch (type) {
-	case XFS_DQ_USER:
+	case XFS_DQTYPE_USER:
 		return &qi->qi_usr_default;
-	case XFS_DQ_GROUP:
+	case XFS_DQTYPE_GROUP:
 		return &qi->qi_grp_default;
-	case XFS_DQ_PROJ:
+	case XFS_DQTYPE_PROJ:
 		return &qi->qi_prj_default;
 	default:
 		ASSERT(0);
+11 -11
fs/xfs/xfs_qm_bhv.c
···
 {
 	uint64_t		limit;
 
-	limit = dqp->q_core.d_blk_softlimit ?
-		be64_to_cpu(dqp->q_core.d_blk_softlimit) :
-		be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+	limit = dqp->q_blk.softlimit ?
+		dqp->q_blk.softlimit :
+		dqp->q_blk.hardlimit;
 	if (limit && statp->f_blocks > limit) {
 		statp->f_blocks = limit;
 		statp->f_bfree = statp->f_bavail =
-			(statp->f_blocks > dqp->q_res_bcount) ?
-			(statp->f_blocks - dqp->q_res_bcount) : 0;
+			(statp->f_blocks > dqp->q_blk.reserved) ?
+			(statp->f_blocks - dqp->q_blk.reserved) : 0;
 	}
 
-	limit = dqp->q_core.d_ino_softlimit ?
-		be64_to_cpu(dqp->q_core.d_ino_softlimit) :
-		be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+	limit = dqp->q_ino.softlimit ?
+		dqp->q_ino.softlimit :
+		dqp->q_ino.hardlimit;
 	if (limit && statp->f_files > limit) {
 		statp->f_files = limit;
 		statp->f_ffree =
-			(statp->f_files > dqp->q_res_icount) ?
-			(statp->f_files - dqp->q_res_icount) : 0;
+			(statp->f_files > dqp->q_ino.reserved) ?
+			(statp->f_files - dqp->q_ino.reserved) : 0;
 	}
 }
 
···
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_dquot	*dqp;
 
-	if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) {
+	if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
 		xfs_fill_statvfs_from_dquot(statp, dqp);
 		xfs_qm_dqput(dqp);
 	}
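The hunk above only changes which fields are read; the clamping rule itself is unchanged: prefer the soft limit when one is set, fall back to the hard limit, then cap the statfs totals and recompute the free count from the dquot's reserved count. A small userspace sketch of that rule (function and parameter names are illustrative, not kernel symbols):

```c
#include <assert.h>
#include <stdint.h>

/* Sketch of the clamping done by xfs_fill_statvfs_from_dquot() for one
 * resource: a limit of 0 means "no limit", soft takes precedence over
 * hard, and the free count can never go negative. */
static void quota_clamp_statfs(uint64_t *total, uint64_t *avail,
			       uint64_t softlimit, uint64_t hardlimit,
			       uint64_t reserved)
{
	uint64_t limit = softlimit ? softlimit : hardlimit;

	if (limit && *total > limit) {
		*total = limit;
		*avail = (*total > reserved) ? *total - reserved : 0;
	}
}
```

The same function applies twice in the kernel code, once for blocks and once for inodes.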
+138 -130
fs/xfs/xfs_qm_syscalls.c
···
 	int		error = -EINVAL;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
-	    (flags & ~XFS_DQ_ALLTYPES)) {
+	    (flags & ~XFS_QMOPT_QUOTALL)) {
 		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
 			__func__, flags, mp->m_qflags);
 		return -EINVAL;
 	}
 
-	if (flags & XFS_DQ_USER) {
+	if (flags & XFS_QMOPT_UQUOTA) {
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
 		if (error)
 			return error;
 	}
-	if (flags & XFS_DQ_GROUP) {
+	if (flags & XFS_QMOPT_GQUOTA) {
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
 		if (error)
 			return error;
 	}
-	if (flags & XFS_DQ_PROJ)
+	if (flags & XFS_QMOPT_PQUOTA)
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
 
 	return error;
···
 	(QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
 
 /*
+ * Adjust limits of this quota, and the defaults if passed in.  Returns true
+ * if the new limits made sense and were applied, false otherwise.
+ */
+static inline bool
+xfs_setqlim_limits(
+	struct xfs_mount	*mp,
+	struct xfs_dquot_res	*res,
+	struct xfs_quota_limits	*qlim,
+	xfs_qcnt_t		hard,
+	xfs_qcnt_t		soft,
+	const char		*tag)
+{
+	/* The hard limit can't be less than the soft limit. */
+	if (hard != 0 && hard < soft) {
+		xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag,
+				soft);
+		return false;
+	}
+
+	res->hardlimit = hard;
+	res->softlimit = soft;
+	if (qlim) {
+		qlim->hard = hard;
+		qlim->soft = soft;
+	}
+
+	return true;
+}
+
+static inline void
+xfs_setqlim_warns(
+	struct xfs_dquot_res	*res,
+	struct xfs_quota_limits	*qlim,
+	int			warns)
+{
+	res->warnings = warns;
+	if (qlim)
+		qlim->warn = warns;
+}
+
+static inline void
+xfs_setqlim_timer(
+	struct xfs_dquot_res	*res,
+	struct xfs_quota_limits	*qlim,
+	s64			timer)
+{
+	res->timer = timer;
+	if (qlim)
+		qlim->time = timer;
+}
+
+/*
  * Adjust quota limits, and start/stop timers accordingly.
  */
 int
 xfs_qm_scall_setqlim(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct qc_dqblk		*newlim)
 {
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
-	struct xfs_disk_dquot	*ddq;
 	struct xfs_dquot	*dqp;
 	struct xfs_trans	*tp;
 	struct xfs_def_quota	*defq;
+	struct xfs_dquot_res	*res;
+	struct xfs_quota_limits	*qlim;
 	int			error;
 	xfs_qcnt_t		hard, soft;
···
 
 	xfs_dqlock(dqp);
 	xfs_trans_dqjoin(tp, dqp);
-	ddq = &dqp->q_core;
 
 	/*
+	 * Update quota limits, warnings, and timers, and the defaults
+	 * if we're touching id == 0.
+	 *
 	 * Make sure that hardlimits are >= soft limits before changing.
-	 */
-	hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
-		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
-			be64_to_cpu(ddq->d_blk_hardlimit);
-	soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
-		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
-			be64_to_cpu(ddq->d_blk_softlimit);
-	if (hard == 0 || hard >= soft) {
-		ddq->d_blk_hardlimit = cpu_to_be64(hard);
-		ddq->d_blk_softlimit = cpu_to_be64(soft);
-		xfs_dquot_set_prealloc_limits(dqp);
-		if (id == 0) {
-			defq->bhardlimit = hard;
-			defq->bsoftlimit = soft;
-		}
-	} else {
-		xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
-	}
-	hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
-		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
-			be64_to_cpu(ddq->d_rtb_hardlimit);
-	soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
-		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
-			be64_to_cpu(ddq->d_rtb_softlimit);
-	if (hard == 0 || hard >= soft) {
-		ddq->d_rtb_hardlimit = cpu_to_be64(hard);
-		ddq->d_rtb_softlimit = cpu_to_be64(soft);
-		if (id == 0) {
-			defq->rtbhardlimit = hard;
-			defq->rtbsoftlimit = soft;
-		}
-	} else {
-		xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
-	}
-
-	hard = (newlim->d_fieldmask & QC_INO_HARD) ?
-		(xfs_qcnt_t) newlim->d_ino_hardlimit :
-			be64_to_cpu(ddq->d_ino_hardlimit);
-	soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
-		(xfs_qcnt_t) newlim->d_ino_softlimit :
-			be64_to_cpu(ddq->d_ino_softlimit);
-	if (hard == 0 || hard >= soft) {
-		ddq->d_ino_hardlimit = cpu_to_be64(hard);
-		ddq->d_ino_softlimit = cpu_to_be64(soft);
-		if (id == 0) {
-			defq->ihardlimit = hard;
-			defq->isoftlimit = soft;
-		}
-	} else {
-		xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
-	}
-
-	/*
-	 * Update warnings counter(s) if requested
-	 */
-	if (newlim->d_fieldmask & QC_SPC_WARNS)
-		ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
-	if (newlim->d_fieldmask & QC_INO_WARNS)
-		ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
-	if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-		ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
-
-	if (id == 0) {
-		if (newlim->d_fieldmask & QC_SPC_WARNS)
-			defq->bwarnlimit = newlim->d_spc_warns;
-		if (newlim->d_fieldmask & QC_INO_WARNS)
-			defq->iwarnlimit = newlim->d_ino_warns;
-		if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-			defq->rtbwarnlimit = newlim->d_rt_spc_warns;
-	}
-
-	/*
+	 *
+	 * Update warnings counter(s) if requested.
+	 *
 	 * Timelimits for the super user set the relative time the other users
 	 * can be over quota for this file system. If it is zero a default is
 	 * used. Ditto for the default soft and hard limit values (already
···
 	 * For other IDs, userspace can bump out the grace period if over
 	 * the soft limit.
 	 */
-	if (newlim->d_fieldmask & QC_SPC_TIMER)
-		ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
-	if (newlim->d_fieldmask & QC_INO_TIMER)
-		ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
-	if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
-		ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
 
-	if (id == 0) {
-		if (newlim->d_fieldmask & QC_SPC_TIMER)
-			defq->btimelimit = newlim->d_spc_timer;
-		if (newlim->d_fieldmask & QC_INO_TIMER)
-			defq->itimelimit = newlim->d_ino_timer;
-		if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
-			defq->rtbtimelimit = newlim->d_rt_spc_timer;
-	}
+	/* Blocks on the data device. */
+	hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
+			dqp->q_blk.hardlimit;
+	soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
+			dqp->q_blk.softlimit;
+	res = &dqp->q_blk;
+	qlim = id == 0 ? &defq->blk : NULL;
+
+	if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
+		xfs_dquot_set_prealloc_limits(dqp);
+	if (newlim->d_fieldmask & QC_SPC_WARNS)
+		xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
+	if (newlim->d_fieldmask & QC_SPC_TIMER)
+		xfs_setqlim_timer(res, qlim, newlim->d_spc_timer);
+
+	/* Blocks on the realtime device. */
+	hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
+			dqp->q_rtb.hardlimit;
+	soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
+		(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
+			dqp->q_rtb.softlimit;
+	res = &dqp->q_rtb;
+	qlim = id == 0 ? &defq->rtb : NULL;
+
+	xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
+	if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+		xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
+	if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
+		xfs_setqlim_timer(res, qlim, newlim->d_rt_spc_timer);
+
+	/* Inodes */
+	hard = (newlim->d_fieldmask & QC_INO_HARD) ?
+		(xfs_qcnt_t) newlim->d_ino_hardlimit :
+			dqp->q_ino.hardlimit;
+	soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
+		(xfs_qcnt_t) newlim->d_ino_softlimit :
+			dqp->q_ino.softlimit;
+	res = &dqp->q_ino;
+	qlim = id == 0 ? &defq->ino : NULL;
+
+	xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
+	if (newlim->d_fieldmask & QC_INO_WARNS)
+		xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
+	if (newlim->d_fieldmask & QC_INO_TIMER)
+		xfs_setqlim_timer(res, qlim, newlim->d_ino_timer);
 
 	if (id != 0) {
 		/*
···
 		 * is on or off. We don't really want to bother with iterating
 		 * over all ondisk dquots and turning the timers on/off.
 		 */
-		xfs_qm_adjust_dqtimers(mp, dqp);
+		xfs_qm_adjust_dqtimers(dqp);
 	}
-	dqp->dq_flags |= XFS_DQ_DIRTY;
+	dqp->q_flags |= XFS_DQFLAG_DIRTY;
 	xfs_trans_log_dquot(tp, dqp);
 
 	error = xfs_trans_commit(tp);
···
 static void
 xfs_qm_scall_getquota_fill_qc(
 	struct xfs_mount	*mp,
-	uint			type,
+	xfs_dqtype_t		type,
 	const struct xfs_dquot	*dqp,
 	struct qc_dqblk		*dst)
 {
 	memset(dst, 0, sizeof(*dst));
-	dst->d_spc_hardlimit =
-		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
-	dst->d_spc_softlimit =
-		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
-	dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-	dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-	dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
-	dst->d_ino_count = dqp->q_res_icount;
-	dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
-	dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
-	dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
-	dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
-	dst->d_rt_spc_hardlimit =
-		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
-	dst->d_rt_spc_softlimit =
-		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
-	dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
-	dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-	dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+	dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit);
+	dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit);
+	dst->d_ino_hardlimit = dqp->q_ino.hardlimit;
+	dst->d_ino_softlimit = dqp->q_ino.softlimit;
+	dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved);
+	dst->d_ino_count = dqp->q_ino.reserved;
+	dst->d_spc_timer = dqp->q_blk.timer;
+	dst->d_ino_timer = dqp->q_ino.timer;
+	dst->d_ino_warns = dqp->q_ino.warnings;
+	dst->d_spc_warns = dqp->q_blk.warnings;
+	dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
+	dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
+	dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
+	dst->d_rt_spc_timer = dqp->q_rtb.timer;
+	dst->d_rt_spc_warns = dqp->q_rtb.warnings;
 
 	/*
 	 * Internally, we don't reset all the timers when quota enforcement
 	 * gets turned off. No need to confuse the user level code,
 	 * so return zeroes in that case.
 	 */
-	if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
-	     dqp->q_core.d_flags == XFS_DQ_USER) ||
-	    (!XFS_IS_GQUOTA_ENFORCED(mp) &&
-	     dqp->q_core.d_flags == XFS_DQ_GROUP) ||
-	    (!XFS_IS_PQUOTA_ENFORCED(mp) &&
-	     dqp->q_core.d_flags == XFS_DQ_PROJ)) {
+	if (!xfs_dquot_is_enforced(dqp)) {
 		dst->d_spc_timer = 0;
 		dst->d_ino_timer = 0;
 		dst->d_rt_spc_timer = 0;
 	}
 
 #ifdef DEBUG
-	if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
-	     (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
-	     (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
-	    dqp->q_core.d_id != 0) {
+	if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) {
 		if ((dst->d_space > dst->d_spc_softlimit) &&
 		    (dst->d_spc_softlimit > 0)) {
 			ASSERT(dst->d_spc_timer != 0);
 		}
-		if ((dst->d_ino_count > dst->d_ino_softlimit) &&
-		    (dst->d_ino_softlimit > 0)) {
+		if ((dst->d_ino_count > dqp->q_ino.softlimit) &&
+		    (dqp->q_ino.softlimit > 0)) {
 			ASSERT(dst->d_ino_timer != 0);
 		}
 	}
···
 xfs_qm_scall_getquota(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct qc_dqblk		*dst)
 {
 	struct xfs_dquot	*dqp;
···
 xfs_qm_scall_getquota_next(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		*id,
-	uint			type,
+	xfs_dqtype_t		type,
 	struct qc_dqblk		*dst)
 {
 	struct xfs_dquot	*dqp;
···
 		return error;
 
 	/* Fill in the ID we actually read from disk */
-	*id = be32_to_cpu(dqp->q_core.d_id);
+	*id = dqp->q_id;
 
 	xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
 
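The xfs_setqlim_limits() helper introduced above replaces three copies of the same validate-and-apply sequence. A userspace sketch of the pattern (types simplified to plain integers, struct and function names illustrative rather than kernel symbols):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Stand-ins for the kernel's xfs_dquot_res and xfs_quota_limits. */
struct res_limits { uint64_t hardlimit, softlimit; };
struct default_limits { uint64_t hard, soft; };

/* Validate that a nonzero hard limit is not below the soft limit,
 * apply both to the in-core resource, and mirror into the defaults
 * only when a qlim pointer (the id == 0 case) is passed. */
static bool setqlim_limits(struct res_limits *res,
			   struct default_limits *qlim,
			   uint64_t hard, uint64_t soft)
{
	/* The hard limit can't be less than the soft limit. */
	if (hard != 0 && hard < soft)
		return false;

	res->hardlimit = hard;
	res->softlimit = soft;
	if (qlim) {		/* only id == 0 updates the defaults */
		qlim->hard = hard;
		qlim->soft = soft;
	}
	return true;
}
```

The `qlim = id == 0 ? &defq->blk : NULL;` lines in the diff are exactly this switch: non-root IDs pass NULL and leave the filesystem defaults alone.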
+14 -5
fs/xfs/xfs_quota.h
···
 */
 
 struct xfs_trans;
+struct xfs_buf;
 
 /*
  * This check is done typically without holding the inode lock;
···
 
 static inline uint
 xfs_quota_chkd_flag(
-	uint		dqtype)
+	xfs_dqtype_t		type)
 {
-	switch (dqtype) {
-	case XFS_DQ_USER:
+	switch (type) {
+	case XFS_DQTYPE_USER:
 		return XFS_UQUOTA_CHKD;
-	case XFS_DQ_GROUP:
+	case XFS_DQTYPE_GROUP:
 		return XFS_GQUOTA_CHKD;
-	case XFS_DQ_PROJ:
+	case XFS_DQTYPE_PROJ:
 		return XFS_PQUOTA_CHKD;
 	default:
 		return 0;
···
 extern void xfs_qm_unmount(struct xfs_mount *);
 extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
+void xfs_dquot_done(struct xfs_buf *);
+
 #else
 static inline int
 xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
···
 #define xfs_qm_mount_quotas(mp)
 #define xfs_qm_unmount(mp)
 #define xfs_qm_unmount_quotas(mp)
+
+static inline void xfs_dquot_done(struct xfs_buf *bp)
+{
+	return;
+}
+
 #endif /* CONFIG_XFS_QUOTA */
 
 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+13 -13
fs/xfs/xfs_quotaops.c
···
 	tstate->flags |= QCI_SYSFILE;
 	tstate->blocks = ip->i_d.di_nblocks;
 	tstate->nextents = ip->i_df.if_nextents;
-	tstate->spc_timelimit = (u32)defq->btimelimit;
-	tstate->ino_timelimit = (u32)defq->itimelimit;
-	tstate->rt_spc_timelimit = (u32)defq->rtbtimelimit;
-	tstate->spc_warnlimit = defq->bwarnlimit;
-	tstate->ino_warnlimit = defq->iwarnlimit;
-	tstate->rt_spc_warnlimit = defq->rtbwarnlimit;
+	tstate->spc_timelimit = (u32)defq->blk.time;
+	tstate->ino_timelimit = (u32)defq->ino.time;
+	tstate->rt_spc_timelimit = (u32)defq->rtb.time;
+	tstate->spc_warnlimit = defq->blk.warn;
+	tstate->ino_warnlimit = defq->ino.warn;
+	tstate->rt_spc_warnlimit = defq->rtb.warn;
 	if (tempqip)
 		xfs_irele(ip);
 }
···
 	return 0;
 }
 
-STATIC int
+STATIC xfs_dqtype_t
 xfs_quota_type(int type)
 {
 	switch (type) {
 	case USRQUOTA:
-		return XFS_DQ_USER;
+		return XFS_DQTYPE_USER;
 	case GRPQUOTA:
-		return XFS_DQ_GROUP;
+		return XFS_DQTYPE_GROUP;
 	default:
-		return XFS_DQ_PROJ;
+		return XFS_DQTYPE_PROJ;
 	}
 }
···
 		return -EINVAL;
 
 	if (uflags & FS_USER_QUOTA)
-		flags |= XFS_DQ_USER;
+		flags |= XFS_QMOPT_UQUOTA;
 	if (uflags & FS_GROUP_QUOTA)
-		flags |= XFS_DQ_GROUP;
+		flags |= XFS_QMOPT_GQUOTA;
 	if (uflags & FS_PROJ_QUOTA)
-		flags |= XFS_DQ_PROJ;
+		flags |= XFS_QMOPT_PQUOTA;
 
 	return xfs_qm_scall_trunc_qfiles(mp, flags);
 }
+3 -2
fs/xfs/xfs_refcount_item.c
···
 		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
 				0);
 	else
-		cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+		cuip = kmem_cache_zalloc(xfs_cui_zone,
+				GFP_KERNEL | __GFP_NOFAIL);
 
 	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
 	cuip->cui_format.cui_nextents = nextents;
···
 {
 	struct xfs_cud_log_item		*cudp;
 
-	cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+	cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
 	xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
 			&xfs_cud_item_ops);
 	cudp->cud_cuip = cuip;
+165 -194
fs/xfs/xfs_reflink.c
···
 	int			error = 0;
 
 	/* Holes, unwritten, and delalloc extents cannot be shared */
-	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
 		*shared = false;
 		return 0;
 	}
···
 	 * preallocations can leak into the range we are called upon, and we
 	 * need to skip them.
 	 */
-	if (!xfs_bmap_is_real_extent(&got)) {
+	if (!xfs_bmap_is_written_extent(&got)) {
 		*end_fsb = del.br_startoff;
 		goto out_cancel;
 	}
···
 }
 
 /*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file.  The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
  */
 STATIC int
 xfs_reflink_remap_extent(
 	struct xfs_inode	*ip,
-	struct xfs_bmbt_irec	*irec,
-	xfs_fileoff_t		destoff,
+	struct xfs_bmbt_irec	*dmap,
 	xfs_off_t		new_isize)
 {
+	struct xfs_bmbt_irec	smap;
 	struct xfs_mount	*mp = ip->i_mount;
-	bool			real_extent = xfs_bmap_is_real_extent(irec);
 	struct xfs_trans	*tp;
-	unsigned int		resblks;
-	struct xfs_bmbt_irec	uirec;
-	xfs_filblks_t		rlen;
-	xfs_filblks_t		unmap_len;
 	xfs_off_t		newlen;
+	int64_t			qres, qdelta;
+	unsigned int		resblks;
+	bool			smap_real;
+	bool			dmap_written = xfs_bmap_is_written_extent(dmap);
+	int			nimaps;
 	int			error;
 
-	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
-	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
-	/* No reflinking if we're low on space */
-	if (real_extent) {
-		error = xfs_reflink_ag_has_free_space(mp,
-				XFS_FSB_TO_AGNO(mp, irec->br_startblock));
-		if (error)
-			goto out;
-	}
-
 	/* Start a rolling transaction to switch the mappings */
-	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
 	if (error)
 		goto out;
···
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 
-	/* If we're not just clearing space, then do we have enough quota? */
-	if (real_extent) {
-		error = xfs_trans_reserve_quota_nblks(tp, ip,
-				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+	/*
+	 * Read what's currently mapped in the destination file into smap.
+	 * If smap isn't a hole, we will have to remove it before we can add
+	 * dmap to the destination file.
+	 */
+	nimaps = 1;
+	error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+			&smap, &nimaps, 0);
+	if (error)
+		goto out_cancel;
+	ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+	smap_real = xfs_bmap_is_real_extent(&smap);
+
+	/*
+	 * We can only remap as many blocks as the smaller of the two extent
+	 * maps, because we can only remap one extent at a time.
+	 */
+	dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+	ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+	trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+	/*
+	 * Two extents mapped to the same physical block must not have
+	 * different states; that's filesystem corruption.  Move on to the next
+	 * extent if they're both holes or both the same physical extent.
+	 */
+	if (dmap->br_startblock == smap.br_startblock) {
+		if (dmap->br_state != smap.br_state)
+			error = -EFSCORRUPTED;
+		goto out_cancel;
+	}
+
+	/* If both extents are unwritten, leave them alone. */
+	if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+	    smap.br_state == XFS_EXT_UNWRITTEN)
+		goto out_cancel;
+
+	/* No reflinking if the AG of the dest mapping is low on space. */
+	if (dmap_written) {
+		error = xfs_reflink_ag_has_free_space(mp,
+				XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
 		if (error)
 			goto out_cancel;
 	}
 
-	trace_xfs_reflink_remap(ip, irec->br_startoff,
-			irec->br_blockcount, irec->br_startblock);
-
-	/* Unmap the old blocks in the data fork. */
-	rlen = unmap_len;
-	while (rlen) {
-		ASSERT(tp->t_firstblock == NULLFSBLOCK);
-		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+	/*
+	 * Compute quota reservation if we think the quota block counter for
+	 * this file could increase.
+	 *
+	 * Adding a written extent to the extent map can cause a bmbt split,
+	 * and removing a mapped extent from the extent can cause a bmbt split.
+	 * The two operations cannot both cause a split since they operate on
+	 * the same index in the bmap btree, so we only need a reservation for
+	 * one bmbt split if either thing is happening.
+	 *
+	 * If we are mapping a written extent into the file, we need to have
+	 * enough quota block count reservation to handle the blocks in that
+	 * extent.  We log only the delta to the quota block counts, so if the
+	 * extent we're unmapping also has blocks allocated to it, we don't
+	 * need a quota reservation for the extent itself.
+	 *
+	 * Note that if we're replacing a delalloc reservation with a written
+	 * extent, we have to take the full quota reservation because removing
+	 * the delalloc reservation gives the block count back to the quota
+	 * count.  This is suboptimal, but the VFS flushed the dest range
+	 * before we started.  That should have removed all the delalloc
+	 * reservations, but we code defensively.
+	 */
+	qres = qdelta = 0;
+	if (smap_real || dmap_written)
+		qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+	if (!smap_real && dmap_written)
+		qres += dmap->br_blockcount;
+	if (qres > 0) {
+		error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+				XFS_QMOPT_RES_REGBLKS);
 		if (error)
 			goto out_cancel;
+	}
+
+	if (smap_real) {
+		/*
+		 * If the extent we're unmapping is backed by storage (written
+		 * or not), unmap the extent and drop its refcount.
+		 */
+		xfs_bmap_unmap_extent(tp, ip, &smap);
+		xfs_refcount_decrease_extent(tp, &smap);
+		qdelta -= smap.br_blockcount;
+	} else if (smap.br_startblock == DELAYSTARTBLOCK) {
+		xfs_filblks_t	len = smap.br_blockcount;
 
 		/*
-		 * Trim the extent to whatever got unmapped.
-		 * Remember, bunmapi works backwards.
+		 * If the extent we're unmapping is a delalloc reservation,
+		 * we can use the regular bunmapi function to release the
+		 * incore state.  Dropping the delalloc reservation takes care
+		 * of the quota reservation for us.
 		 */
-		uirec.br_startblock = irec->br_startblock + rlen;
-		uirec.br_startoff = irec->br_startoff + rlen;
-		uirec.br_blockcount = unmap_len - rlen;
-		uirec.br_state = irec->br_state;
-		unmap_len = rlen;
-
-		/* If this isn't a real mapping, we're done. */
-		if (!real_extent || uirec.br_blockcount == 0)
-			goto next_extent;
-
-		trace_xfs_reflink_remap(ip, uirec.br_startoff,
-				uirec.br_blockcount, uirec.br_startblock);
-
-		/* Update the refcount tree */
-		xfs_refcount_increase_extent(tp, &uirec);
-
-		/* Map the new blocks into the data fork. */
-		xfs_bmap_map_extent(tp, ip, &uirec);
-
-		/* Update quota accounting. */
-		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-				uirec.br_blockcount);
-
-		/* Update dest isize if needed. */
-		newlen = XFS_FSB_TO_B(mp,
-				uirec.br_startoff + uirec.br_blockcount);
-		newlen = min_t(xfs_off_t, newlen, new_isize);
-		if (newlen > i_size_read(VFS_I(ip))) {
-			trace_xfs_reflink_update_inode_size(ip, newlen);
-			i_size_write(VFS_I(ip), newlen);
-			ip->i_d.di_size = newlen;
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		}
-
-next_extent:
-		/* Process all the deferred stuff. */
-		error = xfs_defer_finish(&tp);
+		error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
 		if (error)
 			goto out_cancel;
+		ASSERT(len == 0);
 	}
 
+	/*
+	 * If the extent we're sharing is backed by written storage, increase
+	 * its refcount and map it into the file.
+	 */
+	if (dmap_written) {
+		xfs_refcount_increase_extent(tp, dmap);
+		xfs_bmap_map_extent(tp, ip, dmap);
+		qdelta += dmap->br_blockcount;
+	}
+
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+
+	/* Update dest isize if needed. */
+	newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+	newlen = min_t(xfs_off_t, newlen, new_isize);
+	if (newlen > i_size_read(VFS_I(ip))) {
+		trace_xfs_reflink_update_inode_size(ip, newlen);
+		i_size_write(VFS_I(ip), newlen);
+		ip->i_d.di_size = newlen;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	}
+
+	/* Commit everything and unlock. */
 	error = xfs_trans_commit(tp);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	if (error)
-		goto out;
-	return 0;
+	goto out_unlock;
 
 out_cancel:
 	xfs_trans_cancel(tp);
+out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
-	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+	if (error)
+		trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
 	return error;
 }
 
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
 int
 xfs_reflink_remap_blocks(
 	struct xfs_inode	*src,
···
 	loff_t			*remapped)
 {
 	struct xfs_bmbt_irec	imap;
-	xfs_fileoff_t		srcoff;
-	xfs_fileoff_t		destoff;
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		srcoff = XFS_B_TO_FSBT(mp, pos_in);
+	xfs_fileoff_t		destoff = XFS_B_TO_FSBT(mp, pos_out);
 	xfs_filblks_t		len;
-	xfs_filblks_t		range_len;
 	xfs_filblks_t		remapped_len = 0;
 	xfs_off_t		new_isize = pos_out + remap_len;
 	int			nimaps;
 	int			error = 0;
 
-	destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
-	srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
-	len = XFS_B_TO_FSB(src->i_mount, remap_len);
+	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+			XFS_MAX_FILEOFF);
 
-	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
-	while (len) {
-		uint		lock_mode;
+	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
 
-		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
-				dest, destoff);
+	while (len > 0) {
+		unsigned int	lock_mode;
 
 		/* Read extent from the source file */
 		nimaps = 1;
···
 		xfs_iunlock(src, lock_mode);
 		if (error)
 			break;
-		ASSERT(nimaps == 1);
+		/*
+		 * The caller supposedly flushed all dirty pages in the source
+		 * file range, which means that writeback should have allocated
+		 * or deleted all delalloc reservations in that range.  If we
+		 * find one, that's a good sign that something is seriously
+		 * wrong here.
+		 */
+		ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+		if (imap.br_startblock == DELAYSTARTBLOCK) {
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			error = -EFSCORRUPTED;
+			break;
+		}
 
-		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
-				&imap);
+		trace_xfs_reflink_remap_extent_src(src, &imap);
 
-		/* Translate imap into the destination file. */
-		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
-		imap.br_startoff += destoff - srcoff;
-
-		/* Clear dest from destoff to the end of imap and map it in. */
-		error = xfs_reflink_remap_extent(dest, &imap, destoff,
-				new_isize);
+		/* Remap into the destination file at the given offset. */
+		imap.br_startoff = destoff;
+		error = xfs_reflink_remap_extent(dest, &imap, new_isize);
 		if (error)
 			break;
 
···
 		}
 
 		/* Advance drange/srange */
-		srcoff += range_len;
-		destoff += range_len;
-		len -= range_len;
-		remapped_len += range_len;
+		srcoff += imap.br_blockcount;
+		destoff += imap.br_blockcount;
+		len -= imap.br_blockcount;
+		remapped_len += imap.br_blockcount;
 	}
 
 	if (error)
···
 	*remapped = min_t(loff_t, remap_len,
 			XFS_FSB_TO_B(src->i_mount, remapped_len));
 	return error;
-}
-
-/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding.  The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
-	struct inode		*src,
-	struct inode		*dest)
-{
-	int			error;
-
-	if (src > dest)
-		swap(src, dest);
-
-retry:
-	/* Wait to break both inodes' layouts before we start locking. */
-	error = break_layout(src, true);
-	if (error)
-		return error;
-	if (src != dest) {
-		error = break_layout(dest, true);
-		if (error)
-			return error;
-	}
-
-	/* Lock one inode and make sure nobody got in and leased it. */
-	inode_lock(src);
-	error = break_layout(src, false);
-	if (error) {
-		inode_unlock(src);
-		if (error == -EWOULDBLOCK)
-			goto retry;
-		return error;
-	}
-
-	if (src == dest)
-		return 0;
-
-	/* Lock the other inode and make sure nobody got in and leased it. */
-	inode_lock_nested(dest, I_MUTEX_NONDIR2);
-	error = break_layout(dest, false);
-	if (error) {
-		inode_unlock(src);
-		inode_unlock(dest);
-		if (error == -EWOULDBLOCK)
-			goto retry;
-		return error;
-	}
-
-	return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
-	struct file		*file_in,
-	struct file		*file_out)
-{
-	struct inode		*inode_in = file_inode(file_in);
-	struct xfs_inode	*src = XFS_I(inode_in);
-	struct inode		*inode_out = file_inode(file_out);
-	struct xfs_inode	*dest = XFS_I(inode_out);
-	bool			same_inode = (inode_in == inode_out);
-
-	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
-	if (!same_inode)
-		xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
-	inode_unlock(inode_out);
-	if (!same_inode)
-		inode_unlock(inode_in);
 }
 
 /*
···
 	struct xfs_inode	*src = XFS_I(inode_in);
 	struct inode		*inode_out = file_inode(file_out);
 	struct xfs_inode	*dest = XFS_I(inode_out);
-	bool			same_inode = (inode_in == inode_out);
-	ssize_t			ret;
+	int			ret;
 
 	/* Lock both files against IO */
-	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+	ret = xfs_ilock2_io_mmap(src, dest);
 	if (ret)
 		return ret;
-	if (same_inode)
-		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
-	else
-		xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
-				XFS_MMAPLOCK_EXCL);
 
 	/* Check file eligibility and prepare for block sharing. */
 	ret = -EINVAL;
···
 
 	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
 			len, remap_flags);
-	if (ret < 0 || *len == 0)
+	if (ret || *len == 0)
 		goto out_unlock;
 
 	/* Attach dquots to dest inode before changing block map */
···
 	if (ret)
 		goto out_unlock;
 
-	return 1;
+	return 0;
 out_unlock:
-	xfs_reflink_remap_unlock(file_in, file_out);
+	xfs_iunlock2_io_mmap(src, dest);
 	return ret;
 }
fs/xfs/xfs_reflink.h (-2)
··· 56 56 loff_t *remapped); 57 57 extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, 58 58 xfs_extlen_t cowextsize, unsigned int remap_flags); 59 - extern void xfs_reflink_remap_unlock(struct file *file_in, 60 - struct file *file_out); 61 59 62 60 #endif /* __XFS_REFLINK_H */
fs/xfs/xfs_rmap_item.c (+3 -2)
··· 141 141 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 142 142 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); 143 143 else 144 - ruip = kmem_zone_zalloc(xfs_rui_zone, 0); 144 + ruip = kmem_cache_zalloc(xfs_rui_zone, 145 + GFP_KERNEL | __GFP_NOFAIL); 145 146 146 147 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 147 148 ruip->rui_format.rui_nextents = nextents; ··· 244 243 { 245 244 struct xfs_rud_log_item *rudp; 246 245 247 - rudp = kmem_zone_zalloc(xfs_rud_zone, 0); 246 + rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL); 248 247 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 249 248 &xfs_rud_item_ops); 250 249 rudp->rud_ruip = ruip;
fs/xfs/xfs_super.c (+15 -4)
··· 890 890 /* force the log to unpin objects from the now complete transactions */ 891 891 xfs_log_force(mp, XFS_LOG_SYNC); 892 892 893 - /* reclaim inodes to do any IO before the freeze completes */ 894 - xfs_reclaim_inodes(mp, 0); 895 - xfs_reclaim_inodes(mp, SYNC_WAIT); 896 893 897 894 /* Push the superblock and write an unmount record */ 898 895 error = xfs_log_sbcount(mp); ··· 910 913 struct super_block *sb) 911 914 { 912 915 struct xfs_mount *mp = XFS_M(sb); 916 + unsigned int flags; 917 + int ret; 913 918 919 + /* 920 + * The filesystem is now frozen far enough that memory reclaim 921 + * cannot safely operate on the filesystem. Hence we need to 922 + * set a GFP_NOFS context here to avoid recursion deadlocks. 923 + */ 924 + flags = memalloc_nofs_save(); 914 925 xfs_stop_block_reaping(mp); 915 926 xfs_save_resvblks(mp); 916 927 xfs_quiesce_attr(mp); 917 - return xfs_sync_sb(mp, true); 928 + ret = xfs_sync_sb(mp, true); 929 + memalloc_nofs_restore(flags); 930 + return ret; 918 931 } 919 932 920 933 STATIC int ··· 1720 1713 xfs_sb_t *sbp = &mp->m_sb; 1721 1714 int flags = fc->sb_flags; 1722 1715 int error; 1716 + 1717 + /* version 5 superblocks always support version counters. */ 1718 + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) 1719 + fc->sb_flags |= SB_I_VERSION; 1723 1720 1724 1721 error = xfs_fc_validate_params(new_mp); 1725 1722 if (error)
fs/xfs/xfs_trace.h (+160 -66)
··· 36 36 struct xfs_trans_res; 37 37 struct xfs_inobt_rec_incore; 38 38 union xfs_btree_ptr; 39 + struct xfs_dqtrx; 39 40 40 41 #define XFS_ATTR_FILTER_FLAGS \ 41 42 { XFS_ATTR_ROOT, "ROOT" }, \ ··· 865 864 TP_STRUCT__entry( 866 865 __field(dev_t, dev) 867 866 __field(u32, id) 867 + __field(xfs_dqtype_t, type) 868 868 __field(unsigned, flags) 869 869 __field(unsigned, nrefs) 870 870 __field(unsigned long long, res_bcount) 871 + __field(unsigned long long, res_rtbcount) 872 + __field(unsigned long long, res_icount) 873 + 871 874 __field(unsigned long long, bcount) 875 + __field(unsigned long long, rtbcount) 872 876 __field(unsigned long long, icount) 877 + 873 878 __field(unsigned long long, blk_hardlimit) 874 879 __field(unsigned long long, blk_softlimit) 880 + __field(unsigned long long, rtb_hardlimit) 881 + __field(unsigned long long, rtb_softlimit) 875 882 __field(unsigned long long, ino_hardlimit) 876 883 __field(unsigned long long, ino_softlimit) 877 - ), \ 884 + ), 878 885 TP_fast_assign( 879 886 __entry->dev = dqp->q_mount->m_super->s_dev; 880 - __entry->id = be32_to_cpu(dqp->q_core.d_id); 881 - __entry->flags = dqp->dq_flags; 887 + __entry->id = dqp->q_id; 888 + __entry->type = dqp->q_type; 889 + __entry->flags = dqp->q_flags; 882 890 __entry->nrefs = dqp->q_nrefs; 883 - __entry->res_bcount = dqp->q_res_bcount; 884 - __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); 885 - __entry->icount = be64_to_cpu(dqp->q_core.d_icount); 886 - __entry->blk_hardlimit = 887 - be64_to_cpu(dqp->q_core.d_blk_hardlimit); 888 - __entry->blk_softlimit = 889 - be64_to_cpu(dqp->q_core.d_blk_softlimit); 890 - __entry->ino_hardlimit = 891 - be64_to_cpu(dqp->q_core.d_ino_hardlimit); 892 - __entry->ino_softlimit = 893 - be64_to_cpu(dqp->q_core.d_ino_softlimit); 891 + 892 + __entry->res_bcount = dqp->q_blk.reserved; 893 + __entry->res_rtbcount = dqp->q_rtb.reserved; 894 + __entry->res_icount = dqp->q_ino.reserved; 895 + 896 + __entry->bcount = dqp->q_blk.count; 897 + 
__entry->rtbcount = dqp->q_rtb.count; 898 + __entry->icount = dqp->q_ino.count; 899 + 900 + __entry->blk_hardlimit = dqp->q_blk.hardlimit; 901 + __entry->blk_softlimit = dqp->q_blk.softlimit; 902 + __entry->rtb_hardlimit = dqp->q_rtb.hardlimit; 903 + __entry->rtb_softlimit = dqp->q_rtb.softlimit; 904 + __entry->ino_hardlimit = dqp->q_ino.hardlimit; 905 + __entry->ino_softlimit = dqp->q_ino.softlimit; 894 906 ), 895 - TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " 907 + TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u " 908 + "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx " 896 909 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " 910 + "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx " 897 911 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", 898 912 MAJOR(__entry->dev), MINOR(__entry->dev), 899 913 __entry->id, 900 - __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), 914 + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), 915 + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), 901 916 __entry->nrefs, 902 917 __entry->res_bcount, 918 + __entry->res_rtbcount, 919 + __entry->res_icount, 903 920 __entry->bcount, 904 921 __entry->blk_hardlimit, 905 922 __entry->blk_softlimit, 923 + __entry->rtbcount, 924 + __entry->rtb_hardlimit, 925 + __entry->rtb_softlimit, 906 926 __entry->icount, 907 927 __entry->ino_hardlimit, 908 928 __entry->ino_softlimit) ··· 954 932 DEFINE_DQUOT_EVENT(xfs_dqflush); 955 933 DEFINE_DQUOT_EVENT(xfs_dqflush_force); 956 934 DEFINE_DQUOT_EVENT(xfs_dqflush_done); 935 + DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); 936 + DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); 937 + 938 + #define XFS_QMOPT_FLAGS \ 939 + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ 940 + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ 941 + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ 942 + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ 943 + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ 944 + { XFS_QMOPT_INHERIT, "INHERIT" }, \ 945 + { XFS_QMOPT_RES_REGBLKS, 
"RES_REGBLKS" }, \ 946 + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ 947 + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ 948 + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ 949 + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ 950 + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ 951 + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ 952 + { XFS_QMOPT_RES_INOS, "RES_INOS" } 953 + 954 + TRACE_EVENT(xfs_trans_mod_dquot, 955 + TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, 956 + unsigned int field, int64_t delta), 957 + TP_ARGS(tp, dqp, field, delta), 958 + TP_STRUCT__entry( 959 + __field(dev_t, dev) 960 + __field(xfs_dqtype_t, type) 961 + __field(unsigned int, flags) 962 + __field(unsigned int, dqid) 963 + __field(unsigned int, field) 964 + __field(int64_t, delta) 965 + ), 966 + TP_fast_assign( 967 + __entry->dev = tp->t_mountp->m_super->s_dev; 968 + __entry->type = dqp->q_type; 969 + __entry->flags = dqp->q_flags; 970 + __entry->dqid = dqp->q_id; 971 + __entry->field = field; 972 + __entry->delta = delta; 973 + ), 974 + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld", 975 + MAJOR(__entry->dev), MINOR(__entry->dev), 976 + __entry->dqid, 977 + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), 978 + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), 979 + __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS), 980 + __entry->delta) 981 + ); 982 + 983 + DECLARE_EVENT_CLASS(xfs_dqtrx_class, 984 + TP_PROTO(struct xfs_dqtrx *qtrx), 985 + TP_ARGS(qtrx), 986 + TP_STRUCT__entry( 987 + __field(dev_t, dev) 988 + __field(xfs_dqtype_t, type) 989 + __field(unsigned int, flags) 990 + __field(u32, dqid) 991 + 992 + __field(uint64_t, blk_res) 993 + __field(int64_t, bcount_delta) 994 + __field(int64_t, delbcnt_delta) 995 + 996 + __field(uint64_t, rtblk_res) 997 + __field(uint64_t, rtblk_res_used) 998 + __field(int64_t, rtbcount_delta) 999 + __field(int64_t, delrtb_delta) 1000 + 1001 + __field(uint64_t, ino_res) 1002 + __field(uint64_t, ino_res_used) 1003 + __field(int64_t, icount_delta) 1004 + 
), 1005 + TP_fast_assign( 1006 + __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev; 1007 + __entry->type = qtrx->qt_dquot->q_type; 1008 + __entry->flags = qtrx->qt_dquot->q_flags; 1009 + __entry->dqid = qtrx->qt_dquot->q_id; 1010 + 1011 + __entry->blk_res = qtrx->qt_blk_res; 1012 + __entry->bcount_delta = qtrx->qt_bcount_delta; 1013 + __entry->delbcnt_delta = qtrx->qt_delbcnt_delta; 1014 + 1015 + __entry->rtblk_res = qtrx->qt_rtblk_res; 1016 + __entry->rtblk_res_used = qtrx->qt_rtblk_res_used; 1017 + __entry->rtbcount_delta = qtrx->qt_rtbcount_delta; 1018 + __entry->delrtb_delta = qtrx->qt_delrtb_delta; 1019 + 1020 + __entry->ino_res = qtrx->qt_ino_res; 1021 + __entry->ino_res_used = qtrx->qt_ino_res_used; 1022 + __entry->icount_delta = qtrx->qt_icount_delta; 1023 + ), 1024 + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" 1025 + "blk_res %llu bcount_delta %lld delbcnt_delta %lld " 1026 + "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " 1027 + "ino_res %llu ino_res_used %llu icount_delta %lld", 1028 + MAJOR(__entry->dev), MINOR(__entry->dev), 1029 + __entry->dqid, 1030 + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), 1031 + __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS), 1032 + 1033 + __entry->blk_res, 1034 + __entry->bcount_delta, 1035 + __entry->delbcnt_delta, 1036 + 1037 + __entry->rtblk_res, 1038 + __entry->rtblk_res_used, 1039 + __entry->rtbcount_delta, 1040 + __entry->delrtb_delta, 1041 + 1042 + __entry->ino_res, 1043 + __entry->ino_res_used, 1044 + __entry->icount_delta) 1045 + ) 1046 + 1047 + #define DEFINE_DQTRX_EVENT(name) \ 1048 + DEFINE_EVENT(xfs_dqtrx_class, name, \ 1049 + TP_PROTO(struct xfs_dqtrx *qtrx), \ 1050 + TP_ARGS(qtrx)) 1051 + DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas); 1052 + DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before); 1053 + DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after); 957 1054 958 1055 DECLARE_EVENT_CLASS(xfs_loggrant_class, 959 1056 TP_PROTO(struct xlog *log, struct 
xlog_ticket *tic), ··· 3193 3052 DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); 3194 3053 DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); 3195 3054 DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); 3196 - DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); 3197 - TRACE_EVENT(xfs_reflink_remap_blocks_loop, 3055 + TRACE_EVENT(xfs_reflink_remap_blocks, 3198 3056 TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, 3199 3057 xfs_filblks_t len, struct xfs_inode *dest, 3200 3058 xfs_fileoff_t doffset), ··· 3224 3084 __entry->dest_ino, 3225 3085 __entry->dest_lblk) 3226 3086 ); 3227 - TRACE_EVENT(xfs_reflink_punch_range, 3228 - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, 3229 - xfs_extlen_t len), 3230 - TP_ARGS(ip, lblk, len), 3231 - TP_STRUCT__entry( 3232 - __field(dev_t, dev) 3233 - __field(xfs_ino_t, ino) 3234 - __field(xfs_fileoff_t, lblk) 3235 - __field(xfs_extlen_t, len) 3236 - ), 3237 - TP_fast_assign( 3238 - __entry->dev = VFS_I(ip)->i_sb->s_dev; 3239 - __entry->ino = ip->i_ino; 3240 - __entry->lblk = lblk; 3241 - __entry->len = len; 3242 - ), 3243 - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", 3244 - MAJOR(__entry->dev), MINOR(__entry->dev), 3245 - __entry->ino, 3246 - __entry->lblk, 3247 - __entry->len) 3248 - ); 3249 - TRACE_EVENT(xfs_reflink_remap, 3250 - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, 3251 - xfs_extlen_t len, xfs_fsblock_t new_pblk), 3252 - TP_ARGS(ip, lblk, len, new_pblk), 3253 - TP_STRUCT__entry( 3254 - __field(dev_t, dev) 3255 - __field(xfs_ino_t, ino) 3256 - __field(xfs_fileoff_t, lblk) 3257 - __field(xfs_extlen_t, len) 3258 - __field(xfs_fsblock_t, new_pblk) 3259 - ), 3260 - TP_fast_assign( 3261 - __entry->dev = VFS_I(ip)->i_sb->s_dev; 3262 - __entry->ino = ip->i_ino; 3263 - __entry->lblk = lblk; 3264 - __entry->len = len; 3265 - __entry->new_pblk = new_pblk; 3266 - ), 3267 - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", 3268 - MAJOR(__entry->dev), MINOR(__entry->dev), 3269 - __entry->ino, 3270 - 
__entry->lblk, 3271 - __entry->len, 3272 - __entry->new_pblk) 3273 - ); 3274 3087 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); 3275 3088 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); 3276 3089 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); 3277 3090 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); 3278 3091 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); 3279 3092 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); 3093 + DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); 3094 + DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); 3280 3095 3281 3096 /* dedupe tracepoints */ 3282 3097 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); ··· 3677 3582 DEFINE_KMEM_EVENT(kmem_alloc_io); 3678 3583 DEFINE_KMEM_EVENT(kmem_alloc_large); 3679 3584 DEFINE_KMEM_EVENT(kmem_realloc); 3680 - DEFINE_KMEM_EVENT(kmem_zone_alloc); 3681 3585 3682 3586 TRACE_EVENT(xfs_check_new_dalign, 3683 3587 TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
fs/xfs/xfs_trans.c (+20 -3)
··· 90 90 91 91 trace_xfs_trans_dup(tp, _RET_IP_); 92 92 93 - ntp = kmem_zone_zalloc(xfs_trans_zone, 0); 93 + ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); 94 94 95 95 /* 96 96 * Initialize the new transaction structure. ··· 107 107 108 108 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | 109 109 (tp->t_flags & XFS_TRANS_RESERVE) | 110 - (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); 110 + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | 111 + (tp->t_flags & XFS_TRANS_RES_FDBLKS); 111 112 /* We gave our writer reference to the new transaction */ 112 113 tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; 113 114 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ··· 263 262 * GFP_NOFS allocation context so that we avoid lockdep false positives 264 263 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 265 264 */ 266 - tp = kmem_zone_zalloc(xfs_trans_zone, 0); 265 + tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); 267 266 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 268 267 sb_start_intwrite(mp->m_super); 269 268 ··· 273 272 */ 274 273 WARN_ON(resp->tr_logres > 0 && 275 274 mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); 275 + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || 276 + xfs_sb_version_haslazysbcount(&mp->m_sb)); 276 277 277 278 tp->t_magic = XFS_TRANS_HEADER_MAGIC; 278 279 tp->t_flags = flags; ··· 368 365 tp->t_blk_res_used += (uint)-delta; 369 366 if (tp->t_blk_res_used > tp->t_blk_res) 370 367 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 368 + } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { 369 + int64_t blkres_delta; 370 + 371 + /* 372 + * Return freed blocks directly to the reservation 373 + * instead of the global pool, being careful not to 374 + * overflow the trans counter. This is used to preserve 375 + * reservation across chains of transaction rolls that 376 + * repeatedly free and allocate blocks. 
377 + */ 378 + blkres_delta = min_t(int64_t, delta, 379 + UINT_MAX - tp->t_blk_res); 380 + tp->t_blk_res += blkres_delta; 381 + delta -= blkres_delta; 371 382 } 372 383 tp->t_fdblocks_delta += delta; 373 384 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
fs/xfs/xfs_trans.h (-5)
··· 37 37 unsigned long li_flags; /* misc flags */ 38 38 struct xfs_buf *li_buf; /* real buffer pointer */ 39 39 struct list_head li_bio_list; /* buffer item list */ 40 - void (*li_cb)(struct xfs_buf *, 41 - struct xfs_log_item *); 42 - /* buffer item iodone */ 43 - /* callback func */ 44 40 const struct xfs_item_ops *li_ops; /* function list */ 45 41 46 42 /* delayed logging */ ··· 74 78 void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); 75 79 void (*iop_release)(struct xfs_log_item *); 76 80 xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); 77 - void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); 78 81 int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); 79 82 bool (*iop_match)(struct xfs_log_item *item, uint64_t id); 80 83 };
fs/xfs/xfs_trans_ail.c (+13 -13)
··· 377 377 } 378 378 379 379 /* protected by ail_lock */ 380 - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) 381 - xfs_clear_li_failed(lip); 380 + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 381 + if (bp->b_flags & _XBF_INODES) 382 + clear_bit(XFS_LI_FAILED, &lip->li_flags); 383 + else 384 + xfs_clear_li_failed(lip); 385 + } 382 386 383 387 xfs_buf_unlock(bp); 384 388 return XFS_ITEM_SUCCESS; ··· 448 444 target = ailp->ail_target; 449 445 ailp->ail_target_prev = target; 450 446 447 + /* we're done if the AIL is empty or our push has reached the end */ 451 448 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); 452 - if (!lip) { 453 - /* 454 - * If the AIL is empty or our push has reached the end we are 455 - * done now. 456 - */ 457 - xfs_trans_ail_cursor_done(&cur); 458 - spin_unlock(&ailp->ail_lock); 449 + if (!lip) 459 450 goto out_done; 460 - } 461 451 462 452 XFS_STATS_INC(mp, xs_push_ail); 463 453 ··· 533 535 break; 534 536 lsn = lip->li_lsn; 535 537 } 538 + 539 + out_done: 536 540 xfs_trans_ail_cursor_done(&cur); 537 541 spin_unlock(&ailp->ail_lock); 538 542 ··· 542 542 ailp->ail_log_flush++; 543 543 544 544 if (!count || XFS_LSN_CMP(lsn, target) >= 0) { 545 - out_done: 546 545 /* 547 546 * We reached the target or the AIL is empty, so wait a bit 548 547 * longer for I/O to complete and remove pushed items from the ··· 633 634 */ 634 635 smp_rmb(); 635 636 if (!xfs_ail_min(ailp) && 636 - ailp->ail_target == ailp->ail_target_prev) { 637 + ailp->ail_target == ailp->ail_target_prev && 638 + list_empty(&ailp->ail_buf_list)) { 637 639 spin_unlock(&ailp->ail_lock); 638 640 freezable_schedule(); 639 641 tout = 0; ··· 843 843 844 844 trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); 845 845 xfs_ail_delete(ailp, lip); 846 - xfs_clear_li_failed(lip); 847 846 clear_bit(XFS_LI_IN_AIL, &lip->li_flags); 848 847 lip->li_lsn = 0; 849 848 ··· 873 874 } 874 875 875 876 /* xfs_ail_update_finish() drops the AIL lock */ 877 + 
xfs_clear_li_failed(lip); 876 878 tail_lsn = xfs_ail_delete_one(ailp, lip); 877 879 xfs_ail_update_finish(ailp, tail_lsn); 878 880 }
fs/xfs/xfs_trans_buf.c (+5 -10)
··· 465 465 466 466 ASSERT(bp->b_transp == tp); 467 467 ASSERT(bip != NULL); 468 - ASSERT(bp->b_iodone == NULL || 469 - bp->b_iodone == xfs_buf_iodone_callbacks); 470 468 471 469 /* 472 470 * Mark the buffer as needing to be written out eventually, 473 471 * and set its iodone function to remove the buffer's buf log 474 472 * item from the AIL and free it when the buffer is flushed 475 - * to disk. See xfs_buf_attach_iodone() for more details 476 - * on li_cb and xfs_buf_iodone_callbacks(). 477 - * If we end up aborting this transaction, we trap this buffer 478 - * inside the b_bdstrat callback so that this won't get written to 479 - * disk. 473 + * to disk. 480 474 */ 481 475 bp->b_flags |= XBF_DONE; 482 476 483 477 ASSERT(atomic_read(&bip->bli_refcount) > 0); 484 - bp->b_iodone = xfs_buf_iodone_callbacks; 485 - bip->bli_item.li_cb = xfs_buf_iodone; 486 478 487 479 /* 488 480 * If we invalidated the buffer within this transaction, then ··· 618 626 ASSERT(atomic_read(&bip->bli_refcount) > 0); 619 627 620 628 bip->bli_flags |= XFS_BLI_INODE_BUF; 629 + bp->b_flags |= _XBF_INODES; 621 630 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 622 631 } 623 632 ··· 643 650 ASSERT(atomic_read(&bip->bli_refcount) > 0); 644 651 645 652 bip->bli_flags |= XFS_BLI_STALE_INODE; 646 - bip->bli_item.li_cb = xfs_buf_iodone; 653 + bp->b_flags |= _XBF_INODES; 647 654 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 648 655 } 649 656 ··· 668 675 ASSERT(atomic_read(&bip->bli_refcount) > 0); 669 676 670 677 bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; 678 + bp->b_flags |= _XBF_INODES; 671 679 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 672 680 } 673 681 ··· 779 785 break; 780 786 } 781 787 788 + bp->b_flags |= _XBF_DQUOTS; 782 789 xfs_trans_buf_set_type(tp, bp, type); 783 790 }
fs/xfs/xfs_trans_dquot.c (+192 -177)
··· 15 15 #include "xfs_trans_priv.h" 16 16 #include "xfs_quota.h" 17 17 #include "xfs_qm.h" 18 + #include "xfs_trace.h" 18 19 19 20 STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); 20 21 ··· 156 155 int i; 157 156 struct xfs_dqtrx *qa; 158 157 159 - if (XFS_QM_ISUDQ(dqp)) 158 + switch (xfs_dquot_type(dqp)) { 159 + case XFS_DQTYPE_USER: 160 160 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; 161 - else if (XFS_QM_ISGDQ(dqp)) 161 + break; 162 + case XFS_DQTYPE_GROUP: 162 163 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; 163 - else if (XFS_QM_ISPDQ(dqp)) 164 + break; 165 + case XFS_DQTYPE_PROJ: 164 166 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; 165 - else 167 + break; 168 + default: 166 169 return NULL; 170 + } 167 171 168 172 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 169 173 if (qa[i].qt_dquot == NULL || ··· 208 202 ASSERT(qtrx); 209 203 if (qtrx->qt_dquot == NULL) 210 204 qtrx->qt_dquot = dqp; 205 + 206 + if (delta) { 207 + trace_xfs_trans_mod_dquot_before(qtrx); 208 + trace_xfs_trans_mod_dquot(tp, dqp, field, delta); 209 + } 211 210 212 211 switch (field) { 213 212 ··· 277 266 default: 278 267 ASSERT(0); 279 268 } 269 + 270 + if (delta) 271 + trace_xfs_trans_mod_dquot_after(qtrx); 272 + 280 273 tp->t_flags |= XFS_TRANS_DQ_DIRTY; 281 274 } 282 275 ··· 308 293 } 309 294 } 310 295 296 + /* Apply dqtrx changes to the quota reservation counters. */ 297 + static inline void 298 + xfs_apply_quota_reservation_deltas( 299 + struct xfs_dquot_res *res, 300 + uint64_t reserved, 301 + int64_t res_used, 302 + int64_t count_delta) 303 + { 304 + if (reserved != 0) { 305 + /* 306 + * Subtle math here: If reserved > res_used (the normal case), 307 + * we're simply subtracting the unused transaction quota 308 + * reservation from the dquot reservation. 309 + * 310 + * If, however, res_used > reserved, then we have allocated 311 + * more quota blocks than were reserved for the transaction. 
312 + * We must add that excess to the dquot reservation since it 313 + * tracks (usage + resv) and by definition we didn't reserve 314 + * that excess. 315 + */ 316 + res->reserved -= abs(reserved - res_used); 317 + } else if (count_delta != 0) { 318 + /* 319 + * These blks were never reserved, either inside a transaction 320 + * or outside one (in a delayed allocation). Also, this isn't 321 + * always a negative number since we sometimes deliberately 322 + * skip quota reservations. 323 + */ 324 + res->reserved += count_delta; 325 + } 326 + } 311 327 312 328 /* 313 329 * Called by xfs_trans_commit() and similar in spirit to ··· 355 309 int i, j; 356 310 struct xfs_dquot *dqp; 357 311 struct xfs_dqtrx *qtrx, *qa; 358 - struct xfs_disk_dquot *d; 359 312 int64_t totalbdelta; 360 313 int64_t totalrtbdelta; 361 314 ··· 373 328 xfs_trans_dqlockedjoin(tp, qa); 374 329 375 330 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 331 + uint64_t blk_res_used; 332 + 376 333 qtrx = &qa[i]; 377 334 /* 378 335 * The array of dquots is filled ··· 388 341 /* 389 342 * adjust the actual number of blocks used 390 343 */ 391 - d = &dqp->q_core; 392 344 393 345 /* 394 346 * The issue here is - sometimes we don't make a blkquota ··· 406 360 qtrx->qt_delbcnt_delta; 407 361 totalrtbdelta = qtrx->qt_rtbcount_delta + 408 362 qtrx->qt_delrtb_delta; 363 + 364 + if (totalbdelta != 0 || totalrtbdelta != 0 || 365 + qtrx->qt_icount_delta != 0) { 366 + trace_xfs_trans_apply_dquot_deltas_before(dqp); 367 + trace_xfs_trans_apply_dquot_deltas(qtrx); 368 + } 369 + 409 370 #ifdef DEBUG 410 371 if (totalbdelta < 0) 411 - ASSERT(be64_to_cpu(d->d_bcount) >= 412 - -totalbdelta); 372 + ASSERT(dqp->q_blk.count >= -totalbdelta); 413 373 414 374 if (totalrtbdelta < 0) 415 - ASSERT(be64_to_cpu(d->d_rtbcount) >= 416 - -totalrtbdelta); 375 + ASSERT(dqp->q_rtb.count >= -totalrtbdelta); 417 376 418 377 if (qtrx->qt_icount_delta < 0) 419 - ASSERT(be64_to_cpu(d->d_icount) >= 420 - -qtrx->qt_icount_delta); 378 + 
ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta); 421 379 #endif 422 380 if (totalbdelta) 423 - be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); 381 + dqp->q_blk.count += totalbdelta; 424 382 425 383 if (qtrx->qt_icount_delta) 426 - be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); 384 + dqp->q_ino.count += qtrx->qt_icount_delta; 427 385 428 386 if (totalrtbdelta) 429 - be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); 387 + dqp->q_rtb.count += totalrtbdelta; 388 + 389 + if (totalbdelta != 0 || totalrtbdelta != 0 || 390 + qtrx->qt_icount_delta != 0) 391 + trace_xfs_trans_apply_dquot_deltas_after(dqp); 430 392 431 393 /* 432 394 * Get any default limits in use. 433 395 * Start/reset the timer(s) if needed. 434 396 */ 435 - if (d->d_id) { 436 - xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); 437 - xfs_qm_adjust_dqtimers(tp->t_mountp, dqp); 397 + if (dqp->q_id) { 398 + xfs_qm_adjust_dqlimits(dqp); 399 + xfs_qm_adjust_dqtimers(dqp); 438 400 } 439 401 440 - dqp->dq_flags |= XFS_DQ_DIRTY; 402 + dqp->q_flags |= XFS_DQFLAG_DIRTY; 441 403 /* 442 404 * add this to the list of items to get logged 443 405 */ ··· 455 401 * In case of delayed allocations, there's no 456 402 * reservation that a transaction structure knows of. 
457 403 */ 458 - if (qtrx->qt_blk_res != 0) { 459 - uint64_t blk_res_used = 0; 404 + blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta); 405 + xfs_apply_quota_reservation_deltas(&dqp->q_blk, 406 + qtrx->qt_blk_res, blk_res_used, 407 + qtrx->qt_bcount_delta); 460 408 461 - if (qtrx->qt_bcount_delta > 0) 462 - blk_res_used = qtrx->qt_bcount_delta; 463 - 464 - if (qtrx->qt_blk_res != blk_res_used) { 465 - if (qtrx->qt_blk_res > blk_res_used) 466 - dqp->q_res_bcount -= (xfs_qcnt_t) 467 - (qtrx->qt_blk_res - 468 - blk_res_used); 469 - else 470 - dqp->q_res_bcount -= (xfs_qcnt_t) 471 - (blk_res_used - 472 - qtrx->qt_blk_res); 473 - } 474 - } else { 475 - /* 476 - * These blks were never reserved, either inside 477 - * a transaction or outside one (in a delayed 478 - * allocation). Also, this isn't always a 479 - * negative number since we sometimes 480 - * deliberately skip quota reservations. 481 - */ 482 - if (qtrx->qt_bcount_delta) { 483 - dqp->q_res_bcount += 484 - (xfs_qcnt_t)qtrx->qt_bcount_delta; 485 - } 486 - } 487 409 /* 488 410 * Adjust the RT reservation. 489 411 */ 490 - if (qtrx->qt_rtblk_res != 0) { 491 - if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { 492 - if (qtrx->qt_rtblk_res > 493 - qtrx->qt_rtblk_res_used) 494 - dqp->q_res_rtbcount -= (xfs_qcnt_t) 495 - (qtrx->qt_rtblk_res - 496 - qtrx->qt_rtblk_res_used); 497 - else 498 - dqp->q_res_rtbcount -= (xfs_qcnt_t) 499 - (qtrx->qt_rtblk_res_used - 500 - qtrx->qt_rtblk_res); 501 - } 502 - } else { 503 - if (qtrx->qt_rtbcount_delta) 504 - dqp->q_res_rtbcount += 505 - (xfs_qcnt_t)qtrx->qt_rtbcount_delta; 506 - } 412 + xfs_apply_quota_reservation_deltas(&dqp->q_rtb, 413 + qtrx->qt_rtblk_res, 414 + qtrx->qt_rtblk_res_used, 415 + qtrx->qt_rtbcount_delta); 507 416 508 417 /* 509 418 * Adjust the inode reservation. 
 				 */
-				if (qtrx->qt_ino_res != 0) {
-					ASSERT(qtrx->qt_ino_res >=
-					       qtrx->qt_ino_res_used);
-					if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
-						dqp->q_res_icount -= (xfs_qcnt_t)
-							(qtrx->qt_ino_res -
-							 qtrx->qt_ino_res_used);
-				} else {
-					if (qtrx->qt_icount_delta)
-						dqp->q_res_icount +=
-							(xfs_qcnt_t)qtrx->qt_icount_delta;
-				}
+				ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+				xfs_apply_quota_reservation_deltas(&dqp->q_ino,
+						qtrx->qt_ino_res,
+						qtrx->qt_ino_res_used,
+						qtrx->qt_icount_delta);
 
-				ASSERT(dqp->q_res_bcount >=
-					be64_to_cpu(dqp->q_core.d_bcount));
-				ASSERT(dqp->q_res_icount >=
-					be64_to_cpu(dqp->q_core.d_icount));
-				ASSERT(dqp->q_res_rtbcount >=
-					be64_to_cpu(dqp->q_core.d_rtbcount));
+				ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+				ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+				ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
 			}
 		}
 	}
···
 		if (qtrx->qt_blk_res) {
 			xfs_dqlock(dqp);
 			locked = true;
-			dqp->q_res_bcount -=
+			dqp->q_blk.reserved -=
 				(xfs_qcnt_t)qtrx->qt_blk_res;
 		}
 		if (qtrx->qt_ino_res) {
···
 				xfs_dqlock(dqp);
 				locked = true;
 			}
-			dqp->q_res_icount -=
+			dqp->q_ino.reserved -=
 				(xfs_qcnt_t)qtrx->qt_ino_res;
 		}
 
···
 				xfs_dqlock(dqp);
 				locked = true;
 			}
-			dqp->q_res_rtbcount -=
+			dqp->q_rtb.reserved -=
 				(xfs_qcnt_t)qtrx->qt_rtblk_res;
 		}
 		if (locked)
···
 	struct xfs_dquot	*dqp,
 	int			type)
 {
-	enum quota_type qtype;
+	enum quota_type		qtype;
 
-	if (dqp->dq_flags & XFS_DQ_PROJ)
+	switch (xfs_dquot_type(dqp)) {
+	case XFS_DQTYPE_PROJ:
 		qtype = PRJQUOTA;
-	else if (dqp->dq_flags & XFS_DQ_USER)
+		break;
+	case XFS_DQTYPE_USER:
 		qtype = USRQUOTA;
-	else
+		break;
+	case XFS_DQTYPE_GROUP:
 		qtype = GRPQUOTA;
+		break;
+	default:
+		return;
+	}
 
-	quota_send_warning(make_kqid(&init_user_ns, qtype,
-				     be32_to_cpu(dqp->q_core.d_id)),
+	quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
 			   mp->m_super->s_dev, type);
+}
+
+/*
+ * Decide if we can make an additional reservation against a quota resource.
+ * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal.
+ *
+ * Note that we assume that the numeric difference between the inode and block
+ * warning codes will always be 3 since it's userspace ABI now, and will never
+ * decrease the quota reservation, so the *BELOW messages are irrelevant.
+ */
+static inline int
+xfs_dqresv_check(
+	struct xfs_dquot_res	*res,
+	struct xfs_quota_limits	*qlim,
+	int64_t			delta,
+	bool			*fatal)
+{
+	xfs_qcnt_t		hardlimit = res->hardlimit;
+	xfs_qcnt_t		softlimit = res->softlimit;
+	xfs_qcnt_t		total_count = res->reserved + delta;
+
+	BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3);
+	BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3);
+	BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3);
+
+	*fatal = false;
+	if (delta <= 0)
+		return QUOTA_NL_NOWARN;
+
+	if (!hardlimit)
+		hardlimit = qlim->hard;
+	if (!softlimit)
+		softlimit = qlim->soft;
+
+	if (hardlimit && total_count > hardlimit) {
+		*fatal = true;
+		return QUOTA_NL_IHARDWARN;
+	}
+
+	if (softlimit && total_count > softlimit) {
+		time64_t	now = ktime_get_real_seconds();
+
+		if ((res->timer != 0 && now > res->timer) ||
+		    (res->warnings != 0 && res->warnings >= qlim->warn)) {
+			*fatal = true;
+			return QUOTA_NL_ISOFTLONGWARN;
+		}
+
+		res->warnings++;
+		return QUOTA_NL_ISOFTWARN;
+	}
+
+	return QUOTA_NL_NOWARN;
 }
 
 /*
···
 	long			ninos,
 	uint			flags)
 {
-	xfs_qcnt_t		hardlimit;
-	xfs_qcnt_t		softlimit;
-	time64_t		timer;
-	xfs_qwarncnt_t		warns;
-	xfs_qwarncnt_t		warnlimit;
-	xfs_qcnt_t		total_count;
-	xfs_qcnt_t		*resbcountp;
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
 	struct xfs_def_quota	*defq;
-
+	struct xfs_dquot_res	*blkres;
+	struct xfs_quota_limits	*qlim;
 
 	xfs_dqlock(dqp);
 
 	defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
 
 	if (flags & XFS_TRANS_DQ_RES_BLKS) {
-		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-		if (!hardlimit)
-			hardlimit = defq->bhardlimit;
-		softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
-		if (!softlimit)
-			softlimit = defq->bsoftlimit;
-		timer = be32_to_cpu(dqp->q_core.d_btimer);
-		warns = be16_to_cpu(dqp->q_core.d_bwarns);
-		warnlimit = defq->bwarnlimit;
-		resbcountp = &dqp->q_res_bcount;
+		blkres = &dqp->q_blk;
+		qlim = &defq->blk;
 	} else {
-		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
-		hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
-		if (!hardlimit)
-			hardlimit = defq->rtbhardlimit;
-		softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
-		if (!softlimit)
-			softlimit = defq->rtbsoftlimit;
-		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
-		warnlimit = defq->rtbwarnlimit;
-		resbcountp = &dqp->q_res_rtbcount;
+		blkres = &dqp->q_rtb;
+		qlim = &defq->rtb;
 	}
 
-	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
-	    dqp->q_core.d_id &&
-	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
-	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
-		if (nblks > 0) {
+	if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
+	    xfs_dquot_is_enforced(dqp)) {
+		int		quota_nl;
+		bool		fatal;
+
+		/*
+		 * dquot is locked already. See if we'd go over the hardlimit
+		 * or exceed the timelimit if we'd reserve resources.
+		 */
+		quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal);
+		if (quota_nl != QUOTA_NL_NOWARN) {
 			/*
-			 * dquot is locked already. See if we'd go over the
-			 * hardlimit or exceed the timelimit if we allocate
-			 * nblks.
+			 * Quota block warning codes are 3 more than the inode
+			 * codes, which we check above.
 			 */
-			total_count = *resbcountp + nblks;
-			if (hardlimit && total_count > hardlimit) {
-				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+			xfs_quota_warn(mp, dqp, quota_nl + 3);
+			if (fatal)
 				goto error_return;
-			}
-			if (softlimit && total_count > softlimit) {
-				if ((timer != 0 &&
-				     ktime_get_real_seconds() > timer) ||
-				    (warns != 0 && warns >= warnlimit)) {
-					xfs_quota_warn(mp, dqp,
-						       QUOTA_NL_BSOFTLONGWARN);
-					goto error_return;
-				}
-
-				xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
-			}
 		}
-		if (ninos > 0) {
-			total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
-			timer = be32_to_cpu(dqp->q_core.d_itimer);
-			warns = be16_to_cpu(dqp->q_core.d_iwarns);
-			warnlimit = defq->iwarnlimit;
-			hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-			if (!hardlimit)
-				hardlimit = defq->ihardlimit;
-			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-			if (!softlimit)
-				softlimit = defq->isoftlimit;
 
-			if (hardlimit && total_count > hardlimit) {
-				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+		quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos,
+				&fatal);
+		if (quota_nl != QUOTA_NL_NOWARN) {
+			xfs_quota_warn(mp, dqp, quota_nl);
+			if (fatal)
 				goto error_return;
-			}
-			if (softlimit && total_count > softlimit) {
-				if ((timer != 0 &&
-				     ktime_get_real_seconds() > timer) ||
-				    (warns != 0 && warns >= warnlimit)) {
-					xfs_quota_warn(mp, dqp,
-						       QUOTA_NL_ISOFTLONGWARN);
-					goto error_return;
-				}
-				xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
-			}
 		}
 	}
 
 	/*
 	 * Change the reservation, but not the actual usage.
-	 * Note that q_res_bcount = q_core.d_bcount + resv
+	 * Note that q_blk.reserved = q_blk.count + resv
 	 */
-	(*resbcountp) += (xfs_qcnt_t)nblks;
-	if (ninos != 0)
-		dqp->q_res_icount += (xfs_qcnt_t)ninos;
+	blkres->reserved += (xfs_qcnt_t)nblks;
+	dqp->q_ino.reserved += (xfs_qcnt_t)ninos;
 
 	/*
 	 * note the reservation amt in the trans struct too,
···
 					    XFS_TRANS_DQ_RES_INOS,
 					    ninos);
 	}
-	ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
-	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
-	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+	ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+	ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+	ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
 
 	xfs_dqunlock(dqp);
 	return 0;
 
 error_return:
 	xfs_dqunlock(dqp);
-	if (XFS_QM_ISPDQ(dqp))
+	if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
 		return -ENOSPC;
 	return -EDQUOT;
 }
···
 xfs_trans_alloc_dqinfo(
 	xfs_trans_t	*tp)
 {
-	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
+	tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+					 GFP_KERNEL | __GFP_NOFAIL);
 }
 
 void