Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'xfs-4.9-log-recovery-fixes' into for-next

+248 -98
+12 -11
fs/xfs/libxfs/xfs_alloc.c
··· 258 258 xfs_agblock_t wantbno, /* target starting block */ 259 259 xfs_extlen_t wantlen, /* target length */ 260 260 xfs_extlen_t alignment, /* target alignment */ 261 - char userdata, /* are we allocating data? */ 261 + int datatype, /* are we allocating data? */ 262 262 xfs_agblock_t freebno, /* freespace's starting block */ 263 263 xfs_extlen_t freelen, /* freespace's length */ 264 264 xfs_agblock_t *newbnop) /* result: best start block from free */ ··· 269 269 xfs_extlen_t newlen1=0; /* length with newbno1 */ 270 270 xfs_extlen_t newlen2=0; /* length with newbno2 */ 271 271 xfs_agblock_t wantend; /* end of target extent */ 272 + bool userdata = xfs_alloc_is_userdata(datatype); 272 273 273 274 ASSERT(freelen >= wantlen); 274 275 freeend = freebno + freelen; ··· 925 924 926 925 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 927 926 args->alignment, 928 - args->userdata, *sbnoa, 927 + args->datatype, *sbnoa, 929 928 *slena, &new); 930 929 931 930 /* ··· 1109 1108 if (args->len < blen) 1110 1109 continue; 1111 1110 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1112 - args->alignment, args->userdata, ltbnoa, 1111 + args->alignment, args->datatype, ltbnoa, 1113 1112 ltlena, &ltnew); 1114 1113 if (ltnew != NULLAGBLOCK && 1115 1114 (args->len > blen || ltdiff < bdiff)) { ··· 1262 1261 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1263 1262 xfs_alloc_fix_len(args); 1264 1263 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1265 - args->alignment, args->userdata, ltbnoa, 1264 + args->alignment, args->datatype, ltbnoa, 1266 1265 ltlena, &ltnew); 1267 1266 1268 1267 error = xfs_alloc_find_best_extent(args, ··· 1279 1278 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1280 1279 xfs_alloc_fix_len(args); 1281 1280 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1282 - args->alignment, args->userdata, gtbnoa, 1281 + args->alignment, args->datatype, gtbnoa, 1283 1282 gtlena, &gtnew); 1284 1283 1285 1284 error = 
xfs_alloc_find_best_extent(args, ··· 1339 1338 } 1340 1339 rlen = args->len; 1341 1340 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, 1342 - args->userdata, ltbnoa, ltlena, &ltnew); 1341 + args->datatype, ltbnoa, ltlena, &ltnew); 1343 1342 ASSERT(ltnew >= ltbno); 1344 1343 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1345 1344 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); ··· 1618 1617 goto error0; 1619 1618 if (fbno != NULLAGBLOCK) { 1620 1619 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, 1621 - args->userdata); 1620 + xfs_alloc_allow_busy_reuse(args->datatype)); 1622 1621 1623 - if (args->userdata) { 1622 + if (xfs_alloc_is_userdata(args->datatype)) { 1624 1623 xfs_buf_t *bp; 1625 1624 1626 1625 bp = xfs_btree_get_bufs(args->mp, args->tp, ··· 2100 2099 * somewhere else if we are not being asked to try harder at this 2101 2100 * point 2102 2101 */ 2103 - if (pag->pagf_metadata && args->userdata && 2102 + if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && 2104 2103 (flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2105 2104 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); 2106 2105 goto out_agbp_relse; ··· 2676 2675 * Try near allocation first, then anywhere-in-ag after 2677 2676 * the first a.g. fails. 2678 2677 */ 2679 - if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && 2678 + if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 2680 2679 (mp->m_flags & XFS_MOUNT_32BITINODES)) { 2681 2680 args->fsbno = XFS_AGB_TO_FSB(mp, 2682 2681 ((mp->m_agfrotor / rotorstep) % ··· 2809 2808 #endif 2810 2809 2811 2810 /* Zero the extent if we were asked to do so */ 2812 - if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { 2811 + if (args->datatype & XFS_ALLOC_USERDATA_ZERO) { 2813 2812 error = xfs_zero_extent(args->ip, args->fsbno, args->len); 2814 2813 if (error) 2815 2814 goto error0;
+15 -2
fs/xfs/libxfs/xfs_alloc.h
··· 85 85 xfs_extlen_t len; /* output: actual size of extent */ 86 86 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ 87 87 xfs_alloctype_t otype; /* original allocation type */ 88 + int datatype; /* mask defining data type treatment */ 88 89 char wasdel; /* set if allocation was prev delayed */ 89 90 char wasfromfl; /* set if allocation is from freelist */ 90 - char userdata; /* mask defining userdata treatment */ 91 91 xfs_fsblock_t firstblock; /* io first block allocated */ 92 92 struct xfs_owner_info oinfo; /* owner of blocks being allocated */ 93 93 enum xfs_ag_resv_type resv; /* block reservation to use */ 94 94 } xfs_alloc_arg_t; 95 95 96 96 /* 97 - * Defines for userdata 97 + * Defines for datatype 98 98 */ 99 99 #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ 100 100 #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ 101 101 #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ 102 + #define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ 103 + 104 + static inline bool 105 + xfs_alloc_is_userdata(int datatype) 106 + { 107 + return (datatype & ~XFS_ALLOC_NOBUSY) != 0; 108 + } 109 + 110 + static inline bool 111 + xfs_alloc_allow_busy_reuse(int datatype) 112 + { 113 + return (datatype & XFS_ALLOC_NOBUSY) == 0; 114 + } 102 115 103 116 /* freespace limit calculations */ 104 117 #define XFS_ALLOC_AGFL_RESERVE 4
+26 -15
fs/xfs/libxfs/xfs_bmap.c
··· 3348 3348 3349 3349 mp = ap->ip->i_mount; 3350 3350 nullfb = *ap->firstblock == NULLFSBLOCK; 3351 - rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; 3351 + rt = XFS_IS_REALTIME_INODE(ap->ip) && 3352 + xfs_alloc_is_userdata(ap->datatype); 3352 3353 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); 3353 3354 /* 3354 3355 * If allocating at eof, and there's a previous real block, ··· 3625 3624 { 3626 3625 xfs_mount_t *mp; /* mount point structure */ 3627 3626 xfs_alloctype_t atype = 0; /* type for allocation routines */ 3628 - xfs_extlen_t align; /* minimum allocation alignment */ 3627 + xfs_extlen_t align = 0; /* minimum allocation alignment */ 3629 3628 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 3630 3629 xfs_agnumber_t ag; 3631 3630 xfs_alloc_arg_t args; ··· 3648 3647 else if (mp->m_dalign) 3649 3648 stripe_align = mp->m_dalign; 3650 3649 3651 - align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 3650 + if (xfs_alloc_is_userdata(ap->datatype)) 3651 + align = xfs_get_extsz_hint(ap->ip); 3652 3652 if (unlikely(align)) { 3653 3653 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, 3654 3654 align, 0, ap->eof, 0, ap->conv, ··· 3662 3660 nullfb = *ap->firstblock == NULLFSBLOCK; 3663 3661 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); 3664 3662 if (nullfb) { 3665 - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { 3663 + if (xfs_alloc_is_userdata(ap->datatype) && 3664 + xfs_inode_is_filestream(ap->ip)) { 3666 3665 ag = xfs_filestream_lookup_ag(ap->ip); 3667 3666 ag = (ag != NULLAGNUMBER) ? ag : 0; 3668 3667 ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); ··· 3703 3700 * enough for the request. If one isn't found, then adjust 3704 3701 * the minimum allocation size to the largest space found. 
3705 3702 */ 3706 - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 3703 + if (xfs_alloc_is_userdata(ap->datatype) && 3704 + xfs_inode_is_filestream(ap->ip)) 3707 3705 error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); 3708 3706 else 3709 3707 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); ··· 3788 3784 args.minleft = ap->minleft; 3789 3785 args.wasdel = ap->wasdel; 3790 3786 args.resv = XFS_AG_RESV_NONE; 3791 - args.userdata = ap->userdata; 3792 - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) 3787 + args.datatype = ap->datatype; 3788 + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) 3793 3789 args.ip = ap->ip; 3794 3790 3795 3791 error = xfs_alloc_vextent(&args); ··· 3883 3879 xfs_bmap_alloc( 3884 3880 struct xfs_bmalloca *ap) /* bmap alloc argument struct */ 3885 3881 { 3886 - if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) 3882 + if (XFS_IS_REALTIME_INODE(ap->ip) && 3883 + xfs_alloc_is_userdata(ap->datatype)) 3887 3884 return xfs_bmap_rtalloc(ap); 3888 3885 return xfs_bmap_btalloc(ap); 3889 3886 } ··· 4209 4204 } 4210 4205 4211 4206 /* 4212 - * Indicate if this is the first user data in the file, or just any 4213 - * user data. And if it is userdata, indicate whether it needs to 4214 - * be initialised to zero during allocation. 4207 + * Set the data type being allocated. For the data fork, the first data 4208 + * in the file is treated differently to all other allocations. For the 4209 + * attribute fork, we only need to ensure the allocated range is not on 4210 + * the busy list. 4215 4211 */ 4216 4212 if (!(bma->flags & XFS_BMAPI_METADATA)) { 4217 - bma->userdata = (bma->offset == 0) ? 
4218 - XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; 4213 + bma->datatype = XFS_ALLOC_NOBUSY; 4214 + if (whichfork == XFS_DATA_FORK) { 4215 + if (bma->offset == 0) 4216 + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; 4217 + else 4218 + bma->datatype |= XFS_ALLOC_USERDATA; 4219 + } 4219 4220 if (bma->flags & XFS_BMAPI_ZERO) 4220 - bma->userdata |= XFS_ALLOC_USERDATA_ZERO; 4221 + bma->datatype |= XFS_ALLOC_USERDATA_ZERO; 4221 4222 } 4222 4223 4223 4224 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; ··· 4493 4482 bma.tp = tp; 4494 4483 bma.ip = ip; 4495 4484 bma.total = total; 4496 - bma.userdata = 0; 4485 + bma.datatype = 0; 4497 4486 bma.dfops = dfops; 4498 4487 bma.firstblock = firstblock; 4499 4488
+1 -1
fs/xfs/libxfs/xfs_bmap.h
··· 54 54 bool wasdel; /* replacing a delayed allocation */ 55 55 bool aeof; /* allocated space at eof */ 56 56 bool conv; /* overwriting unwritten extents */ 57 - char userdata;/* userdata mask */ 57 + int datatype;/* data type being allocated */ 58 58 int flags; 59 59 }; 60 60
+1 -1
fs/xfs/xfs_bmap_util.c
··· 182 182 XFS_TRANS_DQ_RTBCOUNT, (long) ralen); 183 183 184 184 /* Zero the extent if we were asked to do so */ 185 - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { 185 + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) { 186 186 error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); 187 187 if (error) 188 188 return error;
+1 -1
fs/xfs/xfs_extent_busy.c
··· 384 384 * If this is a metadata allocation, try to reuse the busy 385 385 * extent instead of trimming the allocation. 386 386 */ 387 - if (!args->userdata && 387 + if (!xfs_alloc_is_userdata(args->datatype) && 388 388 !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { 389 389 if (!xfs_extent_busy_update_extent(args->mp, args->pag, 390 390 busyp, fbno, flen,
+6 -3
fs/xfs/xfs_filestream.c
··· 371 371 struct xfs_mount *mp = ip->i_mount; 372 372 xfs_extlen_t minlen = ap->length; 373 373 xfs_agnumber_t startag = 0; 374 - int flags, err = 0; 374 + int flags = 0; 375 + int err = 0; 375 376 struct xfs_mru_cache_elem *mru; 376 377 377 378 *agp = NULLAGNUMBER; ··· 388 387 startag = (item->ag + 1) % mp->m_sb.sb_agcount; 389 388 } 390 389 391 - flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | 392 - (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); 390 + if (xfs_alloc_is_userdata(ap->datatype)) 391 + flags |= XFS_PICK_USERDATA; 392 + if (ap->dfops->dop_low) 393 + flags |= XFS_PICK_LOWSPACE; 393 394 394 395 err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); 395 396
+2 -1
fs/xfs/xfs_log_priv.h
··· 413 413 /* log record crc error injection factor */ 414 414 uint32_t l_badcrc_factor; 415 415 #endif 416 - 416 + /* log recovery lsn tracking (for buffer submission) */ 417 + xfs_lsn_t l_recovery_lsn; 417 418 }; 418 419 419 420 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+135 -56
fs/xfs/xfs_log_recover.c
··· 44 44 #include "xfs_error.h" 45 45 #include "xfs_dir2.h" 46 46 #include "xfs_rmap_item.h" 47 + #include "xfs_buf_item.h" 47 48 48 49 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) 49 50 ··· 382 381 SHUTDOWN_META_IO_ERROR); 383 382 } 384 383 } 384 + 385 + /* 386 + * On v5 supers, a bli could be attached to update the metadata LSN. 387 + * Clean it up. 388 + */ 389 + if (bp->b_fspriv) 390 + xfs_buf_item_relse(bp); 391 + ASSERT(bp->b_fspriv == NULL); 392 + 385 393 bp->b_iodone = NULL; 386 394 xfs_buf_ioend(bp); 387 395 } ··· 2370 2360 xlog_recover_validate_buf_type( 2371 2361 struct xfs_mount *mp, 2372 2362 struct xfs_buf *bp, 2373 - xfs_buf_log_format_t *buf_f) 2363 + xfs_buf_log_format_t *buf_f, 2364 + xfs_lsn_t current_lsn) 2374 2365 { 2375 2366 struct xfs_da_blkinfo *info = bp->b_addr; 2376 2367 __uint32_t magic32; 2377 2368 __uint16_t magic16; 2378 2369 __uint16_t magicda; 2370 + char *warnmsg = NULL; 2379 2371 2380 2372 /* 2381 2373 * We can only do post recovery validation on items on CRC enabled ··· 2416 2404 bp->b_ops = &xfs_rmapbt_buf_ops; 2417 2405 break; 2418 2406 default: 2419 - xfs_warn(mp, "Bad btree block magic!"); 2420 - ASSERT(0); 2407 + warnmsg = "Bad btree block magic!"; 2421 2408 break; 2422 2409 } 2423 2410 break; 2424 2411 case XFS_BLFT_AGF_BUF: 2425 2412 if (magic32 != XFS_AGF_MAGIC) { 2426 - xfs_warn(mp, "Bad AGF block magic!"); 2427 - ASSERT(0); 2413 + warnmsg = "Bad AGF block magic!"; 2428 2414 break; 2429 2415 } 2430 2416 bp->b_ops = &xfs_agf_buf_ops; 2431 2417 break; 2432 2418 case XFS_BLFT_AGFL_BUF: 2433 2419 if (magic32 != XFS_AGFL_MAGIC) { 2434 - xfs_warn(mp, "Bad AGFL block magic!"); 2435 - ASSERT(0); 2420 + warnmsg = "Bad AGFL block magic!"; 2436 2421 break; 2437 2422 } 2438 2423 bp->b_ops = &xfs_agfl_buf_ops; 2439 2424 break; 2440 2425 case XFS_BLFT_AGI_BUF: 2441 2426 if (magic32 != XFS_AGI_MAGIC) { 2442 - xfs_warn(mp, "Bad AGI block magic!"); 2443 - ASSERT(0); 2427 + warnmsg = "Bad AGI block magic!"; 2444 2428 break; 2445 2429 } 
2446 2430 bp->b_ops = &xfs_agi_buf_ops; ··· 2446 2438 case XFS_BLFT_GDQUOT_BUF: 2447 2439 #ifdef CONFIG_XFS_QUOTA 2448 2440 if (magic16 != XFS_DQUOT_MAGIC) { 2449 - xfs_warn(mp, "Bad DQUOT block magic!"); 2450 - ASSERT(0); 2441 + warnmsg = "Bad DQUOT block magic!"; 2451 2442 break; 2452 2443 } 2453 2444 bp->b_ops = &xfs_dquot_buf_ops; ··· 2458 2451 break; 2459 2452 case XFS_BLFT_DINO_BUF: 2460 2453 if (magic16 != XFS_DINODE_MAGIC) { 2461 - xfs_warn(mp, "Bad INODE block magic!"); 2462 - ASSERT(0); 2454 + warnmsg = "Bad INODE block magic!"; 2463 2455 break; 2464 2456 } 2465 2457 bp->b_ops = &xfs_inode_buf_ops; 2466 2458 break; 2467 2459 case XFS_BLFT_SYMLINK_BUF: 2468 2460 if (magic32 != XFS_SYMLINK_MAGIC) { 2469 - xfs_warn(mp, "Bad symlink block magic!"); 2470 - ASSERT(0); 2461 + warnmsg = "Bad symlink block magic!"; 2471 2462 break; 2472 2463 } 2473 2464 bp->b_ops = &xfs_symlink_buf_ops; ··· 2473 2468 case XFS_BLFT_DIR_BLOCK_BUF: 2474 2469 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 2475 2470 magic32 != XFS_DIR3_BLOCK_MAGIC) { 2476 - xfs_warn(mp, "Bad dir block magic!"); 2477 - ASSERT(0); 2471 + warnmsg = "Bad dir block magic!"; 2478 2472 break; 2479 2473 } 2480 2474 bp->b_ops = &xfs_dir3_block_buf_ops; ··· 2481 2477 case XFS_BLFT_DIR_DATA_BUF: 2482 2478 if (magic32 != XFS_DIR2_DATA_MAGIC && 2483 2479 magic32 != XFS_DIR3_DATA_MAGIC) { 2484 - xfs_warn(mp, "Bad dir data magic!"); 2485 - ASSERT(0); 2480 + warnmsg = "Bad dir data magic!"; 2486 2481 break; 2487 2482 } 2488 2483 bp->b_ops = &xfs_dir3_data_buf_ops; ··· 2489 2486 case XFS_BLFT_DIR_FREE_BUF: 2490 2487 if (magic32 != XFS_DIR2_FREE_MAGIC && 2491 2488 magic32 != XFS_DIR3_FREE_MAGIC) { 2492 - xfs_warn(mp, "Bad dir3 free magic!"); 2493 - ASSERT(0); 2489 + warnmsg = "Bad dir3 free magic!"; 2494 2490 break; 2495 2491 } 2496 2492 bp->b_ops = &xfs_dir3_free_buf_ops; ··· 2497 2495 case XFS_BLFT_DIR_LEAF1_BUF: 2498 2496 if (magicda != XFS_DIR2_LEAF1_MAGIC && 2499 2497 magicda != XFS_DIR3_LEAF1_MAGIC) { 2500 - xfs_warn(mp, 
"Bad dir leaf1 magic!"); 2501 - ASSERT(0); 2498 + warnmsg = "Bad dir leaf1 magic!"; 2502 2499 break; 2503 2500 } 2504 2501 bp->b_ops = &xfs_dir3_leaf1_buf_ops; ··· 2505 2504 case XFS_BLFT_DIR_LEAFN_BUF: 2506 2505 if (magicda != XFS_DIR2_LEAFN_MAGIC && 2507 2506 magicda != XFS_DIR3_LEAFN_MAGIC) { 2508 - xfs_warn(mp, "Bad dir leafn magic!"); 2509 - ASSERT(0); 2507 + warnmsg = "Bad dir leafn magic!"; 2510 2508 break; 2511 2509 } 2512 2510 bp->b_ops = &xfs_dir3_leafn_buf_ops; ··· 2513 2513 case XFS_BLFT_DA_NODE_BUF: 2514 2514 if (magicda != XFS_DA_NODE_MAGIC && 2515 2515 magicda != XFS_DA3_NODE_MAGIC) { 2516 - xfs_warn(mp, "Bad da node magic!"); 2517 - ASSERT(0); 2516 + warnmsg = "Bad da node magic!"; 2518 2517 break; 2519 2518 } 2520 2519 bp->b_ops = &xfs_da3_node_buf_ops; ··· 2521 2522 case XFS_BLFT_ATTR_LEAF_BUF: 2522 2523 if (magicda != XFS_ATTR_LEAF_MAGIC && 2523 2524 magicda != XFS_ATTR3_LEAF_MAGIC) { 2524 - xfs_warn(mp, "Bad attr leaf magic!"); 2525 - ASSERT(0); 2525 + warnmsg = "Bad attr leaf magic!"; 2526 2526 break; 2527 2527 } 2528 2528 bp->b_ops = &xfs_attr3_leaf_buf_ops; 2529 2529 break; 2530 2530 case XFS_BLFT_ATTR_RMT_BUF: 2531 2531 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 2532 - xfs_warn(mp, "Bad attr remote magic!"); 2533 - ASSERT(0); 2532 + warnmsg = "Bad attr remote magic!"; 2534 2533 break; 2535 2534 } 2536 2535 bp->b_ops = &xfs_attr3_rmt_buf_ops; 2537 2536 break; 2538 2537 case XFS_BLFT_SB_BUF: 2539 2538 if (magic32 != XFS_SB_MAGIC) { 2540 - xfs_warn(mp, "Bad SB block magic!"); 2541 - ASSERT(0); 2539 + warnmsg = "Bad SB block magic!"; 2542 2540 break; 2543 2541 } 2544 2542 bp->b_ops = &xfs_sb_buf_ops; ··· 2552 2556 xfs_blft_from_flags(buf_f)); 2553 2557 break; 2554 2558 } 2559 + 2560 + /* 2561 + * Nothing else to do in the case of a NULL current LSN as this means 2562 + * the buffer is more recent than the change in the log and will be 2563 + * skipped. 
2564 + */ 2565 + if (current_lsn == NULLCOMMITLSN) 2566 + return; 2567 + 2568 + if (warnmsg) { 2569 + xfs_warn(mp, warnmsg); 2570 + ASSERT(0); 2571 + } 2572 + 2573 + /* 2574 + * We must update the metadata LSN of the buffer as it is written out to 2575 + * ensure that older transactions never replay over this one and corrupt 2576 + * the buffer. This can occur if log recovery is interrupted at some 2577 + * point after the current transaction completes, at which point a 2578 + * subsequent mount starts recovery from the beginning. 2579 + * 2580 + * Write verifiers update the metadata LSN from log items attached to 2581 + * the buffer. Therefore, initialize a bli purely to carry the LSN to 2582 + * the verifier. We'll clean it up in our ->iodone() callback. 2583 + */ 2584 + if (bp->b_ops) { 2585 + struct xfs_buf_log_item *bip; 2586 + 2587 + ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); 2588 + bp->b_iodone = xlog_recover_iodone; 2589 + xfs_buf_item_init(bp, mp); 2590 + bip = bp->b_fspriv; 2591 + bip->bli_item.li_lsn = current_lsn; 2592 + } 2555 2593 } 2556 2594 2557 2595 /* ··· 2599 2569 struct xfs_mount *mp, 2600 2570 xlog_recover_item_t *item, 2601 2571 struct xfs_buf *bp, 2602 - xfs_buf_log_format_t *buf_f) 2572 + xfs_buf_log_format_t *buf_f, 2573 + xfs_lsn_t current_lsn) 2603 2574 { 2604 2575 int i; 2605 2576 int bit; ··· 2673 2642 /* Shouldn't be any more regions */ 2674 2643 ASSERT(i == item->ri_total); 2675 2644 2676 - xlog_recover_validate_buf_type(mp, bp, buf_f); 2645 + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); 2677 2646 } 2678 2647 2679 2648 /* ··· 2716 2685 if (log->l_quotaoffs_flag & type) 2717 2686 return false; 2718 2687 2719 - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2688 + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); 2720 2689 return true; 2721 2690 } 2722 2691 ··· 2804 2773 */ 2805 2774 lsn = xlog_recover_get_buf_lsn(mp, bp); 2806 2775 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) 
>= 0) { 2807 - xlog_recover_validate_buf_type(mp, bp, buf_f); 2776 + trace_xfs_log_recover_buf_skip(log, buf_f); 2777 + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); 2808 2778 goto out_release; 2809 2779 } 2810 2780 ··· 2821 2789 if (!dirty) 2822 2790 goto out_release; 2823 2791 } else { 2824 - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2792 + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); 2825 2793 } 2826 2794 2827 2795 /* ··· 3878 3846 xlog_recover_commit_trans( 3879 3847 struct xlog *log, 3880 3848 struct xlog_recover *trans, 3881 - int pass) 3849 + int pass, 3850 + struct list_head *buffer_list) 3882 3851 { 3883 3852 int error = 0; 3884 - int error2; 3885 3853 int items_queued = 0; 3886 3854 struct xlog_recover_item *item; 3887 3855 struct xlog_recover_item *next; 3888 - LIST_HEAD (buffer_list); 3889 3856 LIST_HEAD (ra_list); 3890 3857 LIST_HEAD (done_list); 3891 3858 ··· 3907 3876 items_queued++; 3908 3877 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { 3909 3878 error = xlog_recover_items_pass2(log, trans, 3910 - &buffer_list, &ra_list); 3879 + buffer_list, &ra_list); 3911 3880 list_splice_tail_init(&ra_list, &done_list); 3912 3881 items_queued = 0; 3913 3882 } ··· 3925 3894 if (!list_empty(&ra_list)) { 3926 3895 if (!error) 3927 3896 error = xlog_recover_items_pass2(log, trans, 3928 - &buffer_list, &ra_list); 3897 + buffer_list, &ra_list); 3929 3898 list_splice_tail_init(&ra_list, &done_list); 3930 3899 } 3931 3900 3932 3901 if (!list_empty(&done_list)) 3933 3902 list_splice_init(&done_list, &trans->r_itemq); 3934 3903 3935 - error2 = xfs_buf_delwri_submit(&buffer_list); 3936 - return error ? 
error : error2; 3904 + return error; 3937 3905 } 3938 3906 3939 3907 STATIC void ··· 4115 4085 char *dp, 4116 4086 unsigned int len, 4117 4087 unsigned int flags, 4118 - int pass) 4088 + int pass, 4089 + struct list_head *buffer_list) 4119 4090 { 4120 4091 int error = 0; 4121 4092 bool freeit = false; ··· 4140 4109 error = xlog_recover_add_to_cont_trans(log, trans, dp, len); 4141 4110 break; 4142 4111 case XLOG_COMMIT_TRANS: 4143 - error = xlog_recover_commit_trans(log, trans, pass); 4112 + error = xlog_recover_commit_trans(log, trans, pass, 4113 + buffer_list); 4144 4114 /* success or fail, we are now done with this transaction. */ 4145 4115 freeit = true; 4146 4116 break; ··· 4223 4191 struct xlog_op_header *ohead, 4224 4192 char *dp, 4225 4193 char *end, 4226 - int pass) 4194 + int pass, 4195 + struct list_head *buffer_list) 4227 4196 { 4228 4197 struct xlog_recover *trans; 4229 4198 unsigned int len; 4199 + int error; 4230 4200 4231 4201 /* Do we understand who wrote this op? */ 4232 4202 if (ohead->oh_clientid != XFS_TRANSACTION && ··· 4255 4221 return 0; 4256 4222 } 4257 4223 4224 + /* 4225 + * The recovered buffer queue is drained only once we know that all 4226 + * recovery items for the current LSN have been processed. This is 4227 + * required because: 4228 + * 4229 + * - Buffer write submission updates the metadata LSN of the buffer. 4230 + * - Log recovery skips items with a metadata LSN >= the current LSN of 4231 + * the recovery item. 4232 + * - Separate recovery items against the same metadata buffer can share 4233 + * a current LSN. I.e., consider that the LSN of a recovery item is 4234 + * defined as the starting LSN of the first record in which its 4235 + * transaction appears, that a record can hold multiple transactions, 4236 + * and/or that a transaction can span multiple records. 4237 + * 4238 + * In other words, we are allowed to submit a buffer from log recovery 4239 + * once per current LSN. 
Otherwise, we may incorrectly skip recovery 4240 + * items and cause corruption. 4241 + * 4242 + * We don't know up front whether buffers are updated multiple times per 4243 + * LSN. Therefore, track the current LSN of each commit log record as it 4244 + * is processed and drain the queue when it changes. Use commit records 4245 + * because they are ordered correctly by the logging code. 4246 + */ 4247 + if (log->l_recovery_lsn != trans->r_lsn && 4248 + ohead->oh_flags & XLOG_COMMIT_TRANS) { 4249 + error = xfs_buf_delwri_submit(buffer_list); 4250 + if (error) 4251 + return error; 4252 + log->l_recovery_lsn = trans->r_lsn; 4253 + } 4254 + 4258 4255 return xlog_recovery_process_trans(log, trans, dp, len, 4259 - ohead->oh_flags, pass); 4256 + ohead->oh_flags, pass, buffer_list); 4260 4257 } 4261 4258 4262 4259 /* ··· 4305 4240 struct hlist_head rhash[], 4306 4241 struct xlog_rec_header *rhead, 4307 4242 char *dp, 4308 - int pass) 4243 + int pass, 4244 + struct list_head *buffer_list) 4309 4245 { 4310 4246 struct xlog_op_header *ohead; 4311 4247 char *end; ··· 4320 4254 if (xlog_header_check_recover(log->l_mp, rhead)) 4321 4255 return -EIO; 4322 4256 4257 + trace_xfs_log_recover_record(log, rhead, pass); 4323 4258 while ((dp < end) && num_logops) { 4324 4259 4325 4260 ohead = (struct xlog_op_header *)dp; ··· 4329 4262 4330 4263 /* errors will abort recovery */ 4331 4264 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, 4332 - dp, end, pass); 4265 + dp, end, pass, buffer_list); 4333 4266 if (error) 4334 4267 return error; 4335 4268 ··· 4752 4685 struct hlist_head rhash[], 4753 4686 struct xlog_rec_header *rhead, 4754 4687 char *dp, 4755 - int pass) 4688 + int pass, 4689 + struct list_head *buffer_list) 4756 4690 { 4757 4691 int error; 4758 4692 __le32 crc; ··· 4800 4732 if (error) 4801 4733 return error; 4802 4734 4803 - return xlog_recover_process_data(log, rhash, rhead, dp, pass); 4735 + return xlog_recover_process_data(log, rhash, rhead, dp, pass, 4736 + 
buffer_list); 4804 4737 } 4805 4738 4806 4739 STATIC int ··· 4862 4793 char *offset; 4863 4794 xfs_buf_t *hbp, *dbp; 4864 4795 int error = 0, h_size, h_len; 4796 + int error2 = 0; 4865 4797 int bblks, split_bblks; 4866 4798 int hblks, split_hblks, wrapped_hblks; 4867 4799 struct hlist_head rhash[XLOG_RHASH_SIZE]; 4800 + LIST_HEAD (buffer_list); 4868 4801 4869 4802 ASSERT(head_blk != tail_blk); 4870 4803 rhead_blk = 0; ··· 5052 4981 } 5053 4982 5054 4983 error = xlog_recover_process(log, rhash, rhead, offset, 5055 - pass); 4984 + pass, &buffer_list); 5056 4985 if (error) 5057 4986 goto bread_err2; 5058 4987 ··· 5083 5012 if (error) 5084 5013 goto bread_err2; 5085 5014 5086 - error = xlog_recover_process(log, rhash, rhead, offset, pass); 5015 + error = xlog_recover_process(log, rhash, rhead, offset, pass, 5016 + &buffer_list); 5087 5017 if (error) 5088 5018 goto bread_err2; 5089 5019 ··· 5097 5025 bread_err1: 5098 5026 xlog_put_bp(hbp); 5099 5027 5028 + /* 5029 + * Submit buffers that have been added from the last record processed, 5030 + * regardless of error status. 5031 + */ 5032 + if (!list_empty(&buffer_list)) 5033 + error2 = xfs_buf_delwri_submit(&buffer_list); 5034 + 5100 5035 if (error && first_bad) 5101 5036 *first_bad = rhead_blk; 5102 5037 5103 - return error; 5038 + return error ? error : error2; 5104 5039 } 5105 5040 5106 5041 /*
+14
fs/xfs/xfs_mount.c
··· 934 934 } 935 935 936 936 /* 937 + * Now the log is fully replayed, we can transition to full read-only 938 + * mode for read-only mounts. This will sync all the metadata and clean 939 + * the log so that the recovery we just performed does not have to be 940 + * replayed again on the next mount. 941 + * 942 + * We use the same quiesce mechanism as the rw->ro remount, as they are 943 + * semantically identical operations. 944 + */ 945 + if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == 946 + XFS_MOUNT_RDONLY) { 947 + xfs_quiesce_attr(mp); 948 + } 949 + 950 + /* 937 951 * Complete the quota initialisation, post-log-replay component. 938 952 */ 939 953 if (quotamount) {
+1 -1
fs/xfs/xfs_super.c
··· 1137 1137 * Note: xfs_log_quiesce() stops background log work - the callers must ensure 1138 1138 * it is started again when appropriate. 1139 1139 */ 1140 - static void 1140 + void 1141 1141 xfs_quiesce_attr( 1142 1142 struct xfs_mount *mp) 1143 1143 {
+1
fs/xfs/xfs_super.h
··· 61 61 struct xfs_buftarg; 62 62 struct block_device; 63 63 64 + extern void xfs_quiesce_attr(struct xfs_mount *mp); 64 65 extern void xfs_flush_inodes(struct xfs_mount *mp); 65 66 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 66 67 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+33 -6
fs/xfs/xfs_trace.h
··· 1624 1624 __field(char, wasdel) 1625 1625 __field(char, wasfromfl) 1626 1626 __field(int, resv) 1627 - __field(char, userdata) 1627 + __field(int, datatype) 1628 1628 __field(xfs_fsblock_t, firstblock) 1629 1629 ), 1630 1630 TP_fast_assign( ··· 1645 1645 __entry->wasdel = args->wasdel; 1646 1646 __entry->wasfromfl = args->wasfromfl; 1647 1647 __entry->resv = args->resv; 1648 - __entry->userdata = args->userdata; 1648 + __entry->datatype = args->datatype; 1649 1649 __entry->firstblock = args->firstblock; 1650 1650 ), 1651 1651 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " 1652 1652 "prod %u minleft %u total %u alignment %u minalignslop %u " 1653 1653 "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " 1654 - "userdata %d firstblock 0x%llx", 1654 + "datatype 0x%x firstblock 0x%llx", 1655 1655 MAJOR(__entry->dev), MINOR(__entry->dev), 1656 1656 __entry->agno, 1657 1657 __entry->agbno, ··· 1669 1669 __entry->wasdel, 1670 1670 __entry->wasfromfl, 1671 1671 __entry->resv, 1672 - __entry->userdata, 1672 + __entry->datatype, 1673 1673 (unsigned long long)__entry->firstblock) 1674 1674 ) 1675 1675 ··· 1985 1985 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1986 1986 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1987 1987 1988 + TRACE_EVENT(xfs_log_recover_record, 1989 + TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), 1990 + TP_ARGS(log, rhead, pass), 1991 + TP_STRUCT__entry( 1992 + __field(dev_t, dev) 1993 + __field(xfs_lsn_t, lsn) 1994 + __field(int, len) 1995 + __field(int, num_logops) 1996 + __field(int, pass) 1997 + ), 1998 + TP_fast_assign( 1999 + __entry->dev = log->l_mp->m_super->s_dev; 2000 + __entry->lsn = be64_to_cpu(rhead->h_lsn); 2001 + __entry->len = be32_to_cpu(rhead->h_len); 2002 + __entry->num_logops = be32_to_cpu(rhead->h_num_logops); 2003 + __entry->pass = pass; 2004 + ), 2005 + TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d", 2006 + MAJOR(__entry->dev), MINOR(__entry->dev), 2007 + 
__entry->lsn, __entry->len, __entry->num_logops, 2008 + __entry->pass) 2009 + ) 2010 + 1988 2011 DECLARE_EVENT_CLASS(xfs_log_recover_item_class, 1989 2012 TP_PROTO(struct xlog *log, struct xlog_recover *trans, 1990 2013 struct xlog_recover_item *item, int pass), ··· 2016 1993 __field(dev_t, dev) 2017 1994 __field(unsigned long, item) 2018 1995 __field(xlog_tid_t, tid) 1996 + __field(xfs_lsn_t, lsn) 2019 1997 __field(int, type) 2020 1998 __field(int, pass) 2021 1999 __field(int, count) ··· 2026 2002 __entry->dev = log->l_mp->m_super->s_dev; 2027 2003 __entry->item = (unsigned long)item; 2028 2004 __entry->tid = trans->r_log_tid; 2005 + __entry->lsn = trans->r_lsn; 2029 2006 __entry->type = ITEM_TYPE(item); 2030 2007 __entry->pass = pass; 2031 2008 __entry->count = item->ri_cnt; 2032 2009 __entry->total = item->ri_total; 2033 2010 ), 2034 - TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " 2035 - "item region count/total %d/%d", 2011 + TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " 2012 + "item type %s item region count/total %d/%d", 2036 2013 MAJOR(__entry->dev), MINOR(__entry->dev), 2037 2014 __entry->tid, 2015 + __entry->lsn, 2038 2016 __entry->pass, 2039 2017 (void *)__entry->item, 2040 2018 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), ··· 2095 2069 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); 2096 2070 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); 2097 2071 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); 2072 + DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip); 2098 2073 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); 2099 2074 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); 2100 2075 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);