fs/xfs/xfs_log_recover.c at v5.5-rc4

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / xfs / xfs_log_recover.c
at v5.5-rc4 5857 lines 166 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_sb.h"
  14#include "xfs_mount.h"
  15#include "xfs_defer.h"
  16#include "xfs_inode.h"
  17#include "xfs_trans.h"
  18#include "xfs_log.h"
  19#include "xfs_log_priv.h"
  20#include "xfs_log_recover.h"
  21#include "xfs_inode_item.h"
  22#include "xfs_extfree_item.h"
  23#include "xfs_trans_priv.h"
  24#include "xfs_alloc.h"
  25#include "xfs_ialloc.h"
  26#include "xfs_quota.h"
  27#include "xfs_trace.h"
  28#include "xfs_icache.h"
  29#include "xfs_bmap_btree.h"
  30#include "xfs_error.h"
  31#include "xfs_dir2.h"
  32#include "xfs_rmap_item.h"
  33#include "xfs_buf_item.h"
  34#include "xfs_refcount_item.h"
  35#include "xfs_bmap_item.h"
  36
  37#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
  38
/*
 * Forward declarations so these routines can be referenced before their
 * definitions appear.
 */
STATIC int
xlog_find_zeroed(
	struct xlog	*,
	xfs_daddr_t	*);
STATIC int
xlog_clear_stale_blocks(
	struct xlog	*,
	xfs_lsn_t);
#if defined(DEBUG)
STATIC void
xlog_recover_check_summary(
	struct xlog *);
#else
/* Without DEBUG the summary check expands to nothing. */
#define	xlog_recover_check_summary(log)
#endif
STATIC int
xlog_do_recovery_pass(
        struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
  57
/*
 * This structure is used during recovery to record the buf log items which
 * have been canceled and should not be replayed.
 */
struct xfs_buf_cancel {
	/* block address of the canceled buffer */
	xfs_daddr_t		bc_blkno;
	/* length of the canceled buffer; presumably in basic blocks - confirm */
	uint			bc_len;
	/* reference count; NOTE(review): users are not visible in this chunk */
	int			bc_refcount;
	/* list linkage for this record */
	struct list_head	bc_list;
};
  68
  69/*
  70 * Sector aligned buffer routines for buffer create/read/write/access
  71 */
  72
  73/*
  74 * Verify the log-relative block number and length in basic blocks are valid for
  75 * an operation involving the given XFS log buffer. Returns true if the fields
  76 * are valid, false otherwise.
  77 */
  78static inline bool
  79xlog_verify_bno(
  80	struct xlog	*log,
  81	xfs_daddr_t	blk_no,
  82	int		bbcount)
  83{
  84	if (blk_no < 0 || blk_no >= log->l_logBBsize)
  85		return false;
  86	if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
  87		return false;
  88	return true;
  89}
  90
  91/*
  92 * Allocate a buffer to hold log data.  The buffer needs to be able to map to
  93 * a range of nbblks basic blocks at any valid offset within the log.
  94 */
  95static char *
  96xlog_alloc_buffer(
  97	struct xlog	*log,
  98	int		nbblks)
  99{
 100	int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
 101
 102	/*
 103	 * Pass log block 0 since we don't have an addr yet, buffer will be
 104	 * verified on read.
 105	 */
 106	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
 107		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 108			nbblks);
 109		return NULL;
 110	}
 111
 112	/*
 113	 * We do log I/O in units of log sectors (a power-of-2 multiple of the
 114	 * basic block size), so we round up the requested size to accommodate
 115	 * the basic blocks required for complete log sectors.
 116	 *
 117	 * In addition, the buffer may be used for a non-sector-aligned block
 118	 * offset, in which case an I/O of the requested size could extend
 119	 * beyond the end of the buffer.  If the requested size is only 1 basic
 120	 * block it will never straddle a sector boundary, so this won't be an
 121	 * issue.  Nor will this be a problem if the log I/O is done in basic
 122	 * blocks (sector size 1).  But otherwise we extend the buffer by one
 123	 * extra log sector to ensure there's space to accommodate this
 124	 * possibility.
 125	 */
 126	if (nbblks > 1 && log->l_sectBBsize > 1)
 127		nbblks += log->l_sectBBsize;
 128	nbblks = round_up(nbblks, log->l_sectBBsize);
 129	return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
 130}
 131
 132/*
 133 * Return the address of the start of the given block number's data
 134 * in a log buffer.  The buffer covers a log sector-aligned region.
 135 */
 136static inline unsigned int
 137xlog_align(
 138	struct xlog	*log,
 139	xfs_daddr_t	blk_no)
 140{
 141	return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
 142}
 143
 144static int
 145xlog_do_io(
 146	struct xlog		*log,
 147	xfs_daddr_t		blk_no,
 148	unsigned int		nbblks,
 149	char			*data,
 150	unsigned int		op)
 151{
 152	int			error;
 153
 154	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
 155		xfs_warn(log->l_mp,
 156			 "Invalid log block/length (0x%llx, 0x%x) for buffer",
 157			 blk_no, nbblks);
 158		return -EFSCORRUPTED;
 159	}
 160
 161	blk_no = round_down(blk_no, log->l_sectBBsize);
 162	nbblks = round_up(nbblks, log->l_sectBBsize);
 163	ASSERT(nbblks > 0);
 164
 165	error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
 166			BBTOB(nbblks), data, op);
 167	if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
 168		xfs_alert(log->l_mp,
 169			  "log recovery %s I/O error at daddr 0x%llx len %d error %d",
 170			  op == REQ_OP_WRITE ? "write" : "read",
 171			  blk_no, nbblks, error);
 172	}
 173	return error;
 174}
 175
/*
 * Read nbblks basic blocks starting at log block blk_no into data.
 *
 * Unlike xlog_bread() no pointer to the requested block within the buffer is
 * returned; the caller gets the raw buffer as filled by the I/O.
 * Returns 0 or the error from xlog_do_io().
 */
STATIC int
xlog_bread_noalign(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data)
{
	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
}
 185
 186STATIC int
 187xlog_bread(
 188	struct xlog	*log,
 189	xfs_daddr_t	blk_no,
 190	int		nbblks,
 191	char		*data,
 192	char		**offset)
 193{
 194	int		error;
 195
 196	error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
 197	if (!error)
 198		*offset = data + xlog_align(log, blk_no);
 199	return error;
 200}
 201
/*
 * Write nbblks basic blocks from data to the log starting at log block
 * blk_no.  Returns 0 or the error from xlog_do_io().
 */
STATIC int
xlog_bwrite(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data)
{
	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
}
 211
#ifdef DEBUG
/*
 * Debug aid: print the superblock UUID and the log format this code expects
 * (XLOG_FMT) alongside the UUID and format stored in the given log record
 * header.  Only built when DEBUG is defined; otherwise calls expand to
 * nothing.
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
#define xlog_header_check_dump(mp, head)
#endif
 229
 230/*
 231 * check log record header for recovery
 232 */
 233STATIC int
 234xlog_header_check_recover(
 235	xfs_mount_t		*mp,
 236	xlog_rec_header_t	*head)
 237{
 238	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 239
 240	/*
 241	 * IRIX doesn't write the h_fmt field and leaves it zeroed
 242	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 243	 * a dirty log created in IRIX.
 244	 */
 245	if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 246		xfs_warn(mp,
 247	"dirty log written in incompatible format - can't recover");
 248		xlog_header_check_dump(mp, head);
 249		return -EFSCORRUPTED;
 250	}
 251	if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
 252					   &head->h_fs_uuid))) {
 253		xfs_warn(mp,
 254	"dirty log entry has mismatched uuid - can't recover");
 255		xlog_header_check_dump(mp, head);
 256		return -EFSCORRUPTED;
 257	}
 258	return 0;
 259}
 260
 261/*
 262 * read the head block of the log and check the header
 263 */
 264STATIC int
 265xlog_header_check_mount(
 266	xfs_mount_t		*mp,
 267	xlog_rec_header_t	*head)
 268{
 269	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 270
 271	if (uuid_is_null(&head->h_fs_uuid)) {
 272		/*
 273		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 274		 * h_fs_uuid is null, we assume this log was last mounted
 275		 * by IRIX and continue.
 276		 */
 277		xfs_warn(mp, "null uuid in log - IRIX style log");
 278	} else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
 279						  &head->h_fs_uuid))) {
 280		xfs_warn(mp, "log has mismatched uuid - can't recover");
 281		xlog_header_check_dump(mp, head);
 282		return -EFSCORRUPTED;
 283	}
 284	return 0;
 285}
 286
 287STATIC void
 288xlog_recover_iodone(
 289	struct xfs_buf	*bp)
 290{
 291	if (bp->b_error) {
 292		/*
 293		 * We're not going to bother about retrying
 294		 * this during recovery. One strike!
 295		 */
 296		if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
 297			xfs_buf_ioerror_alert(bp, __func__);
 298			xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
 299		}
 300	}
 301
 302	/*
 303	 * On v5 supers, a bli could be attached to update the metadata LSN.
 304	 * Clean it up.
 305	 */
 306	if (bp->b_log_item)
 307		xfs_buf_item_relse(bp);
 308	ASSERT(bp->b_log_item == NULL);
 309
 310	bp->b_iodone = NULL;
 311	xfs_buf_ioend(bp);
 312}
 313
 314/*
 315 * This routine finds (to an approximation) the first block in the physical
 316 * log which contains the given cycle.  It uses a binary search algorithm.
 317 * Note that the algorithm can not be perfect because the disk will not
 318 * necessarily be perfect.
 319 */
 320STATIC int
 321xlog_find_cycle_start(
 322	struct xlog	*log,
 323	char		*buffer,
 324	xfs_daddr_t	first_blk,
 325	xfs_daddr_t	*last_blk,
 326	uint		cycle)
 327{
 328	char		*offset;
 329	xfs_daddr_t	mid_blk;
 330	xfs_daddr_t	end_blk;
 331	uint		mid_cycle;
 332	int		error;
 333
 334	end_blk = *last_blk;
 335	mid_blk = BLK_AVG(first_blk, end_blk);
 336	while (mid_blk != first_blk && mid_blk != end_blk) {
 337		error = xlog_bread(log, mid_blk, 1, buffer, &offset);
 338		if (error)
 339			return error;
 340		mid_cycle = xlog_get_cycle(offset);
 341		if (mid_cycle == cycle)
 342			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 343		else
 344			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 345		mid_blk = BLK_AVG(first_blk, end_blk);
 346	}
 347	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 348	       (mid_blk == end_blk && mid_blk-1 == first_blk));
 349
 350	*last_blk = end_blk;
 351
 352	return 0;
 353}
 354
 355/*
 356 * Check that a range of blocks does not contain stop_on_cycle_no.
 357 * Fill in *new_blk with the block offset where such a block is
 358 * found, or with -1 (an invalid block number) if there is no such
 359 * block in the range.  The scan needs to occur from front to back
 360 * and the pointer into the region must be updated since a later
 361 * routine will need to perform another test.
 362 */
 363STATIC int
 364xlog_find_verify_cycle(
 365	struct xlog	*log,
 366	xfs_daddr_t	start_blk,
 367	int		nbblks,
 368	uint		stop_on_cycle_no,
 369	xfs_daddr_t	*new_blk)
 370{
 371	xfs_daddr_t	i, j;
 372	uint		cycle;
 373	char		*buffer;
 374	xfs_daddr_t	bufblks;
 375	char		*buf = NULL;
 376	int		error = 0;
 377
 378	/*
 379	 * Greedily allocate a buffer big enough to handle the full
 380	 * range of basic blocks we'll be examining.  If that fails,
 381	 * try a smaller size.  We need to be able to read at least
 382	 * a log sector, or we're out of luck.
 383	 */
 384	bufblks = 1 << ffs(nbblks);
 385	while (bufblks > log->l_logBBsize)
 386		bufblks >>= 1;
 387	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
 388		bufblks >>= 1;
 389		if (bufblks < log->l_sectBBsize)
 390			return -ENOMEM;
 391	}
 392
 393	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 394		int	bcount;
 395
 396		bcount = min(bufblks, (start_blk + nbblks - i));
 397
 398		error = xlog_bread(log, i, bcount, buffer, &buf);
 399		if (error)
 400			goto out;
 401
 402		for (j = 0; j < bcount; j++) {
 403			cycle = xlog_get_cycle(buf);
 404			if (cycle == stop_on_cycle_no) {
 405				*new_blk = i+j;
 406				goto out;
 407			}
 408
 409			buf += BBSIZE;
 410		}
 411	}
 412
 413	*new_blk = -1;
 414
 415out:
 416	kmem_free(buffer);
 417	return error;
 418}
 419
 420/*
 421 * Potentially backup over partial log record write.
 422 *
 423 * In the typical case, last_blk is the number of the block directly after
 424 * a good log record.  Therefore, we subtract one to get the block number
 425 * of the last block in the given buffer.  extra_bblks contains the number
 426 * of blocks we would have read on a previous read.  This happens when the
 427 * last log record is split over the end of the physical log.
 428 *
 429 * extra_bblks is the number of blocks potentially verified on a previous
 430 * call to this routine.
 431 */
 432STATIC int
 433xlog_find_verify_log_record(
 434	struct xlog		*log,
 435	xfs_daddr_t		start_blk,
 436	xfs_daddr_t		*last_blk,
 437	int			extra_bblks)
 438{
 439	xfs_daddr_t		i;
 440	char			*buffer;
 441	char			*offset = NULL;
 442	xlog_rec_header_t	*head = NULL;
 443	int			error = 0;
 444	int			smallmem = 0;
 445	int			num_blks = *last_blk - start_blk;
 446	int			xhdrs;
 447
 448	ASSERT(start_blk != 0 || *last_blk != start_blk);
 449
 450	buffer = xlog_alloc_buffer(log, num_blks);
 451	if (!buffer) {
 452		buffer = xlog_alloc_buffer(log, 1);
 453		if (!buffer)
 454			return -ENOMEM;
 455		smallmem = 1;
 456	} else {
 457		error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
 458		if (error)
 459			goto out;
 460		offset += ((num_blks - 1) << BBSHIFT);
 461	}
 462
 463	for (i = (*last_blk) - 1; i >= 0; i--) {
 464		if (i < start_blk) {
 465			/* valid log record not found */
 466			xfs_warn(log->l_mp,
 467		"Log inconsistent (didn't find previous header)");
 468			ASSERT(0);
 469			error = -EFSCORRUPTED;
 470			goto out;
 471		}
 472
 473		if (smallmem) {
 474			error = xlog_bread(log, i, 1, buffer, &offset);
 475			if (error)
 476				goto out;
 477		}
 478
 479		head = (xlog_rec_header_t *)offset;
 480
 481		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 482			break;
 483
 484		if (!smallmem)
 485			offset -= BBSIZE;
 486	}
 487
 488	/*
 489	 * We hit the beginning of the physical log & still no header.  Return
 490	 * to caller.  If caller can handle a return of -1, then this routine
 491	 * will be called again for the end of the physical log.
 492	 */
 493	if (i == -1) {
 494		error = 1;
 495		goto out;
 496	}
 497
 498	/*
 499	 * We have the final block of the good log (the first block
 500	 * of the log record _before_ the head. So we check the uuid.
 501	 */
 502	if ((error = xlog_header_check_mount(log->l_mp, head)))
 503		goto out;
 504
 505	/*
 506	 * We may have found a log record header before we expected one.
 507	 * last_blk will be the 1st block # with a given cycle #.  We may end
 508	 * up reading an entire log record.  In this case, we don't want to
 509	 * reset last_blk.  Only when last_blk points in the middle of a log
 510	 * record do we update last_blk.
 511	 */
 512	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 513		uint	h_size = be32_to_cpu(head->h_size);
 514
 515		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 516		if (h_size % XLOG_HEADER_CYCLE_SIZE)
 517			xhdrs++;
 518	} else {
 519		xhdrs = 1;
 520	}
 521
 522	if (*last_blk - i + extra_bblks !=
 523	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 524		*last_blk = i;
 525
 526out:
 527	kmem_free(buffer);
 528	return error;
 529}
 530
/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LR have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * The search runs in three stages: a check for a (partially) zeroed log,
 * a cycle-number search plus verification scan to locate the candidate
 * head, and finally a check that the candidate does not sit in the middle
 * of a log record.
 *
 * On success the head block is stored in *return_head_blk.
 *
 * Return: zero if normal, non-zero if error.
 */
STATIC int
xlog_find_head(
	struct xlog	*log,
	xfs_daddr_t	*return_head_blk)
{
	char		*buffer;
	char		*offset;
	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
	int		num_scan_bblks;
	uint		first_half_cycle, last_half_cycle;
	uint		stop_on_cycle;
	int		error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	error = xlog_find_zeroed(log, &first_blk);
	if (error < 0) {
		xfs_warn(log->l_mp, "empty log check failed");
		return error;
	}
	/* a return of 1 means first_blk already holds the head */
	if (error == 1) {
		*return_head_blk = first_blk;

		/* Is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xfs_warn(log->l_mp, "totally zeroed log");
		}

		return 0;
	}

	first_blk = 0;			/* get cycle # of 1st block */
	buffer = xlog_alloc_buffer(log, 1);
	if (!buffer)
		return -ENOMEM;

	error = xlog_bread(log, 0, 1, buffer, &offset);
	if (error)
		goto out_free_buffer;

	first_half_cycle = xlog_get_cycle(offset);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	error = xlog_bread(log, last_blk, 1, buffer, &offset);
	if (error)
		goto out_free_buffer;

	last_half_cycle = xlog_get_cycle(offset);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have
		 * cycle number last_half_cycle.  We need to scan backwards
		 * from the end verifying that there are no holes still
		 * containing last_half_cycle - 1.  If we find such a hole,
		 * then the start of that hole will be the new head.  The
		 * simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was
		 * incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and
		 * the head has to end up at the start of the x-1 hole at the
		 * end of the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.
		 * We don't worry about the x+1 blocks that we encounter,
		 * because we know that they cannot be the head since the log
		 * started with x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle
		 * number matching last_half_cycle.  We expect the log to be
		 * some variation on
		 *        x + 1 ... | x ... | x
		 * The first block with cycle number x (last_half_cycle) will
		 * be where the new head belongs.  First we do a binary search
		 * for the first occurrence of last_half_cycle.  The binary
		 * search may not be totally accurate, so then we scan back
		 * from there looking for occurrences of last_half_cycle before
		 * us.  If that backwards scan wraps around the beginning of
		 * the log, then we look for occurrences of last_half_cycle - 1
		 * at the end of the log.  The cases we're looking for look
		 * like
		 *                               v binary search stopped here
		 *        x + 1 ... | x | x + 1 | x ... | x
		 *                   ^ but we want to locate this spot
		 * or
		 *        <---------> less than scan distance
		 *        x + 1 ... | x ... | x - 1 | x
		 *                           ^ we want to locate this spot
		 */
		stop_on_cycle = last_half_cycle;
		error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
				last_half_cycle);
		if (error)
			goto out_free_buffer;
	}

	/*
	 * Now validate the answer.  Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number.  The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log.  The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
	if (head_blk >= num_scan_bblks) {
		/*
		 * We are guaranteed that the entire check can be performed
		 * in one buffer.
		 */
		start_blk = head_blk - num_scan_bblks;
		if ((error = xlog_find_verify_cycle(log,
						start_blk, num_scan_bblks,
						stop_on_cycle, &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1)
			head_blk = new_blk;
	} else {		/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.
		 * First we scan the physical end of the log.  In this part
		 * of the log, we are looking for blocks with cycle number
		 * last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as
		 * we've found a hole that didn't get written in going around
		 * the end of the physical log.  The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of
		 * the log looking for occurrences of last_half_cycle.  If we
		 * find one, then our current estimate for the location of the
		 * first occurrence of last_half_cycle is wrong and we move
		 * back to the hole we've found.  This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                               ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k
		 * logs is
		 *        x + 1 ... | x ... | x+1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the
		 * x + 1 blocks.  We need to skip past those since that is
		 * certainly not the head of the log.  By searching for
		 * last_half_cycle-1 we accomplish that.
		 */
		ASSERT(head_blk <= INT_MAX &&
			(xfs_daddr_t) num_scan_bblks >= head_blk);
		start_blk = log_bbnum - (num_scan_bblks - head_blk);
		if ((error = xlog_find_verify_cycle(log, start_blk,
					num_scan_bblks - (int)head_blk,
					(stop_on_cycle - 1), &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1) {
			head_blk = new_blk;
			goto validate_head;
		}

		/*
		 * Scan beginning of log now.  The last part of the physical
		 * log is good.  This scan needs to verify that it doesn't find
		 * the last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_cycle(log,
					start_blk, (int)head_blk,
					stop_on_cycle, &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1)
			head_blk = new_blk;
	}

validate_head:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = XLOG_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
		/* 1 means no header was found; nowhere else to look here */
		if (error == 1)
			error = -EIO;
		if (error)
			goto out_free_buffer;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
		if (error < 0)
			goto out_free_buffer;
		if (error == 1) {
			/* We hit the beginning of the log during our search */
			start_blk = log_bbnum - (num_scan_bblks - head_blk);
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX &&
				(xfs_daddr_t) log_bbnum-start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			/* continue the search at the physical end of the log */
			error = xlog_find_verify_log_record(log, start_blk,
							&new_blk, (int)head_blk);
			if (error == 1)
				error = -EIO;
			if (error)
				goto out_free_buffer;
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto out_free_buffer;
	}

	kmem_free(buffer);
	/* log_bbnum stands in for block 0 of the circular log (see above) */
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number.  Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1.  In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

out_free_buffer:
	kmem_free(buffer);
	if (error)
		xfs_warn(log->l_mp, "failed to find log head");
	return error;
}
 800
 801/*
 802 * Seek backwards in the log for log record headers.
 803 *
 804 * Given a starting log block, walk backwards until we find the provided number
 805 * of records or hit the provided tail block. The return value is the number of
 806 * records encountered or a negative error code. The log block and buffer
 807 * pointer of the last record seen are returned in rblk and rhead respectively.
 808 */
 809STATIC int
 810xlog_rseek_logrec_hdr(
 811	struct xlog		*log,
 812	xfs_daddr_t		head_blk,
 813	xfs_daddr_t		tail_blk,
 814	int			count,
 815	char			*buffer,
 816	xfs_daddr_t		*rblk,
 817	struct xlog_rec_header	**rhead,
 818	bool			*wrapped)
 819{
 820	int			i;
 821	int			error;
 822	int			found = 0;
 823	char			*offset = NULL;
 824	xfs_daddr_t		end_blk;
 825
 826	*wrapped = false;
 827
 828	/*
 829	 * Walk backwards from the head block until we hit the tail or the first
 830	 * block in the log.
 831	 */
 832	end_blk = head_blk > tail_blk ? tail_blk : 0;
 833	for (i = (int) head_blk - 1; i >= end_blk; i--) {
 834		error = xlog_bread(log, i, 1, buffer, &offset);
 835		if (error)
 836			goto out_error;
 837
 838		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 839			*rblk = i;
 840			*rhead = (struct xlog_rec_header *) offset;
 841			if (++found == count)
 842				break;
 843		}
 844	}
 845
 846	/*
 847	 * If we haven't hit the tail block or the log record header count,
 848	 * start looking again from the end of the physical log. Note that
 849	 * callers can pass head == tail if the tail is not yet known.
 850	 */
 851	if (tail_blk >= head_blk && found != count) {
 852		for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
 853			error = xlog_bread(log, i, 1, buffer, &offset);
 854			if (error)
 855				goto out_error;
 856
 857			if (*(__be32 *)offset ==
 858			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 859				*wrapped = true;
 860				*rblk = i;
 861				*rhead = (struct xlog_rec_header *) offset;
 862				if (++found == count)
 863					break;
 864			}
 865		}
 866	}
 867
 868	return found;
 869
 870out_error:
 871	return error;
 872}
 873
 874/*
 875 * Seek forward in the log for log record headers.
 876 *
 877 * Given head and tail blocks, walk forward from the tail block until we find
 878 * the provided number of records or hit the head block. The return value is the
 879 * number of records encountered or a negative error code. The log block and
 880 * buffer pointer of the last record seen are returned in rblk and rhead
 881 * respectively.
 882 */
 883STATIC int
 884xlog_seek_logrec_hdr(
 885	struct xlog		*log,
 886	xfs_daddr_t		head_blk,
 887	xfs_daddr_t		tail_blk,
 888	int			count,
 889	char			*buffer,
 890	xfs_daddr_t		*rblk,
 891	struct xlog_rec_header	**rhead,
 892	bool			*wrapped)
 893{
 894	int			i;
 895	int			error;
 896	int			found = 0;
 897	char			*offset = NULL;
 898	xfs_daddr_t		end_blk;
 899
 900	*wrapped = false;
 901
 902	/*
 903	 * Walk forward from the tail block until we hit the head or the last
 904	 * block in the log.
 905	 */
 906	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
 907	for (i = (int) tail_blk; i <= end_blk; i++) {
 908		error = xlog_bread(log, i, 1, buffer, &offset);
 909		if (error)
 910			goto out_error;
 911
 912		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 913			*rblk = i;
 914			*rhead = (struct xlog_rec_header *) offset;
 915			if (++found == count)
 916				break;
 917		}
 918	}
 919
 920	/*
 921	 * If we haven't hit the head block or the log record header count,
 922	 * start looking again from the start of the physical log.
 923	 */
 924	if (tail_blk > head_blk && found != count) {
 925		for (i = 0; i < (int) head_blk; i++) {
 926			error = xlog_bread(log, i, 1, buffer, &offset);
 927			if (error)
 928				goto out_error;
 929
 930			if (*(__be32 *)offset ==
 931			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 932				*wrapped = true;
 933				*rblk = i;
 934				*rhead = (struct xlog_rec_header *) offset;
 935				if (++found == count)
 936					break;
 937			}
 938		}
 939	}
 940
 941	return found;
 942
 943out_error:
 944	return error;
 945}
 946
 947/*
 948 * Calculate distance from head to tail (i.e., unused space in the log).
 949 */
 950static inline int
 951xlog_tail_distance(
 952	struct xlog	*log,
 953	xfs_daddr_t	head_blk,
 954	xfs_daddr_t	tail_blk)
 955{
 956	if (head_blk < tail_blk)
 957		return tail_blk - head_blk;
 958
 959	return tail_blk + (log->l_logBBsize - head_blk);
 960}
 961
 962/*
 963 * Verify the log tail. This is particularly important when torn or incomplete
 964 * writes have been detected near the front of the log and the head has been
 965 * walked back accordingly.
 966 *
 967 * We also have to handle the case where the tail was pinned and the head
 968 * blocked behind the tail right before a crash. If the tail had been pushed
 969 * immediately prior to the crash and the subsequent checkpoint was only
 970 * partially written, it's possible it overwrote the last referenced tail in the
 971 * log with garbage. This is not a coherency problem because the tail must have
 972 * been pushed before it can be overwritten, but appears as log corruption to
 973 * recovery because we have no way to know the tail was updated if the
 974 * subsequent checkpoint didn't write successfully.
 975 *
 976 * Therefore, CRC check the log from tail to head. If a failure occurs and the
 977 * offending record is within max iclog bufs from the head, walk the tail
 978 * forward and retry until a valid tail is found or corruption is detected out
 979 * of the range of a possible overwrite.
 980 */
 981STATIC int
 982xlog_verify_tail(
 983	struct xlog		*log,
 984	xfs_daddr_t		head_blk,
 985	xfs_daddr_t		*tail_blk,
 986	int			hsize)
 987{
 988	struct xlog_rec_header	*thead;
 989	char			*buffer;
 990	xfs_daddr_t		first_bad;
 991	int			error = 0;
 992	bool			wrapped;
 993	xfs_daddr_t		tmp_tail;
 994	xfs_daddr_t		orig_tail = *tail_blk;
 995
 996	buffer = xlog_alloc_buffer(log, 1);
 997	if (!buffer)
 998		return -ENOMEM;
 999
1000	/*
1001	 * Make sure the tail points to a record (returns positive count on
1002	 * success).
1003	 */
1004	error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
1005			&tmp_tail, &thead, &wrapped);
1006	if (error < 0)
1007		goto out;
1008	if (*tail_blk != tmp_tail)
1009		*tail_blk = tmp_tail;
1010
1011	/*
1012	 * Run a CRC check from the tail to the head. We can't just check
1013	 * MAX_ICLOGS records past the tail because the tail may point to stale
1014	 * blocks cleared during the search for the head/tail. These blocks are
1015	 * overwritten with zero-length records and thus record count is not a
1016	 * reliable indicator of the iclog state before a crash.
1017	 */
1018	first_bad = 0;
1019	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1020				      XLOG_RECOVER_CRCPASS, &first_bad);
1021	while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1022		int	tail_distance;
1023
1024		/*
1025		 * Is corruption within range of the head? If so, retry from
1026		 * the next record. Otherwise return an error.
1027		 */
1028		tail_distance = xlog_tail_distance(log, head_blk, first_bad);
1029		if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
1030			break;
1031
1032		/* skip to the next record; returns positive count on success */
1033		error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
1034				buffer, &tmp_tail, &thead, &wrapped);
1035		if (error < 0)
1036			goto out;
1037
1038		*tail_blk = tmp_tail;
1039		first_bad = 0;
1040		error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1041					      XLOG_RECOVER_CRCPASS, &first_bad);
1042	}
1043
1044	if (!error && *tail_blk != orig_tail)
1045		xfs_warn(log->l_mp,
1046		"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1047			 orig_tail, *tail_blk);
1048out:
1049	kmem_free(buffer);
1050	return error;
1051}
1052
1053/*
1054 * Detect and trim torn writes from the head of the log.
1055 *
1056 * Storage without sector atomicity guarantees can result in torn writes in the
1057 * log in the event of a crash. Our only means to detect this scenario is via
1058 * CRC verification. While we can't always be certain that CRC verification
1059 * failure is due to a torn write vs. an unrelated corruption, we do know that
1060 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1061 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1062 * the log and treat failures in this range as torn writes as a matter of
1063 * policy. In the event of CRC failure, the head is walked back to the last good
1064 * record in the log and the tail is updated from that record and verified.
1065 */
1066STATIC int
1067xlog_verify_head(
1068	struct xlog		*log,
1069	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
1070	xfs_daddr_t		*tail_blk,	/* out: tail block */
1071	char			*buffer,
1072	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
1073	struct xlog_rec_header	**rhead,	/* ptr to last record */
1074	bool			*wrapped)	/* last rec. wraps phys. log */
1075{
1076	struct xlog_rec_header	*tmp_rhead;
1077	char			*tmp_buffer;
1078	xfs_daddr_t		first_bad;
1079	xfs_daddr_t		tmp_rhead_blk;
1080	int			found;
1081	int			error;
1082	bool			tmp_wrapped;
1083
1084	/*
1085	 * Check the head of the log for torn writes. Search backwards from the
1086	 * head until we hit the tail or the maximum number of log record I/Os
1087	 * that could have been in flight at one time. Use a temporary buffer so
1088	 * we don't trash the rhead/buffer pointers from the caller.
1089	 */
1090	tmp_buffer = xlog_alloc_buffer(log, 1);
1091	if (!tmp_buffer)
1092		return -ENOMEM;
1093	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1094				      XLOG_MAX_ICLOGS, tmp_buffer,
1095				      &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1096	kmem_free(tmp_buffer);
1097	if (error < 0)
1098		return error;
1099
1100	/*
1101	 * Now run a CRC verification pass over the records starting at the
1102	 * block found above to the current head. If a CRC failure occurs, the
1103	 * log block of the first bad record is saved in first_bad.
1104	 */
1105	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1106				      XLOG_RECOVER_CRCPASS, &first_bad);
1107	if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1108		/*
1109		 * We've hit a potential torn write. Reset the error and warn
1110		 * about it.
1111		 */
1112		error = 0;
1113		xfs_warn(log->l_mp,
1114"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1115			 first_bad, *head_blk);
1116
1117		/*
1118		 * Get the header block and buffer pointer for the last good
1119		 * record before the bad record.
1120		 *
1121		 * Note that xlog_find_tail() clears the blocks at the new head
1122		 * (i.e., the records with invalid CRC) if the cycle number
1123		 * matches the the current cycle.
1124		 */
1125		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1126				buffer, rhead_blk, rhead, wrapped);
1127		if (found < 0)
1128			return found;
1129		if (found == 0)		/* XXX: right thing to do here? */
1130			return -EIO;
1131
1132		/*
1133		 * Reset the head block to the starting block of the first bad
1134		 * log record and set the tail block based on the last good
1135		 * record.
1136		 *
1137		 * Bail out if the updated head/tail match as this indicates
1138		 * possible corruption outside of the acceptable
1139		 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1140		 */
1141		*head_blk = first_bad;
1142		*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1143		if (*head_blk == *tail_blk) {
1144			ASSERT(0);
1145			return 0;
1146		}
1147	}
1148	if (error)
1149		return error;
1150
1151	return xlog_verify_tail(log, *head_blk, tail_blk,
1152				be32_to_cpu((*rhead)->h_size));
1153}
1154
1155/*
1156 * We need to make sure we handle log wrapping properly, so we can't use the
1157 * calculated logbno directly. Make sure it wraps to the correct bno inside the
1158 * log.
1159 *
1160 * The log is limited to 32 bit sizes, so we use the appropriate modulus
1161 * operation here and cast it back to a 64 bit daddr on return.
1162 */
1163static inline xfs_daddr_t
1164xlog_wrap_logbno(
1165	struct xlog		*log,
1166	xfs_daddr_t		bno)
1167{
1168	int			mod;
1169
1170	div_s64_rem(bno, log->l_logBBsize, &mod);
1171	return mod;
1172}
1173
1174/*
1175 * Check whether the head of the log points to an unmount record. In other
1176 * words, determine whether the log is clean. If so, update the in-core state
1177 * appropriately.
1178 */
1179static int
1180xlog_check_unmount_rec(
1181	struct xlog		*log,
1182	xfs_daddr_t		*head_blk,
1183	xfs_daddr_t		*tail_blk,
1184	struct xlog_rec_header	*rhead,
1185	xfs_daddr_t		rhead_blk,
1186	char			*buffer,
1187	bool			*clean)
1188{
1189	struct xlog_op_header	*op_head;
1190	xfs_daddr_t		umount_data_blk;
1191	xfs_daddr_t		after_umount_blk;
1192	int			hblks;
1193	int			error;
1194	char			*offset;
1195
1196	*clean = false;
1197
1198	/*
1199	 * Look for unmount record. If we find it, then we know there was a
1200	 * clean unmount. Since 'i' could be the last block in the physical
1201	 * log, we convert to a log block before comparing to the head_blk.
1202	 *
1203	 * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1204	 * below. We won't want to clear the unmount record if there is one, so
1205	 * we pass the lsn of the unmount record rather than the block after it.
1206	 */
1207	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1208		int	h_size = be32_to_cpu(rhead->h_size);
1209		int	h_version = be32_to_cpu(rhead->h_version);
1210
1211		if ((h_version & XLOG_VERSION_2) &&
1212		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1213			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1214			if (h_size % XLOG_HEADER_CYCLE_SIZE)
1215				hblks++;
1216		} else {
1217			hblks = 1;
1218		}
1219	} else {
1220		hblks = 1;
1221	}
1222
1223	after_umount_blk = xlog_wrap_logbno(log,
1224			rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1225
1226	if (*head_blk == after_umount_blk &&
1227	    be32_to_cpu(rhead->h_num_logops) == 1) {
1228		umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1229		error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
1230		if (error)
1231			return error;
1232
1233		op_head = (struct xlog_op_header *)offset;
1234		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1235			/*
1236			 * Set tail and last sync so that newly written log
1237			 * records will point recovery to after the current
1238			 * unmount record.
1239			 */
1240			xlog_assign_atomic_lsn(&log->l_tail_lsn,
1241					log->l_curr_cycle, after_umount_blk);
1242			xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1243					log->l_curr_cycle, after_umount_blk);
1244			*tail_blk = after_umount_blk;
1245
1246			*clean = true;
1247		}
1248	}
1249
1250	return 0;
1251}
1252
1253static void
1254xlog_set_state(
1255	struct xlog		*log,
1256	xfs_daddr_t		head_blk,
1257	struct xlog_rec_header	*rhead,
1258	xfs_daddr_t		rhead_blk,
1259	bool			bump_cycle)
1260{
1261	/*
1262	 * Reset log values according to the state of the log when we
1263	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
1264	 * one because the next write starts a new cycle rather than
1265	 * continuing the cycle of the last good log record.  At this
1266	 * point we have guaranteed that all partial log records have been
1267	 * accounted for.  Therefore, we know that the last good log record
1268	 * written was complete and ended exactly on the end boundary
1269	 * of the physical log.
1270	 */
1271	log->l_prev_block = rhead_blk;
1272	log->l_curr_block = (int)head_blk;
1273	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1274	if (bump_cycle)
1275		log->l_curr_cycle++;
1276	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1277	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1278	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1279					BBTOB(log->l_curr_block));
1280	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1281					BBTOB(log->l_curr_block));
1282}
1283
1284/*
1285 * Find the sync block number or the tail of the log.
1286 *
1287 * This will be the block number of the last record to have its
1288 * associated buffers synced to disk.  Every log record header has
1289 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
1290 * to get a sync block number.  The only concern is to figure out which
1291 * log record header to believe.
1292 *
1293 * The following algorithm uses the log record header with the largest
1294 * lsn.  The entire log record does not need to be valid.  We only care
1295 * that the header is valid.
1296 *
1297 * We could speed up search by using current head_blk buffer, but it is not
1298 * available.
1299 */
1300STATIC int
1301xlog_find_tail(
1302	struct xlog		*log,
1303	xfs_daddr_t		*head_blk,
1304	xfs_daddr_t		*tail_blk)
1305{
1306	xlog_rec_header_t	*rhead;
1307	char			*offset = NULL;
1308	char			*buffer;
1309	int			error;
1310	xfs_daddr_t		rhead_blk;
1311	xfs_lsn_t		tail_lsn;
1312	bool			wrapped = false;
1313	bool			clean = false;
1314
1315	/*
1316	 * Find previous log record
1317	 */
1318	if ((error = xlog_find_head(log, head_blk)))
1319		return error;
1320	ASSERT(*head_blk < INT_MAX);
1321
1322	buffer = xlog_alloc_buffer(log, 1);
1323	if (!buffer)
1324		return -ENOMEM;
1325	if (*head_blk == 0) {				/* special case */
1326		error = xlog_bread(log, 0, 1, buffer, &offset);
1327		if (error)
1328			goto done;
1329
1330		if (xlog_get_cycle(offset) == 0) {
1331			*tail_blk = 0;
1332			/* leave all other log inited values alone */
1333			goto done;
1334		}
1335	}
1336
1337	/*
1338	 * Search backwards through the log looking for the log record header
1339	 * block. This wraps all the way back around to the head so something is
1340	 * seriously wrong if we can't find it.
1341	 */
1342	error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
1343				      &rhead_blk, &rhead, &wrapped);
1344	if (error < 0)
1345		goto done;
1346	if (!error) {
1347		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1348		error = -EFSCORRUPTED;
1349		goto done;
1350	}
1351	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1352
1353	/*
1354	 * Set the log state based on the current head record.
1355	 */
1356	xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1357	tail_lsn = atomic64_read(&log->l_tail_lsn);
1358
1359	/*
1360	 * Look for an unmount record at the head of the log. This sets the log
1361	 * state to determine whether recovery is necessary.
1362	 */
1363	error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1364				       rhead_blk, buffer, &clean);
1365	if (error)
1366		goto done;
1367
1368	/*
1369	 * Verify the log head if the log is not clean (e.g., we have anything
1370	 * but an unmount record at the head). This uses CRC verification to
1371	 * detect and trim torn writes. If discovered, CRC failures are
1372	 * considered torn writes and the log head is trimmed accordingly.
1373	 *
1374	 * Note that we can only run CRC verification when the log is dirty
1375	 * because there's no guarantee that the log data behind an unmount
1376	 * record is compatible with the current architecture.
1377	 */
1378	if (!clean) {
1379		xfs_daddr_t	orig_head = *head_blk;
1380
1381		error = xlog_verify_head(log, head_blk, tail_blk, buffer,
1382					 &rhead_blk, &rhead, &wrapped);
1383		if (error)
1384			goto done;
1385
1386		/* update in-core state again if the head changed */
1387		if (*head_blk != orig_head) {
1388			xlog_set_state(log, *head_blk, rhead, rhead_blk,
1389				       wrapped);
1390			tail_lsn = atomic64_read(&log->l_tail_lsn);
1391			error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1392						       rhead, rhead_blk, buffer,
1393						       &clean);
1394			if (error)
1395				goto done;
1396		}
1397	}
1398
1399	/*
1400	 * Note that the unmount was clean. If the unmount was not clean, we
1401	 * need to know this to rebuild the superblock counters from the perag
1402	 * headers if we have a filesystem using non-persistent counters.
1403	 */
1404	if (clean)
1405		log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1406
1407	/*
1408	 * Make sure that there are no blocks in front of the head
1409	 * with the same cycle number as the head.  This can happen
1410	 * because we allow multiple outstanding log writes concurrently,
1411	 * and the later writes might make it out before earlier ones.
1412	 *
1413	 * We use the lsn from before modifying it so that we'll never
1414	 * overwrite the unmount record after a clean unmount.
1415	 *
1416	 * Do this only if we are going to recover the filesystem
1417	 *
1418	 * NOTE: This used to say "if (!readonly)"
1419	 * However on Linux, we can & do recover a read-only filesystem.
1420	 * We only skip recovery if NORECOVERY is specified on mount,
1421	 * in which case we would not be here.
1422	 *
1423	 * But... if the -device- itself is readonly, just skip this.
1424	 * We can't recover this device anyway, so it won't matter.
1425	 */
1426	if (!xfs_readonly_buftarg(log->l_targ))
1427		error = xlog_clear_stale_blocks(log, tail_lsn);
1428
1429done:
1430	kmem_free(buffer);
1431
1432	if (error)
1433		xfs_warn(log->l_mp, "failed to locate log tail");
1434	return error;
1435}
1436
1437/*
1438 * Is the log zeroed at all?
1439 *
1440 * The last binary search should be changed to perform an X block read
1441 * once X becomes small enough.  You can then search linearly through
1442 * the X blocks.  This will cut down on the number of reads we need to do.
1443 *
1444 * If the log is partially zeroed, this routine will pass back the blkno
1445 * of the first block with cycle number 0.  It won't have a complete LR
1446 * preceding it.
1447 *
1448 * Return:
1449 *	0  => the log is completely written to
1450 *	1 => use *blk_no as the first block of the log
1451 *	<0 => error has occurred
1452 */
1453STATIC int
1454xlog_find_zeroed(
1455	struct xlog	*log,
1456	xfs_daddr_t	*blk_no)
1457{
1458	char		*buffer;
1459	char		*offset;
1460	uint	        first_cycle, last_cycle;
1461	xfs_daddr_t	new_blk, last_blk, start_blk;
1462	xfs_daddr_t     num_scan_bblks;
1463	int	        error, log_bbnum = log->l_logBBsize;
1464
1465	*blk_no = 0;
1466
1467	/* check totally zeroed log */
1468	buffer = xlog_alloc_buffer(log, 1);
1469	if (!buffer)
1470		return -ENOMEM;
1471	error = xlog_bread(log, 0, 1, buffer, &offset);
1472	if (error)
1473		goto out_free_buffer;
1474
1475	first_cycle = xlog_get_cycle(offset);
1476	if (first_cycle == 0) {		/* completely zeroed log */
1477		*blk_no = 0;
1478		kmem_free(buffer);
1479		return 1;
1480	}
1481
1482	/* check partially zeroed log */
1483	error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1484	if (error)
1485		goto out_free_buffer;
1486
1487	last_cycle = xlog_get_cycle(offset);
1488	if (last_cycle != 0) {		/* log completely written to */
1489		kmem_free(buffer);
1490		return 0;
1491	}
1492
1493	/* we have a partially zeroed log */
1494	last_blk = log_bbnum-1;
1495	error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1496	if (error)
1497		goto out_free_buffer;
1498
1499	/*
1500	 * Validate the answer.  Because there is no way to guarantee that
1501	 * the entire log is made up of log records which are the same size,
1502	 * we scan over the defined maximum blocks.  At this point, the maximum
1503	 * is not chosen to mean anything special.   XXXmiken
1504	 */
1505	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1506	ASSERT(num_scan_bblks <= INT_MAX);
1507
1508	if (last_blk < num_scan_bblks)
1509		num_scan_bblks = last_blk;
1510	start_blk = last_blk - num_scan_bblks;
1511
1512	/*
1513	 * We search for any instances of cycle number 0 that occur before
1514	 * our current estimate of the head.  What we're trying to detect is
1515	 *        1 ... | 0 | 1 | 0...
1516	 *                       ^ binary search ends here
1517	 */
1518	if ((error = xlog_find_verify_cycle(log, start_blk,
1519					 (int)num_scan_bblks, 0, &new_blk)))
1520		goto out_free_buffer;
1521	if (new_blk != -1)
1522		last_blk = new_blk;
1523
1524	/*
1525	 * Potentially backup over partial log record write.  We don't need
1526	 * to search the end of the log because we know it is zero.
1527	 */
1528	error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1529	if (error == 1)
1530		error = -EIO;
1531	if (error)
1532		goto out_free_buffer;
1533
1534	*blk_no = last_blk;
1535out_free_buffer:
1536	kmem_free(buffer);
1537	if (error)
1538		return error;
1539	return 1;
1540}
1541
1542/*
1543 * These are simple subroutines used by xlog_clear_stale_blocks() below
1544 * to initialize a buffer full of empty log record headers and write
1545 * them into the log.
1546 */
1547STATIC void
1548xlog_add_record(
1549	struct xlog		*log,
1550	char			*buf,
1551	int			cycle,
1552	int			block,
1553	int			tail_cycle,
1554	int			tail_block)
1555{
1556	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
1557
1558	memset(buf, 0, BBSIZE);
1559	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1560	recp->h_cycle = cpu_to_be32(cycle);
1561	recp->h_version = cpu_to_be32(
1562			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1563	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1564	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1565	recp->h_fmt = cpu_to_be32(XLOG_FMT);
1566	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1567}
1568
1569STATIC int
1570xlog_write_log_records(
1571	struct xlog	*log,
1572	int		cycle,
1573	int		start_block,
1574	int		blocks,
1575	int		tail_cycle,
1576	int		tail_block)
1577{
1578	char		*offset;
1579	char		*buffer;
1580	int		balign, ealign;
1581	int		sectbb = log->l_sectBBsize;
1582	int		end_block = start_block + blocks;
1583	int		bufblks;
1584	int		error = 0;
1585	int		i, j = 0;
1586
1587	/*
1588	 * Greedily allocate a buffer big enough to handle the full
1589	 * range of basic blocks to be written.  If that fails, try
1590	 * a smaller size.  We need to be able to write at least a
1591	 * log sector, or we're out of luck.
1592	 */
1593	bufblks = 1 << ffs(blocks);
1594	while (bufblks > log->l_logBBsize)
1595		bufblks >>= 1;
1596	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1597		bufblks >>= 1;
1598		if (bufblks < sectbb)
1599			return -ENOMEM;
1600	}
1601
1602	/* We may need to do a read at the start to fill in part of
1603	 * the buffer in the starting sector not covered by the first
1604	 * write below.
1605	 */
1606	balign = round_down(start_block, sectbb);
1607	if (balign != start_block) {
1608		error = xlog_bread_noalign(log, start_block, 1, buffer);
1609		if (error)
1610			goto out_free_buffer;
1611
1612		j = start_block - balign;
1613	}
1614
1615	for (i = start_block; i < end_block; i += bufblks) {
1616		int		bcount, endcount;
1617
1618		bcount = min(bufblks, end_block - start_block);
1619		endcount = bcount - j;
1620
1621		/* We may need to do a read at the end to fill in part of
1622		 * the buffer in the final sector not covered by the write.
1623		 * If this is the same sector as the above read, skip it.
1624		 */
1625		ealign = round_down(end_block, sectbb);
1626		if (j == 0 && (start_block + endcount > ealign)) {
1627			error = xlog_bread_noalign(log, ealign, sectbb,
1628					buffer + BBTOB(ealign - start_block));
1629			if (error)
1630				break;
1631
1632		}
1633
1634		offset = buffer + xlog_align(log, start_block);
1635		for (; j < endcount; j++) {
1636			xlog_add_record(log, offset, cycle, i+j,
1637					tail_cycle, tail_block);
1638			offset += BBSIZE;
1639		}
1640		error = xlog_bwrite(log, start_block, endcount, buffer);
1641		if (error)
1642			break;
1643		start_block += endcount;
1644		j = 0;
1645	}
1646
1647out_free_buffer:
1648	kmem_free(buffer);
1649	return error;
1650}
1651
1652/*
1653 * This routine is called to blow away any incomplete log writes out
1654 * in front of the log head.  We do this so that we won't become confused
1655 * if we come up, write only a little bit more, and then crash again.
1656 * If we leave the partial log records out there, this situation could
1657 * cause us to think those partial writes are valid blocks since they
1658 * have the current cycle number.  We get rid of them by overwriting them
1659 * with empty log records with the old cycle number rather than the
1660 * current one.
1661 *
1662 * The tail lsn is passed in rather than taken from
1663 * the log so that we will not write over the unmount record after a
1664 * clean unmount in a 512 block log.  Doing so would leave the log without
1665 * any valid log records in it until a new one was written.  If we crashed
1666 * during that time we would not be able to recover.
1667 */
1668STATIC int
1669xlog_clear_stale_blocks(
1670	struct xlog	*log,
1671	xfs_lsn_t	tail_lsn)
1672{
1673	int		tail_cycle, head_cycle;
1674	int		tail_block, head_block;
1675	int		tail_distance, max_distance;
1676	int		distance;
1677	int		error;
1678
1679	tail_cycle = CYCLE_LSN(tail_lsn);
1680	tail_block = BLOCK_LSN(tail_lsn);
1681	head_cycle = log->l_curr_cycle;
1682	head_block = log->l_curr_block;
1683
1684	/*
1685	 * Figure out the distance between the new head of the log
1686	 * and the tail.  We want to write over any blocks beyond the
1687	 * head that we may have written just before the crash, but
1688	 * we don't want to overwrite the tail of the log.
1689	 */
1690	if (head_cycle == tail_cycle) {
1691		/*
1692		 * The tail is behind the head in the physical log,
1693		 * so the distance from the head to the tail is the
1694		 * distance from the head to the end of the log plus
1695		 * the distance from the beginning of the log to the
1696		 * tail.
1697		 */
1698		if (XFS_IS_CORRUPT(log->l_mp,
1699				   head_block < tail_block ||
1700				   head_block >= log->l_logBBsize))
1701			return -EFSCORRUPTED;
1702		tail_distance = tail_block + (log->l_logBBsize - head_block);
1703	} else {
1704		/*
1705		 * The head is behind the tail in the physical log,
1706		 * so the distance from the head to the tail is just
1707		 * the tail block minus the head block.
1708		 */
1709		if (XFS_IS_CORRUPT(log->l_mp,
1710				   head_block >= tail_block ||
1711				   head_cycle != tail_cycle + 1))
1712			return -EFSCORRUPTED;
1713		tail_distance = tail_block - head_block;
1714	}
1715
1716	/*
1717	 * If the head is right up against the tail, we can't clear
1718	 * anything.
1719	 */
1720	if (tail_distance <= 0) {
1721		ASSERT(tail_distance == 0);
1722		return 0;
1723	}
1724
1725	max_distance = XLOG_TOTAL_REC_SHIFT(log);
1726	/*
1727	 * Take the smaller of the maximum amount of outstanding I/O
1728	 * we could have and the distance to the tail to clear out.
1729	 * We take the smaller so that we don't overwrite the tail and
1730	 * we don't waste all day writing from the head to the tail
1731	 * for no reason.
1732	 */
1733	max_distance = min(max_distance, tail_distance);
1734
1735	if ((head_block + max_distance) <= log->l_logBBsize) {
1736		/*
1737		 * We can stomp all the blocks we need to without
1738		 * wrapping around the end of the log.  Just do it
1739		 * in a single write.  Use the cycle number of the
1740		 * current cycle minus one so that the log will look like:
1741		 *     n ... | n - 1 ...
1742		 */
1743		error = xlog_write_log_records(log, (head_cycle - 1),
1744				head_block, max_distance, tail_cycle,
1745				tail_block);
1746		if (error)
1747			return error;
1748	} else {
1749		/*
1750		 * We need to wrap around the end of the physical log in
1751		 * order to clear all the blocks.  Do it in two separate
1752		 * I/Os.  The first write should be from the head to the
1753		 * end of the physical log, and it should use the current
1754		 * cycle number minus one just like above.
1755		 */
1756		distance = log->l_logBBsize - head_block;
1757		error = xlog_write_log_records(log, (head_cycle - 1),
1758				head_block, distance, tail_cycle,
1759				tail_block);
1760
1761		if (error)
1762			return error;
1763
1764		/*
1765		 * Now write the blocks at the start of the physical log.
1766		 * This writes the remainder of the blocks we want to clear.
1767		 * It uses the current cycle number since we're now on the
1768		 * same cycle as the head so that we get:
1769		 *    n ... n ... | n - 1 ...
1770		 *    ^^^^^ blocks we're writing
1771		 */
1772		distance = max_distance - (log->l_logBBsize - head_block);
1773		error = xlog_write_log_records(log, head_cycle, 0, distance,
1774				tail_cycle, tail_block);
1775		if (error)
1776			return error;
1777	}
1778
1779	return 0;
1780}
1781
1782/******************************************************************************
1783 *
1784 *		Log recover routines
1785 *
1786 ******************************************************************************
1787 */
1788
1789/*
1790 * Sort the log items in the transaction.
1791 *
1792 * The ordering constraints are defined by the inode allocation and unlink
1793 * behaviour. The rules are:
1794 *
1795 *	1. Every item is only logged once in a given transaction. Hence it
1796 *	   represents the last logged state of the item. Hence ordering is
1797 *	   dependent on the order in which operations need to be performed so
1798 *	   required initial conditions are always met.
1799 *
1800 *	2. Cancelled buffers are recorded in pass 1 in a separate table and
1801 *	   there's nothing to replay from them so we can simply cull them
1802 *	   from the transaction. However, we can't do that until after we've
1803 *	   replayed all the other items because they may be dependent on the
1804 *	   cancelled buffer and replaying the cancelled buffer can remove it
1805 *	   form the cancelled buffer table. Hence they have tobe done last.
1806 *
1807 *	3. Inode allocation buffers must be replayed before inode items that
1808 *	   read the buffer and replay changes into it. For filesystems using the
1809 *	   ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1810 *	   treated the same as inode allocation buffers as they create and
1811 *	   initialise the buffers directly.
1812 *
1813 *	4. Inode unlink buffers must be replayed after inode items are replayed.
1814 *	   This ensures that inodes are completely flushed to the inode buffer
1815 *	   in a "free" state before we remove the unlinked inode list pointer.
1816 *
1817 * Hence the ordering needs to be inode allocation buffers first, inode items
1818 * second, inode unlink buffers third and cancelled buffers last.
1819 *
1820 * But there's a problem with that - we can't tell an inode allocation buffer
1821 * apart from a regular buffer, so we can't separate them. We can, however,
1822 * tell an inode unlink buffer from the others, and so we can separate them out
1823 * from all the other buffers and move them to last.
1824 *
1825 * Hence, 4 lists, in order from head to tail:
1826 *	- buffer_list for all buffers except cancelled/inode unlink buffers
1827 *	- item_list for all non-buffer items
1828 *	- inode_buffer_list for inode unlink buffers
1829 *	- cancel_list for the cancelled buffers
1830 *
1831 * Note that we add objects to the tail of the lists so that first-to-last
1832 * ordering is preserved within the lists. Adding objects to the head of the
1833 * list means when we traverse from the head we walk them in last-to-first
1834 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1835 * but for all other items there may be specific ordering that we need to
1836 * preserve.
1837 */
1838STATIC int
1839xlog_recover_reorder_trans(
1840	struct xlog		*log,
1841	struct xlog_recover	*trans,
1842	int			pass)
1843{
1844	xlog_recover_item_t	*item, *n;
1845	int			error = 0;
1846	LIST_HEAD(sort_list);
1847	LIST_HEAD(cancel_list);
1848	LIST_HEAD(buffer_list);
1849	LIST_HEAD(inode_buffer_list);
1850	LIST_HEAD(inode_list);
1851
1852	list_splice_init(&trans->r_itemq, &sort_list);
1853	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1854		xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
1855
1856		switch (ITEM_TYPE(item)) {
1857		case XFS_LI_ICREATE:
1858			list_move_tail(&item->ri_list, &buffer_list);
1859			break;
1860		case XFS_LI_BUF:
1861			if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1862				trace_xfs_log_recover_item_reorder_head(log,
1863							trans, item, pass);
1864				list_move(&item->ri_list, &cancel_list);
1865				break;
1866			}
1867			if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1868				list_move(&item->ri_list, &inode_buffer_list);
1869				break;
1870			}
1871			list_move_tail(&item->ri_list, &buffer_list);
1872			break;
1873		case XFS_LI_INODE:
1874		case XFS_LI_DQUOT:
1875		case XFS_LI_QUOTAOFF:
1876		case XFS_LI_EFD:
1877		case XFS_LI_EFI:
1878		case XFS_LI_RUI:
1879		case XFS_LI_RUD:
1880		case XFS_LI_CUI:
1881		case XFS_LI_CUD:
1882		case XFS_LI_BUI:
1883		case XFS_LI_BUD:
1884			trace_xfs_log_recover_item_reorder_tail(log,
1885							trans, item, pass);
1886			list_move_tail(&item->ri_list, &inode_list);
1887			break;
1888		default:
1889			xfs_warn(log->l_mp,
1890				"%s: unrecognized type of log operation",
1891				__func__);
1892			ASSERT(0);
1893			/*
1894			 * return the remaining items back to the transaction
1895			 * item list so they can be freed in caller.
1896			 */
1897			if (!list_empty(&sort_list))
1898				list_splice_init(&sort_list, &trans->r_itemq);
1899			error = -EIO;
1900			goto out;
1901		}
1902	}
1903out:
1904	ASSERT(list_empty(&sort_list));
1905	if (!list_empty(&buffer_list))
1906		list_splice(&buffer_list, &trans->r_itemq);
1907	if (!list_empty(&inode_list))
1908		list_splice_tail(&inode_list, &trans->r_itemq);
1909	if (!list_empty(&inode_buffer_list))
1910		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1911	if (!list_empty(&cancel_list))
1912		list_splice_tail(&cancel_list, &trans->r_itemq);
1913	return error;
1914}
1915
1916/*
1917 * Build up the table of buf cancel records so that we don't replay
1918 * cancelled data in the second pass.  For buffer records that are
1919 * not cancel records, there is nothing to do here so we just return.
1920 *
1921 * If we get a cancel record which is already in the table, this indicates
1922 * that the buffer was cancelled multiple times.  In order to ensure
1923 * that during pass 2 we keep the record in the table until we reach its
1924 * last occurrence in the log, we keep a reference count in the cancel
1925 * record in the table to tell us how many times we expect to see this
1926 * record during the second pass.
1927 */
1928STATIC int
1929xlog_recover_buffer_pass1(
1930	struct xlog			*log,
1931	struct xlog_recover_item	*item)
1932{
1933	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
1934	struct list_head	*bucket;
1935	struct xfs_buf_cancel	*bcp;
1936
1937	/*
1938	 * If this isn't a cancel buffer item, then just return.
1939	 */
1940	if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1941		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1942		return 0;
1943	}
1944
1945	/*
1946	 * Insert an xfs_buf_cancel record into the hash table of them.
1947	 * If there is already an identical record, bump its reference count.
1948	 */
1949	bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1950	list_for_each_entry(bcp, bucket, bc_list) {
1951		if (bcp->bc_blkno == buf_f->blf_blkno &&
1952		    bcp->bc_len == buf_f->blf_len) {
1953			bcp->bc_refcount++;
1954			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1955			return 0;
1956		}
1957	}
1958
1959	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1960	bcp->bc_blkno = buf_f->blf_blkno;
1961	bcp->bc_len = buf_f->blf_len;
1962	bcp->bc_refcount = 1;
1963	list_add_tail(&bcp->bc_list, bucket);
1964
1965	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1966	return 0;
1967}
1968
1969/*
1970 * Check to see whether the buffer being recovered has a corresponding
1971 * entry in the buffer cancel record table. If it is, return the cancel
1972 * buffer structure to the caller.
1973 */
1974STATIC struct xfs_buf_cancel *
1975xlog_peek_buffer_cancelled(
1976	struct xlog		*log,
1977	xfs_daddr_t		blkno,
1978	uint			len,
1979	unsigned short			flags)
1980{
1981	struct list_head	*bucket;
1982	struct xfs_buf_cancel	*bcp;
1983
1984	if (!log->l_buf_cancel_table) {
1985		/* empty table means no cancelled buffers in the log */
1986		ASSERT(!(flags & XFS_BLF_CANCEL));
1987		return NULL;
1988	}
1989
1990	bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1991	list_for_each_entry(bcp, bucket, bc_list) {
1992		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1993			return bcp;
1994	}
1995
1996	/*
1997	 * We didn't find a corresponding entry in the table, so return 0 so
1998	 * that the buffer is NOT cancelled.
1999	 */
2000	ASSERT(!(flags & XFS_BLF_CANCEL));
2001	return NULL;
2002}
2003
2004/*
2005 * If the buffer is being cancelled then return 1 so that it will be cancelled,
2006 * otherwise return 0.  If the buffer is actually a buffer cancel item
2007 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
2008 * table and remove it from the table if this is the last reference.
2009 *
2010 * We remove the cancel record from the table when we encounter its last
2011 * occurrence in the log so that if the same buffer is re-used again after its
2012 * last cancellation we actually replay the changes made at that point.
2013 */
2014STATIC int
2015xlog_check_buffer_cancelled(
2016	struct xlog		*log,
2017	xfs_daddr_t		blkno,
2018	uint			len,
2019	unsigned short			flags)
2020{
2021	struct xfs_buf_cancel	*bcp;
2022
2023	bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
2024	if (!bcp)
2025		return 0;
2026
2027	/*
2028	 * We've go a match, so return 1 so that the recovery of this buffer
2029	 * is cancelled.  If this buffer is actually a buffer cancel log
2030	 * item, then decrement the refcount on the one in the table and
2031	 * remove it if this is the last reference.
2032	 */
2033	if (flags & XFS_BLF_CANCEL) {
2034		if (--bcp->bc_refcount == 0) {
2035			list_del(&bcp->bc_list);
2036			kmem_free(bcp);
2037		}
2038	}
2039	return 1;
2040}
2041
2042/*
2043 * Perform recovery for a buffer full of inodes.  In these buffers, the only
2044 * data which should be recovered is that which corresponds to the
2045 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
2046 * data for the inodes is always logged through the inodes themselves rather
2047 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2048 *
2049 * The only time when buffers full of inodes are fully recovered is when the
2050 * buffer is full of newly allocated inodes.  In this case the buffer will
2051 * not be marked as an inode buffer and so will be sent to
2052 * xlog_recover_do_reg_buffer() below during recovery.
2053 */
2054STATIC int
2055xlog_recover_do_inode_buffer(
2056	struct xfs_mount	*mp,
2057	xlog_recover_item_t	*item,
2058	struct xfs_buf		*bp,
2059	xfs_buf_log_format_t	*buf_f)
2060{
2061	int			i;
2062	int			item_index = 0;
2063	int			bit = 0;
2064	int			nbits = 0;
2065	int			reg_buf_offset = 0;
2066	int			reg_buf_bytes = 0;
2067	int			next_unlinked_offset;
2068	int			inodes_per_buf;
2069	xfs_agino_t		*logged_nextp;
2070	xfs_agino_t		*buffer_nextp;
2071
2072	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2073
2074	/*
2075	 * Post recovery validation only works properly on CRC enabled
2076	 * filesystems.
2077	 */
2078	if (xfs_sb_version_hascrc(&mp->m_sb))
2079		bp->b_ops = &xfs_inode_buf_ops;
2080
2081	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
2082	for (i = 0; i < inodes_per_buf; i++) {
2083		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2084			offsetof(xfs_dinode_t, di_next_unlinked);
2085
2086		while (next_unlinked_offset >=
2087		       (reg_buf_offset + reg_buf_bytes)) {
2088			/*
2089			 * The next di_next_unlinked field is beyond
2090			 * the current logged region.  Find the next
2091			 * logged region that contains or is beyond
2092			 * the current di_next_unlinked field.
2093			 */
2094			bit += nbits;
2095			bit = xfs_next_bit(buf_f->blf_data_map,
2096					   buf_f->blf_map_size, bit);
2097
2098			/*
2099			 * If there are no more logged regions in the
2100			 * buffer, then we're done.
2101			 */
2102			if (bit == -1)
2103				return 0;
2104
2105			nbits = xfs_contig_bits(buf_f->blf_data_map,
2106						buf_f->blf_map_size, bit);
2107			ASSERT(nbits > 0);
2108			reg_buf_offset = bit << XFS_BLF_SHIFT;
2109			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2110			item_index++;
2111		}
2112
2113		/*
2114		 * If the current logged region starts after the current
2115		 * di_next_unlinked field, then move on to the next
2116		 * di_next_unlinked field.
2117		 */
2118		if (next_unlinked_offset < reg_buf_offset)
2119			continue;
2120
2121		ASSERT(item->ri_buf[item_index].i_addr != NULL);
2122		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2123		ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
2124
2125		/*
2126		 * The current logged region contains a copy of the
2127		 * current di_next_unlinked field.  Extract its value
2128		 * and copy it to the buffer copy.
2129		 */
2130		logged_nextp = item->ri_buf[item_index].i_addr +
2131				next_unlinked_offset - reg_buf_offset;
2132		if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
2133			xfs_alert(mp,
2134		"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
2135		"Trying to replay bad (0) inode di_next_unlinked field.",
2136				item, bp);
2137			return -EFSCORRUPTED;
2138		}
2139
2140		buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2141		*buffer_nextp = *logged_nextp;
2142
2143		/*
2144		 * If necessary, recalculate the CRC in the on-disk inode. We
2145		 * have to leave the inode in a consistent state for whoever
2146		 * reads it next....
2147		 */
2148		xfs_dinode_calc_crc(mp,
2149				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2150
2151	}
2152
2153	return 0;
2154}
2155
2156/*
2157 * V5 filesystems know the age of the buffer on disk being recovered. We can
2158 * have newer objects on disk than we are replaying, and so for these cases we
2159 * don't want to replay the current change as that will make the buffer contents
2160 * temporarily invalid on disk.
2161 *
2162 * The magic number might not match the buffer type we are going to recover
2163 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
2164 * extract the LSN of the existing object in the buffer based on it's current
2165 * magic number.  If we don't recognise the magic number in the buffer, then
2166 * return a LSN of -1 so that the caller knows it was an unrecognised block and
2167 * so can recover the buffer.
2168 *
2169 * Note: we cannot rely solely on magic number matches to determine that the
2170 * buffer has a valid LSN - we also need to verify that it belongs to this
2171 * filesystem, so we need to extract the object's LSN and compare it to that
2172 * which we read from the superblock. If the UUIDs don't match, then we've got a
2173 * stale metadata block from an old filesystem instance that we need to recover
2174 * over the top of.
2175 */
2176static xfs_lsn_t
2177xlog_recover_get_buf_lsn(
2178	struct xfs_mount	*mp,
2179	struct xfs_buf		*bp)
2180{
2181	uint32_t		magic32;
2182	uint16_t		magic16;
2183	uint16_t		magicda;
2184	void			*blk = bp->b_addr;
2185	uuid_t			*uuid;
2186	xfs_lsn_t		lsn = -1;
2187
2188	/* v4 filesystems always recover immediately */
2189	if (!xfs_sb_version_hascrc(&mp->m_sb))
2190		goto recover_immediately;
2191
2192	magic32 = be32_to_cpu(*(__be32 *)blk);
2193	switch (magic32) {
2194	case XFS_ABTB_CRC_MAGIC:
2195	case XFS_ABTC_CRC_MAGIC:
2196	case XFS_ABTB_MAGIC:
2197	case XFS_ABTC_MAGIC:
2198	case XFS_RMAP_CRC_MAGIC:
2199	case XFS_REFC_CRC_MAGIC:
2200	case XFS_IBT_CRC_MAGIC:
2201	case XFS_IBT_MAGIC: {
2202		struct xfs_btree_block *btb = blk;
2203
2204		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2205		uuid = &btb->bb_u.s.bb_uuid;
2206		break;
2207	}
2208	case XFS_BMAP_CRC_MAGIC:
2209	case XFS_BMAP_MAGIC: {
2210		struct xfs_btree_block *btb = blk;
2211
2212		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2213		uuid = &btb->bb_u.l.bb_uuid;
2214		break;
2215	}
2216	case XFS_AGF_MAGIC:
2217		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2218		uuid = &((struct xfs_agf *)blk)->agf_uuid;
2219		break;
2220	case XFS_AGFL_MAGIC:
2221		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2222		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2223		break;
2224	case XFS_AGI_MAGIC:
2225		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2226		uuid = &((struct xfs_agi *)blk)->agi_uuid;
2227		break;
2228	case XFS_SYMLINK_MAGIC:
2229		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2230		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2231		break;
2232	case XFS_DIR3_BLOCK_MAGIC:
2233	case XFS_DIR3_DATA_MAGIC:
2234	case XFS_DIR3_FREE_MAGIC:
2235		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2236		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2237		break;
2238	case XFS_ATTR3_RMT_MAGIC:
2239		/*
2240		 * Remote attr blocks are written synchronously, rather than
2241		 * being logged. That means they do not contain a valid LSN
2242		 * (i.e. transactionally ordered) in them, and hence any time we
2243		 * see a buffer to replay over the top of a remote attribute
2244		 * block we should simply do so.
2245		 */
2246		goto recover_immediately;
2247	case XFS_SB_MAGIC:
2248		/*
2249		 * superblock uuids are magic. We may or may not have a
2250		 * sb_meta_uuid on disk, but it will be set in the in-core
2251		 * superblock. We set the uuid pointer for verification
2252		 * according to the superblock feature mask to ensure we check
2253		 * the relevant UUID in the superblock.
2254		 */
2255		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2256		if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2257			uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2258		else
2259			uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2260		break;
2261	default:
2262		break;
2263	}
2264
2265	if (lsn != (xfs_lsn_t)-1) {
2266		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2267			goto recover_immediately;
2268		return lsn;
2269	}
2270
2271	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2272	switch (magicda) {
2273	case XFS_DIR3_LEAF1_MAGIC:
2274	case XFS_DIR3_LEAFN_MAGIC:
2275	case XFS_DA3_NODE_MAGIC:
2276		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2277		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2278		break;
2279	default:
2280		break;
2281	}
2282
2283	if (lsn != (xfs_lsn_t)-1) {
2284		if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2285			goto recover_immediately;
2286		return lsn;
2287	}
2288
2289	/*
2290	 * We do individual object checks on dquot and inode buffers as they
2291	 * have their own individual LSN records. Also, we could have a stale
2292	 * buffer here, so we have to at least recognise these buffer types.
2293	 *
2294	 * A notd complexity here is inode unlinked list processing - it logs
2295	 * the inode directly in the buffer, but we don't know which inodes have
2296	 * been modified, and there is no global buffer LSN. Hence we need to
2297	 * recover all inode buffer types immediately. This problem will be
2298	 * fixed by logical logging of the unlinked list modifications.
2299	 */
2300	magic16 = be16_to_cpu(*(__be16 *)blk);
2301	switch (magic16) {
2302	case XFS_DQUOT_MAGIC:
2303	case XFS_DINODE_MAGIC:
2304		goto recover_immediately;
2305	default:
2306		break;
2307	}
2308
2309	/* unknown buffer contents, recover immediately */
2310
2311recover_immediately:
2312	return (xfs_lsn_t)-1;
2313
2314}
2315
2316/*
2317 * Validate the recovered buffer is of the correct type and attach the
2318 * appropriate buffer operations to them for writeback. Magic numbers are in a
2319 * few places:
2320 *	the first 16 bits of the buffer (inode buffer, dquot buffer),
2321 *	the first 32 bits of the buffer (most blocks),
2322 *	inside a struct xfs_da_blkinfo at the start of the buffer.
2323 */
2324static void
2325xlog_recover_validate_buf_type(
2326	struct xfs_mount	*mp,
2327	struct xfs_buf		*bp,
2328	xfs_buf_log_format_t	*buf_f,
2329	xfs_lsn_t		current_lsn)
2330{
2331	struct xfs_da_blkinfo	*info = bp->b_addr;
2332	uint32_t		magic32;
2333	uint16_t		magic16;
2334	uint16_t		magicda;
2335	char			*warnmsg = NULL;
2336
2337	/*
2338	 * We can only do post recovery validation on items on CRC enabled
2339	 * fielsystems as we need to know when the buffer was written to be able
2340	 * to determine if we should have replayed the item. If we replay old
2341	 * metadata over a newer buffer, then it will enter a temporarily
2342	 * inconsistent state resulting in verification failures. Hence for now
2343	 * just avoid the verification stage for non-crc filesystems
2344	 */
2345	if (!xfs_sb_version_hascrc(&mp->m_sb))
2346		return;
2347
2348	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2349	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2350	magicda = be16_to_cpu(info->magic);
2351	switch (xfs_blft_from_flags(buf_f)) {
2352	case XFS_BLFT_BTREE_BUF:
2353		switch (magic32) {
2354		case XFS_ABTB_CRC_MAGIC:
2355		case XFS_ABTB_MAGIC:
2356			bp->b_ops = &xfs_bnobt_buf_ops;
2357			break;
2358		case XFS_ABTC_CRC_MAGIC:
2359		case XFS_ABTC_MAGIC:
2360			bp->b_ops = &xfs_cntbt_buf_ops;
2361			break;
2362		case XFS_IBT_CRC_MAGIC:
2363		case XFS_IBT_MAGIC:
2364			bp->b_ops = &xfs_inobt_buf_ops;
2365			break;
2366		case XFS_FIBT_CRC_MAGIC:
2367		case XFS_FIBT_MAGIC:
2368			bp->b_ops = &xfs_finobt_buf_ops;
2369			break;
2370		case XFS_BMAP_CRC_MAGIC:
2371		case XFS_BMAP_MAGIC:
2372			bp->b_ops = &xfs_bmbt_buf_ops;
2373			break;
2374		case XFS_RMAP_CRC_MAGIC:
2375			bp->b_ops = &xfs_rmapbt_buf_ops;
2376			break;
2377		case XFS_REFC_CRC_MAGIC:
2378			bp->b_ops = &xfs_refcountbt_buf_ops;
2379			break;
2380		default:
2381			warnmsg = "Bad btree block magic!";
2382			break;
2383		}
2384		break;
2385	case XFS_BLFT_AGF_BUF:
2386		if (magic32 != XFS_AGF_MAGIC) {
2387			warnmsg = "Bad AGF block magic!";
2388			break;
2389		}
2390		bp->b_ops = &xfs_agf_buf_ops;
2391		break;
2392	case XFS_BLFT_AGFL_BUF:
2393		if (magic32 != XFS_AGFL_MAGIC) {
2394			warnmsg = "Bad AGFL block magic!";
2395			break;
2396		}
2397		bp->b_ops = &xfs_agfl_buf_ops;
2398		break;
2399	case XFS_BLFT_AGI_BUF:
2400		if (magic32 != XFS_AGI_MAGIC) {
2401			warnmsg = "Bad AGI block magic!";
2402			break;
2403		}
2404		bp->b_ops = &xfs_agi_buf_ops;
2405		break;
2406	case XFS_BLFT_UDQUOT_BUF:
2407	case XFS_BLFT_PDQUOT_BUF:
2408	case XFS_BLFT_GDQUOT_BUF:
2409#ifdef CONFIG_XFS_QUOTA
2410		if (magic16 != XFS_DQUOT_MAGIC) {
2411			warnmsg = "Bad DQUOT block magic!";
2412			break;
2413		}
2414		bp->b_ops = &xfs_dquot_buf_ops;
2415#else
2416		xfs_alert(mp,
2417	"Trying to recover dquots without QUOTA support built in!");
2418		ASSERT(0);
2419#endif
2420		break;
2421	case XFS_BLFT_DINO_BUF:
2422		if (magic16 != XFS_DINODE_MAGIC) {
2423			warnmsg = "Bad INODE block magic!";
2424			break;
2425		}
2426		bp->b_ops = &xfs_inode_buf_ops;
2427		break;
2428	case XFS_BLFT_SYMLINK_BUF:
2429		if (magic32 != XFS_SYMLINK_MAGIC) {
2430			warnmsg = "Bad symlink block magic!";
2431			break;
2432		}
2433		bp->b_ops = &xfs_symlink_buf_ops;
2434		break;
2435	case XFS_BLFT_DIR_BLOCK_BUF:
2436		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2437		    magic32 != XFS_DIR3_BLOCK_MAGIC) {
2438			warnmsg = "Bad dir block magic!";
2439			break;
2440		}
2441		bp->b_ops = &xfs_dir3_block_buf_ops;
2442		break;
2443	case XFS_BLFT_DIR_DATA_BUF:
2444		if (magic32 != XFS_DIR2_DATA_MAGIC &&
2445		    magic32 != XFS_DIR3_DATA_MAGIC) {
2446			warnmsg = "Bad dir data magic!";
2447			break;
2448		}
2449		bp->b_ops = &xfs_dir3_data_buf_ops;
2450		break;
2451	case XFS_BLFT_DIR_FREE_BUF:
2452		if (magic32 != XFS_DIR2_FREE_MAGIC &&
2453		    magic32 != XFS_DIR3_FREE_MAGIC) {
2454			warnmsg = "Bad dir3 free magic!";
2455			break;
2456		}
2457		bp->b_ops = &xfs_dir3_free_buf_ops;
2458		break;
2459	case XFS_BLFT_DIR_LEAF1_BUF:
2460		if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2461		    magicda != XFS_DIR3_LEAF1_MAGIC) {
2462			warnmsg = "Bad dir leaf1 magic!";
2463			break;
2464		}
2465		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2466		break;
2467	case XFS_BLFT_DIR_LEAFN_BUF:
2468		if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2469		    magicda != XFS_DIR3_LEAFN_MAGIC) {
2470			warnmsg = "Bad dir leafn magic!";
2471			break;
2472		}
2473		bp->b_ops = &xfs_dir3_leafn_buf_ops;
2474		break;
2475	case XFS_BLFT_DA_NODE_BUF:
2476		if (magicda != XFS_DA_NODE_MAGIC &&
2477		    magicda != XFS_DA3_NODE_MAGIC) {
2478			warnmsg = "Bad da node magic!";
2479			break;
2480		}
2481		bp->b_ops = &xfs_da3_node_buf_ops;
2482		break;
2483	case XFS_BLFT_ATTR_LEAF_BUF:
2484		if (magicda != XFS_ATTR_LEAF_MAGIC &&
2485		    magicda != XFS_ATTR3_LEAF_MAGIC) {
2486			warnmsg = "Bad attr leaf magic!";
2487			break;
2488		}
2489		bp->b_ops = &xfs_attr3_leaf_buf_ops;
2490		break;
2491	case XFS_BLFT_ATTR_RMT_BUF:
2492		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2493			warnmsg = "Bad attr remote magic!";
2494			break;
2495		}
2496		bp->b_ops = &xfs_attr3_rmt_buf_ops;
2497		break;
2498	case XFS_BLFT_SB_BUF:
2499		if (magic32 != XFS_SB_MAGIC) {
2500			warnmsg = "Bad SB block magic!";
2501			break;
2502		}
2503		bp->b_ops = &xfs_sb_buf_ops;
2504		break;
2505#ifdef CONFIG_XFS_RT
2506	case XFS_BLFT_RTBITMAP_BUF:
2507	case XFS_BLFT_RTSUMMARY_BUF:
2508		/* no magic numbers for verification of RT buffers */
2509		bp->b_ops = &xfs_rtbuf_ops;
2510		break;
2511#endif /* CONFIG_XFS_RT */
2512	default:
2513		xfs_warn(mp, "Unknown buffer type %d!",
2514			 xfs_blft_from_flags(buf_f));
2515		break;
2516	}
2517
2518	/*
2519	 * Nothing else to do in the case of a NULL current LSN as this means
2520	 * the buffer is more recent than the change in the log and will be
2521	 * skipped.
2522	 */
2523	if (current_lsn == NULLCOMMITLSN)
2524		return;
2525
2526	if (warnmsg) {
2527		xfs_warn(mp, warnmsg);
2528		ASSERT(0);
2529	}
2530
2531	/*
2532	 * We must update the metadata LSN of the buffer as it is written out to
2533	 * ensure that older transactions never replay over this one and corrupt
2534	 * the buffer. This can occur if log recovery is interrupted at some
2535	 * point after the current transaction completes, at which point a
2536	 * subsequent mount starts recovery from the beginning.
2537	 *
2538	 * Write verifiers update the metadata LSN from log items attached to
2539	 * the buffer. Therefore, initialize a bli purely to carry the LSN to
2540	 * the verifier. We'll clean it up in our ->iodone() callback.
2541	 */
2542	if (bp->b_ops) {
2543		struct xfs_buf_log_item	*bip;
2544
2545		ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2546		bp->b_iodone = xlog_recover_iodone;
2547		xfs_buf_item_init(bp, mp);
2548		bip = bp->b_log_item;
2549		bip->bli_item.li_lsn = current_lsn;
2550	}
2551}
2552
2553/*
2554 * Perform a 'normal' buffer recovery.  Each logged region of the
2555 * buffer should be copied over the corresponding region in the
2556 * given buffer.  The bitmap in the buf log format structure indicates
2557 * where to place the logged data.
2558 */
2559STATIC void
2560xlog_recover_do_reg_buffer(
2561	struct xfs_mount	*mp,
2562	xlog_recover_item_t	*item,
2563	struct xfs_buf		*bp,
2564	xfs_buf_log_format_t	*buf_f,
2565	xfs_lsn_t		current_lsn)
2566{
2567	int			i;
2568	int			bit;
2569	int			nbits;
2570	xfs_failaddr_t		fa;
2571	const size_t		size_disk_dquot = sizeof(struct xfs_disk_dquot);
2572
2573	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2574
2575	bit = 0;
2576	i = 1;  /* 0 is the buf format structure */
2577	while (1) {
2578		bit = xfs_next_bit(buf_f->blf_data_map,
2579				   buf_f->blf_map_size, bit);
2580		if (bit == -1)
2581			break;
2582		nbits = xfs_contig_bits(buf_f->blf_data_map,
2583					buf_f->blf_map_size, bit);
2584		ASSERT(nbits > 0);
2585		ASSERT(item->ri_buf[i].i_addr != NULL);
2586		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2587		ASSERT(BBTOB(bp->b_length) >=
2588		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2589
2590		/*
2591		 * The dirty regions logged in the buffer, even though
2592		 * contiguous, may span multiple chunks. This is because the
2593		 * dirty region may span a physical page boundary in a buffer
2594		 * and hence be split into two separate vectors for writing into
2595		 * the log. Hence we need to trim nbits back to the length of
2596		 * the current region being copied out of the log.
2597		 */
2598		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2599			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2600
2601		/*
2602		 * Do a sanity check if this is a dquot buffer. Just checking
2603		 * the first dquot in the buffer should do. XXXThis is
2604		 * probably a good thing to do for other buf types also.
2605		 */
2606		fa = NULL;
2607		if (buf_f->blf_flags &
2608		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2609			if (item->ri_buf[i].i_addr == NULL) {
2610				xfs_alert(mp,
2611					"XFS: NULL dquot in %s.", __func__);
2612				goto next;
2613			}
2614			if (item->ri_buf[i].i_len < size_disk_dquot) {
2615				xfs_alert(mp,
2616					"XFS: dquot too small (%d) in %s.",
2617					item->ri_buf[i].i_len, __func__);
2618				goto next;
2619			}
2620			fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
2621					       -1, 0);
2622			if (fa) {
2623				xfs_alert(mp,
2624	"dquot corrupt at %pS trying to replay into block 0x%llx",
2625					fa, bp->b_bn);
2626				goto next;
2627			}
2628		}
2629
2630		memcpy(xfs_buf_offset(bp,
2631			(uint)bit << XFS_BLF_SHIFT),	/* dest */
2632			item->ri_buf[i].i_addr,		/* source */
2633			nbits<<XFS_BLF_SHIFT);		/* length */
2634 next:
2635		i++;
2636		bit += nbits;
2637	}
2638
2639	/* Shouldn't be any more regions */
2640	ASSERT(i == item->ri_total);
2641
2642	xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2643}
2644
2645/*
2646 * Perform a dquot buffer recovery.
2647 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2648 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2649 * Else, treat it as a regular buffer and do recovery.
2650 *
2651 * Return false if the buffer was tossed and true if we recovered the buffer to
2652 * indicate to the caller if the buffer needs writing.
2653 */
2654STATIC bool
2655xlog_recover_do_dquot_buffer(
2656	struct xfs_mount		*mp,
2657	struct xlog			*log,
2658	struct xlog_recover_item	*item,
2659	struct xfs_buf			*bp,
2660	struct xfs_buf_log_format	*buf_f)
2661{
2662	uint			type;
2663
2664	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2665
2666	/*
2667	 * Filesystems are required to send in quota flags at mount time.
2668	 */
2669	if (!mp->m_qflags)
2670		return false;
2671
2672	type = 0;
2673	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2674		type |= XFS_DQ_USER;
2675	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2676		type |= XFS_DQ_PROJ;
2677	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2678		type |= XFS_DQ_GROUP;
2679	/*
2680	 * This type of quotas was turned off, so ignore this buffer
2681	 */
2682	if (log->l_quotaoffs_flag & type)
2683		return false;
2684
2685	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2686	return true;
2687}
2688
2689/*
2690 * This routine replays a modification made to a buffer at runtime.
2691 * There are actually two types of buffer, regular and inode, which
2692 * are handled differently.  Inode buffers are handled differently
2693 * in that we only recover a specific set of data from them, namely
2694 * the inode di_next_unlinked fields.  This is because all other inode
2695 * data is actually logged via inode records and any data we replay
2696 * here which overlaps that may be stale.
2697 *
2698 * When meta-data buffers are freed at run time we log a buffer item
2699 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2700 * of the buffer in the log should not be replayed at recovery time.
2701 * This is so that if the blocks covered by the buffer are reused for
2702 * file data before we crash we don't end up replaying old, freed
2703 * meta-data into a user's file.
2704 *
2705 * To handle the cancellation of buffer log items, we make two passes
2706 * over the log during recovery.  During the first we build a table of
2707 * those buffers which have been cancelled, and during the second we
2708 * only replay those buffers which do not have corresponding cancel
2709 * records in the table.  See xlog_recover_buffer_pass[1,2] above
2710 * for more details on the implementation of the table of cancel records.
2711 */
2712STATIC int
2713xlog_recover_buffer_pass2(
2714	struct xlog			*log,
2715	struct list_head		*buffer_list,
2716	struct xlog_recover_item	*item,
2717	xfs_lsn_t			current_lsn)
2718{
2719	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
2720	xfs_mount_t		*mp = log->l_mp;
2721	xfs_buf_t		*bp;
2722	int			error;
2723	uint			buf_flags;
2724	xfs_lsn_t		lsn;
2725
2726	/*
2727	 * In this pass we only want to recover all the buffers which have
2728	 * not been cancelled and are not cancellation buffers themselves.
2729	 */
2730	if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2731			buf_f->blf_len, buf_f->blf_flags)) {
2732		trace_xfs_log_recover_buf_cancel(log, buf_f);
2733		return 0;
2734	}
2735
2736	trace_xfs_log_recover_buf_recover(log, buf_f);
2737
2738	buf_flags = 0;
2739	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2740		buf_flags |= XBF_UNMAPPED;
2741
2742	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2743			  buf_flags, NULL);
2744	if (!bp)
2745		return -ENOMEM;
2746	error = bp->b_error;
2747	if (error) {
2748		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2749		goto out_release;
2750	}
2751
2752	/*
2753	 * Recover the buffer only if we get an LSN from it and it's less than
2754	 * the lsn of the transaction we are replaying.
2755	 *
2756	 * Note that we have to be extremely careful of readahead here.
2757	 * Readahead does not attach verfiers to the buffers so if we don't
2758	 * actually do any replay after readahead because of the LSN we found
2759	 * in the buffer if more recent than that current transaction then we
2760	 * need to attach the verifier directly. Failure to do so can lead to
2761	 * future recovery actions (e.g. EFI and unlinked list recovery) can
2762	 * operate on the buffers and they won't get the verifier attached. This
2763	 * can lead to blocks on disk having the correct content but a stale
2764	 * CRC.
2765	 *
2766	 * It is safe to assume these clean buffers are currently up to date.
2767	 * If the buffer is dirtied by a later transaction being replayed, then
2768	 * the verifier will be reset to match whatever recover turns that
2769	 * buffer into.
2770	 */
2771	lsn = xlog_recover_get_buf_lsn(mp, bp);
2772	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2773		trace_xfs_log_recover_buf_skip(log, buf_f);
2774		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2775		goto out_release;
2776	}
2777
2778	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2779		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2780		if (error)
2781			goto out_release;
2782	} else if (buf_f->blf_flags &
2783		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2784		bool	dirty;
2785
2786		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2787		if (!dirty)
2788			goto out_release;
2789	} else {
2790		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2791	}
2792
2793	/*
2794	 * Perform delayed write on the buffer.  Asynchronous writes will be
2795	 * slower when taking into account all the buffers to be flushed.
2796	 *
2797	 * Also make sure that only inode buffers with good sizes stay in
2798	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
2799	 * or inode_cluster_size bytes, whichever is bigger.  The inode
2800	 * buffers in the log can be a different size if the log was generated
2801	 * by an older kernel using unclustered inode buffers or a newer kernel
2802	 * running with a different inode cluster size.  Regardless, if the
2803	 * the inode buffer size isn't max(blocksize, inode_cluster_size)
2804	 * for *our* value of inode_cluster_size, then we need to keep
2805	 * the buffer out of the buffer cache so that the buffer won't
2806	 * overlap with future reads of those inodes.
2807	 */
2808	if (XFS_DINODE_MAGIC ==
2809	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2810	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
2811		xfs_buf_stale(bp);
2812		error = xfs_bwrite(bp);
2813	} else {
2814		ASSERT(bp->b_mount == mp);
2815		bp->b_iodone = xlog_recover_iodone;
2816		xfs_buf_delwri_queue(bp, buffer_list);
2817	}
2818
2819out_release:
2820	xfs_buf_relse(bp);
2821	return error;
2822}
2823
2824/*
2825 * Inode fork owner changes
2826 *
2827 * If we have been told that we have to reparent the inode fork, it's because an
2828 * extent swap operation on a CRC enabled filesystem has been done and we are
2829 * replaying it. We need to walk the BMBT of the appropriate fork and change the
2830 * owners of it.
2831 *
2832 * The complexity here is that we don't have an inode context to work with, so
2833 * after we've replayed the inode we need to instantiate one.  This is where the
2834 * fun begins.
2835 *
2836 * We are in the middle of log recovery, so we can't run transactions. That
2837 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2838 * that will result in the corresponding iput() running the inode through
2839 * xfs_inactive(). If we've just replayed an inode core that changes the link
2840 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2841 * transactions (bad!).
2842 *
2843 * So, to avoid this, we instantiate an inode directly from the inode core we've
2844 * just recovered. We have the buffer still locked, and all we really need to
2845 * instantiate is the inode core and the forks being modified. We can do this
2846 * manually, then run the inode btree owner change, and then tear down the
2847 * xfs_inode without having to run any transactions at all.
2848 *
2849 * Also, because we don't have a transaction context available here but need to
2850 * gather all the buffers we modify for writeback so we pass the buffer_list
2851 * instead for the operation to use.
2852 */
2853
2854STATIC int
2855xfs_recover_inode_owner_change(
2856	struct xfs_mount	*mp,
2857	struct xfs_dinode	*dip,
2858	struct xfs_inode_log_format *in_f,
2859	struct list_head	*buffer_list)
2860{
2861	struct xfs_inode	*ip;
2862	int			error;
2863
2864	ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2865
2866	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2867	if (!ip)
2868		return -ENOMEM;
2869
2870	/* instantiate the inode */
2871	xfs_inode_from_disk(ip, dip);
2872	ASSERT(ip->i_d.di_version >= 3);
2873
2874	error = xfs_iformat_fork(ip, dip);
2875	if (error)
2876		goto out_free_ip;
2877
2878	if (!xfs_inode_verify_forks(ip)) {
2879		error = -EFSCORRUPTED;
2880		goto out_free_ip;
2881	}
2882
2883	if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2884		ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2885		error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2886					      ip->i_ino, buffer_list);
2887		if (error)
2888			goto out_free_ip;
2889	}
2890
2891	if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2892		ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2893		error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2894					      ip->i_ino, buffer_list);
2895		if (error)
2896			goto out_free_ip;
2897	}
2898
2899out_free_ip:
2900	xfs_inode_free(ip);
2901	return error;
2902}
2903
2904STATIC int
2905xlog_recover_inode_pass2(
2906	struct xlog			*log,
2907	struct list_head		*buffer_list,
2908	struct xlog_recover_item	*item,
2909	xfs_lsn_t			current_lsn)
2910{
2911	struct xfs_inode_log_format	*in_f;
2912	xfs_mount_t		*mp = log->l_mp;
2913	xfs_buf_t		*bp;
2914	xfs_dinode_t		*dip;
2915	int			len;
2916	char			*src;
2917	char			*dest;
2918	int			error;
2919	int			attr_index;
2920	uint			fields;
2921	struct xfs_log_dinode	*ldip;
2922	uint			isize;
2923	int			need_free = 0;
2924
2925	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
2926		in_f = item->ri_buf[0].i_addr;
2927	} else {
2928		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
2929		need_free = 1;
2930		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2931		if (error)
2932			goto error;
2933	}
2934
2935	/*
2936	 * Inode buffers can be freed, look out for it,
2937	 * and do not replay the inode.
2938	 */
2939	if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2940					in_f->ilf_len, 0)) {
2941		error = 0;
2942		trace_xfs_log_recover_inode_cancel(log, in_f);
2943		goto error;
2944	}
2945	trace_xfs_log_recover_inode_recover(log, in_f);
2946
2947	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2948			  &xfs_inode_buf_ops);
2949	if (!bp) {
2950		error = -ENOMEM;
2951		goto error;
2952	}
2953	error = bp->b_error;
2954	if (error) {
2955		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2956		goto out_release;
2957	}
2958	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2959	dip = xfs_buf_offset(bp, in_f->ilf_boffset);
2960
2961	/*
2962	 * Make sure the place we're flushing out to really looks
2963	 * like an inode!
2964	 */
2965	if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
2966		xfs_alert(mp,
2967	"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
2968			__func__, dip, bp, in_f->ilf_ino);
2969		error = -EFSCORRUPTED;
2970		goto out_release;
2971	}
2972	ldip = item->ri_buf[1].i_addr;
2973	if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
2974		xfs_alert(mp,
2975			"%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
2976			__func__, item, in_f->ilf_ino);
2977		error = -EFSCORRUPTED;
2978		goto out_release;
2979	}
2980
2981	/*
2982	 * If the inode has an LSN in it, recover the inode only if it's less
2983	 * than the lsn of the transaction we are replaying. Note: we still
2984	 * need to replay an owner change even though the inode is more recent
2985	 * than the transaction as there is no guarantee that all the btree
2986	 * blocks are more recent than this transaction, too.
2987	 */
2988	if (dip->di_version >= 3) {
2989		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
2990
2991		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2992			trace_xfs_log_recover_inode_skip(log, in_f);
2993			error = 0;
2994			goto out_owner_change;
2995		}
2996	}
2997
2998	/*
2999	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
3000	 * are transactional and if ordering is necessary we can determine that
3001	 * more accurately by the LSN field in the V3 inode core. Don't trust
3002	 * the inode versions we might be changing them here - use the
3003	 * superblock flag to determine whether we need to look at di_flushiter
3004	 * to skip replay when the on disk inode is newer than the log one
3005	 */
3006	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
3007	    ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
3008		/*
3009		 * Deal with the wrap case, DI_MAX_FLUSH is less
3010		 * than smaller numbers
3011		 */
3012		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
3013		    ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
3014			/* do nothing */
3015		} else {
3016			trace_xfs_log_recover_inode_skip(log, in_f);
3017			error = 0;
3018			goto out_release;
3019		}
3020	}
3021
3022	/* Take the opportunity to reset the flush iteration count */
3023	ldip->di_flushiter = 0;
3024
3025	if (unlikely(S_ISREG(ldip->di_mode))) {
3026		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3027		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
3028			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
3029					 XFS_ERRLEVEL_LOW, mp, ldip,
3030					 sizeof(*ldip));
3031			xfs_alert(mp,
3032		"%s: Bad regular inode log record, rec ptr "PTR_FMT", "
3033		"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3034				__func__, item, dip, bp, in_f->ilf_ino);
3035			error = -EFSCORRUPTED;
3036			goto out_release;
3037		}
3038	} else if (unlikely(S_ISDIR(ldip->di_mode))) {
3039		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3040		    (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
3041		    (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
3042			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
3043					     XFS_ERRLEVEL_LOW, mp, ldip,
3044					     sizeof(*ldip));
3045			xfs_alert(mp,
3046		"%s: Bad dir inode log record, rec ptr "PTR_FMT", "
3047		"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3048				__func__, item, dip, bp, in_f->ilf_ino);
3049			error = -EFSCORRUPTED;
3050			goto out_release;
3051		}
3052	}
3053	if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
3054		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
3055				     XFS_ERRLEVEL_LOW, mp, ldip,
3056				     sizeof(*ldip));
3057		xfs_alert(mp,
3058	"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3059	"dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
3060			__func__, item, dip, bp, in_f->ilf_ino,
3061			ldip->di_nextents + ldip->di_anextents,
3062			ldip->di_nblocks);
3063		error = -EFSCORRUPTED;
3064		goto out_release;
3065	}
3066	if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
3067		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
3068				     XFS_ERRLEVEL_LOW, mp, ldip,
3069				     sizeof(*ldip));
3070		xfs_alert(mp,
3071	"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3072	"dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
3073			item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
3074		error = -EFSCORRUPTED;
3075		goto out_release;
3076	}
3077	isize = xfs_log_dinode_size(ldip->di_version);
3078	if (unlikely(item->ri_buf[1].i_len > isize)) {
3079		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3080				     XFS_ERRLEVEL_LOW, mp, ldip,
3081				     sizeof(*ldip));
3082		xfs_alert(mp,
3083			"%s: Bad inode log record length %d, rec ptr "PTR_FMT,
3084			__func__, item->ri_buf[1].i_len, item);
3085		error = -EFSCORRUPTED;
3086		goto out_release;
3087	}
3088
3089	/* recover the log dinode inode into the on disk inode */
3090	xfs_log_dinode_to_disk(ldip, dip);
3091
3092	fields = in_f->ilf_fields;
3093	if (fields & XFS_ILOG_DEV)
3094		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3095
3096	if (in_f->ilf_size == 2)
3097		goto out_owner_change;
3098	len = item->ri_buf[2].i_len;
3099	src = item->ri_buf[2].i_addr;
3100	ASSERT(in_f->ilf_size <= 4);
3101	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3102	ASSERT(!(fields & XFS_ILOG_DFORK) ||
3103	       (len == in_f->ilf_dsize));
3104
3105	switch (fields & XFS_ILOG_DFORK) {
3106	case XFS_ILOG_DDATA:
3107	case XFS_ILOG_DEXT:
3108		memcpy(XFS_DFORK_DPTR(dip), src, len);
3109		break;
3110
3111	case XFS_ILOG_DBROOT:
3112		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3113				 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3114				 XFS_DFORK_DSIZE(dip, mp));
3115		break;
3116
3117	default:
3118		/*
3119		 * There are no data fork flags set.
3120		 */
3121		ASSERT((fields & XFS_ILOG_DFORK) == 0);
3122		break;
3123	}
3124
3125	/*
3126	 * If we logged any attribute data, recover it.  There may or
3127	 * may not have been any other non-core data logged in this
3128	 * transaction.
3129	 */
3130	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3131		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3132			attr_index = 3;
3133		} else {
3134			attr_index = 2;
3135		}
3136		len = item->ri_buf[attr_index].i_len;
3137		src = item->ri_buf[attr_index].i_addr;
3138		ASSERT(len == in_f->ilf_asize);
3139
3140		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3141		case XFS_ILOG_ADATA:
3142		case XFS_ILOG_AEXT:
3143			dest = XFS_DFORK_APTR(dip);
3144			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3145			memcpy(dest, src, len);
3146			break;
3147
3148		case XFS_ILOG_ABROOT:
3149			dest = XFS_DFORK_APTR(dip);
3150			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3151					 len, (xfs_bmdr_block_t*)dest,
3152					 XFS_DFORK_ASIZE(dip, mp));
3153			break;
3154
3155		default:
3156			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3157			ASSERT(0);
3158			error = -EFSCORRUPTED;
3159			goto out_release;
3160		}
3161	}
3162
3163out_owner_change:
3164	/* Recover the swapext owner change unless inode has been deleted */
3165	if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
3166	    (dip->di_mode != 0))
3167		error = xfs_recover_inode_owner_change(mp, dip, in_f,
3168						       buffer_list);
3169	/* re-generate the checksum. */
3170	xfs_dinode_calc_crc(log->l_mp, dip);
3171
3172	ASSERT(bp->b_mount == mp);
3173	bp->b_iodone = xlog_recover_iodone;
3174	xfs_buf_delwri_queue(bp, buffer_list);
3175
3176out_release:
3177	xfs_buf_relse(bp);
3178error:
3179	if (need_free)
3180		kmem_free(in_f);
3181	return error;
3182}
3183
3184/*
3185 * Recover QUOTAOFF records. We simply make a note of it in the xlog
3186 * structure, so that we know not to do any dquot item or dquot buffer recovery,
3187 * of that type.
3188 */
3189STATIC int
3190xlog_recover_quotaoff_pass1(
3191	struct xlog			*log,
3192	struct xlog_recover_item	*item)
3193{
3194	xfs_qoff_logformat_t	*qoff_f = item->ri_buf[0].i_addr;
3195	ASSERT(qoff_f);
3196
3197	/*
3198	 * The logitem format's flag tells us if this was user quotaoff,
3199	 * group/project quotaoff or both.
3200	 */
3201	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3202		log->l_quotaoffs_flag |= XFS_DQ_USER;
3203	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3204		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3205	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3206		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3207
3208	return 0;
3209}
3210
3211/*
3212 * Recover a dquot record
3213 */
3214STATIC int
3215xlog_recover_dquot_pass2(
3216	struct xlog			*log,
3217	struct list_head		*buffer_list,
3218	struct xlog_recover_item	*item,
3219	xfs_lsn_t			current_lsn)
3220{
3221	xfs_mount_t		*mp = log->l_mp;
3222	xfs_buf_t		*bp;
3223	struct xfs_disk_dquot	*ddq, *recddq;
3224	xfs_failaddr_t		fa;
3225	int			error;
3226	xfs_dq_logformat_t	*dq_f;
3227	uint			type;
3228
3229
3230	/*
3231	 * Filesystems are required to send in quota flags at mount time.
3232	 */
3233	if (mp->m_qflags == 0)
3234		return 0;
3235
3236	recddq = item->ri_buf[1].i_addr;
3237	if (recddq == NULL) {
3238		xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3239		return -EFSCORRUPTED;
3240	}
3241	if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
3242		xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3243			item->ri_buf[1].i_len, __func__);
3244		return -EFSCORRUPTED;
3245	}
3246
3247	/*
3248	 * This type of quotas was turned off, so ignore this record.
3249	 */
3250	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3251	ASSERT(type);
3252	if (log->l_quotaoffs_flag & type)
3253		return 0;
3254
3255	/*
3256	 * At this point we know that quota was _not_ turned off.
3257	 * Since the mount flags are not indicating to us otherwise, this
3258	 * must mean that quota is on, and the dquot needs to be replayed.
3259	 * Remember that we may not have fully recovered the superblock yet,
3260	 * so we can't do the usual trick of looking at the SB quota bits.
3261	 *
3262	 * The other possibility, of course, is that the quota subsystem was
3263	 * removed since the last mount - ENOSYS.
3264	 */
3265	dq_f = item->ri_buf[0].i_addr;
3266	ASSERT(dq_f);
3267	fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
3268	if (fa) {
3269		xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
3270				dq_f->qlf_id, fa);
3271		return -EFSCORRUPTED;
3272	}
3273	ASSERT(dq_f->qlf_len == 1);
3274
3275	/*
3276	 * At this point we are assuming that the dquots have been allocated
3277	 * and hence the buffer has valid dquots stamped in it. It should,
3278	 * therefore, pass verifier validation. If the dquot is bad, then the
3279	 * we'll return an error here, so we don't need to specifically check
3280	 * the dquot in the buffer after the verifier has run.
3281	 */
3282	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3283				   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3284				   &xfs_dquot_buf_ops);
3285	if (error)
3286		return error;
3287
3288	ASSERT(bp);
3289	ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3290
3291	/*
3292	 * If the dquot has an LSN in it, recover the dquot only if it's less
3293	 * than the lsn of the transaction we are replaying.
3294	 */
3295	if (xfs_sb_version_hascrc(&mp->m_sb)) {
3296		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3297		xfs_lsn_t	lsn = be64_to_cpu(dqb->dd_lsn);
3298
3299		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3300			goto out_release;
3301		}
3302	}
3303
3304	memcpy(ddq, recddq, item->ri_buf[1].i_len);
3305	if (xfs_sb_version_hascrc(&mp->m_sb)) {
3306		xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3307				 XFS_DQUOT_CRC_OFF);
3308	}
3309
3310	ASSERT(dq_f->qlf_size == 2);
3311	ASSERT(bp->b_mount == mp);
3312	bp->b_iodone = xlog_recover_iodone;
3313	xfs_buf_delwri_queue(bp, buffer_list);
3314
3315out_release:
3316	xfs_buf_relse(bp);
3317	return 0;
3318}
3319
3320/*
3321 * This routine is called to create an in-core extent free intent
3322 * item from the efi format structure which was logged on disk.
3323 * It allocates an in-core efi, copies the extents from the format
3324 * structure into it, and adds the efi to the AIL with the given
3325 * LSN.
3326 */
3327STATIC int
3328xlog_recover_efi_pass2(
3329	struct xlog			*log,
3330	struct xlog_recover_item	*item,
3331	xfs_lsn_t			lsn)
3332{
3333	int				error;
3334	struct xfs_mount		*mp = log->l_mp;
3335	struct xfs_efi_log_item		*efip;
3336	struct xfs_efi_log_format	*efi_formatp;
3337
3338	efi_formatp = item->ri_buf[0].i_addr;
3339
3340	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3341	error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3342	if (error) {
3343		xfs_efi_item_free(efip);
3344		return error;
3345	}
3346	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3347
3348	spin_lock(&log->l_ailp->ail_lock);
3349	/*
3350	 * The EFI has two references. One for the EFD and one for EFI to ensure
3351	 * it makes it into the AIL. Insert the EFI into the AIL directly and
3352	 * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3353	 * AIL lock.
3354	 */
3355	xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3356	xfs_efi_release(efip);
3357	return 0;
3358}
3359
3360
3361/*
3362 * This routine is called when an EFD format structure is found in a committed
3363 * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3364 * was still in the log. To do this it searches the AIL for the EFI with an id
3365 * equal to that in the EFD format structure. If we find it we drop the EFD
3366 * reference, which removes the EFI from the AIL and frees it.
3367 */
3368STATIC int
3369xlog_recover_efd_pass2(
3370	struct xlog			*log,
3371	struct xlog_recover_item	*item)
3372{
3373	xfs_efd_log_format_t	*efd_formatp;
3374	xfs_efi_log_item_t	*efip = NULL;
3375	struct xfs_log_item	*lip;
3376	uint64_t		efi_id;
3377	struct xfs_ail_cursor	cur;
3378	struct xfs_ail		*ailp = log->l_ailp;
3379
3380	efd_formatp = item->ri_buf[0].i_addr;
3381	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3382		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3383	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3384		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3385	efi_id = efd_formatp->efd_efi_id;
3386
3387	/*
3388	 * Search for the EFI with the id in the EFD format structure in the
3389	 * AIL.
3390	 */
3391	spin_lock(&ailp->ail_lock);
3392	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3393	while (lip != NULL) {
3394		if (lip->li_type == XFS_LI_EFI) {
3395			efip = (xfs_efi_log_item_t *)lip;
3396			if (efip->efi_format.efi_id == efi_id) {
3397				/*
3398				 * Drop the EFD reference to the EFI. This
3399				 * removes the EFI from the AIL and frees it.
3400				 */
3401				spin_unlock(&ailp->ail_lock);
3402				xfs_efi_release(efip);
3403				spin_lock(&ailp->ail_lock);
3404				break;
3405			}
3406		}
3407		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3408	}
3409
3410	xfs_trans_ail_cursor_done(&cur);
3411	spin_unlock(&ailp->ail_lock);
3412
3413	return 0;
3414}
3415
3416/*
3417 * This routine is called to create an in-core extent rmap update
3418 * item from the rui format structure which was logged on disk.
3419 * It allocates an in-core rui, copies the extents from the format
3420 * structure into it, and adds the rui to the AIL with the given
3421 * LSN.
3422 */
3423STATIC int
3424xlog_recover_rui_pass2(
3425	struct xlog			*log,
3426	struct xlog_recover_item	*item,
3427	xfs_lsn_t			lsn)
3428{
3429	int				error;
3430	struct xfs_mount		*mp = log->l_mp;
3431	struct xfs_rui_log_item		*ruip;
3432	struct xfs_rui_log_format	*rui_formatp;
3433
3434	rui_formatp = item->ri_buf[0].i_addr;
3435
3436	ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
3437	error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
3438	if (error) {
3439		xfs_rui_item_free(ruip);
3440		return error;
3441	}
3442	atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
3443
3444	spin_lock(&log->l_ailp->ail_lock);
3445	/*
3446	 * The RUI has two references. One for the RUD and one for RUI to ensure
3447	 * it makes it into the AIL. Insert the RUI into the AIL directly and
3448	 * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3449	 * AIL lock.
3450	 */
3451	xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
3452	xfs_rui_release(ruip);
3453	return 0;
3454}
3455
3456
3457/*
3458 * This routine is called when an RUD format structure is found in a committed
3459 * transaction in the log. Its purpose is to cancel the corresponding RUI if it
3460 * was still in the log. To do this it searches the AIL for the RUI with an id
3461 * equal to that in the RUD format structure. If we find it we drop the RUD
3462 * reference, which removes the RUI from the AIL and frees it.
3463 */
3464STATIC int
3465xlog_recover_rud_pass2(
3466	struct xlog			*log,
3467	struct xlog_recover_item	*item)
3468{
3469	struct xfs_rud_log_format	*rud_formatp;
3470	struct xfs_rui_log_item		*ruip = NULL;
3471	struct xfs_log_item		*lip;
3472	uint64_t			rui_id;
3473	struct xfs_ail_cursor		cur;
3474	struct xfs_ail			*ailp = log->l_ailp;
3475
3476	rud_formatp = item->ri_buf[0].i_addr;
3477	ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
3478	rui_id = rud_formatp->rud_rui_id;
3479
3480	/*
3481	 * Search for the RUI with the id in the RUD format structure in the
3482	 * AIL.
3483	 */
3484	spin_lock(&ailp->ail_lock);
3485	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3486	while (lip != NULL) {
3487		if (lip->li_type == XFS_LI_RUI) {
3488			ruip = (struct xfs_rui_log_item *)lip;
3489			if (ruip->rui_format.rui_id == rui_id) {
3490				/*
3491				 * Drop the RUD reference to the RUI. This
3492				 * removes the RUI from the AIL and frees it.
3493				 */
3494				spin_unlock(&ailp->ail_lock);
3495				xfs_rui_release(ruip);
3496				spin_lock(&ailp->ail_lock);
3497				break;
3498			}
3499		}
3500		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3501	}
3502
3503	xfs_trans_ail_cursor_done(&cur);
3504	spin_unlock(&ailp->ail_lock);
3505
3506	return 0;
3507}
3508
3509/*
3510 * Copy an CUI format buffer from the given buf, and into the destination
3511 * CUI format structure.  The CUI/CUD items were designed not to need any
3512 * special alignment handling.
3513 */
3514static int
3515xfs_cui_copy_format(
3516	struct xfs_log_iovec		*buf,
3517	struct xfs_cui_log_format	*dst_cui_fmt)
3518{
3519	struct xfs_cui_log_format	*src_cui_fmt;
3520	uint				len;
3521
3522	src_cui_fmt = buf->i_addr;
3523	len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
3524
3525	if (buf->i_len == len) {
3526		memcpy(dst_cui_fmt, src_cui_fmt, len);
3527		return 0;
3528	}
3529	XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
3530	return -EFSCORRUPTED;
3531}
3532
3533/*
3534 * This routine is called to create an in-core extent refcount update
3535 * item from the cui format structure which was logged on disk.
3536 * It allocates an in-core cui, copies the extents from the format
3537 * structure into it, and adds the cui to the AIL with the given
3538 * LSN.
3539 */
3540STATIC int
3541xlog_recover_cui_pass2(
3542	struct xlog			*log,
3543	struct xlog_recover_item	*item,
3544	xfs_lsn_t			lsn)
3545{
3546	int				error;
3547	struct xfs_mount		*mp = log->l_mp;
3548	struct xfs_cui_log_item		*cuip;
3549	struct xfs_cui_log_format	*cui_formatp;
3550
3551	cui_formatp = item->ri_buf[0].i_addr;
3552
3553	cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
3554	error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
3555	if (error) {
3556		xfs_cui_item_free(cuip);
3557		return error;
3558	}
3559	atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
3560
3561	spin_lock(&log->l_ailp->ail_lock);
3562	/*
3563	 * The CUI has two references. One for the CUD and one for CUI to ensure
3564	 * it makes it into the AIL. Insert the CUI into the AIL directly and
3565	 * drop the CUI reference. Note that xfs_trans_ail_update() drops the
3566	 * AIL lock.
3567	 */
3568	xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
3569	xfs_cui_release(cuip);
3570	return 0;
3571}
3572
3573
3574/*
3575 * This routine is called when an CUD format structure is found in a committed
3576 * transaction in the log. Its purpose is to cancel the corresponding CUI if it
3577 * was still in the log. To do this it searches the AIL for the CUI with an id
3578 * equal to that in the CUD format structure. If we find it we drop the CUD
3579 * reference, which removes the CUI from the AIL and frees it.
3580 */
3581STATIC int
3582xlog_recover_cud_pass2(
3583	struct xlog			*log,
3584	struct xlog_recover_item	*item)
3585{
3586	struct xfs_cud_log_format	*cud_formatp;
3587	struct xfs_cui_log_item		*cuip = NULL;
3588	struct xfs_log_item		*lip;
3589	uint64_t			cui_id;
3590	struct xfs_ail_cursor		cur;
3591	struct xfs_ail			*ailp = log->l_ailp;
3592
3593	cud_formatp = item->ri_buf[0].i_addr;
3594	if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
3595		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
3596		return -EFSCORRUPTED;
3597	}
3598	cui_id = cud_formatp->cud_cui_id;
3599
3600	/*
3601	 * Search for the CUI with the id in the CUD format structure in the
3602	 * AIL.
3603	 */
3604	spin_lock(&ailp->ail_lock);
3605	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3606	while (lip != NULL) {
3607		if (lip->li_type == XFS_LI_CUI) {
3608			cuip = (struct xfs_cui_log_item *)lip;
3609			if (cuip->cui_format.cui_id == cui_id) {
3610				/*
3611				 * Drop the CUD reference to the CUI. This
3612				 * removes the CUI from the AIL and frees it.
3613				 */
3614				spin_unlock(&ailp->ail_lock);
3615				xfs_cui_release(cuip);
3616				spin_lock(&ailp->ail_lock);
3617				break;
3618			}
3619		}
3620		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3621	}
3622
3623	xfs_trans_ail_cursor_done(&cur);
3624	spin_unlock(&ailp->ail_lock);
3625
3626	return 0;
3627}
3628
3629/*
3630 * Copy an BUI format buffer from the given buf, and into the destination
3631 * BUI format structure.  The BUI/BUD items were designed not to need any
3632 * special alignment handling.
3633 */
3634static int
3635xfs_bui_copy_format(
3636	struct xfs_log_iovec		*buf,
3637	struct xfs_bui_log_format	*dst_bui_fmt)
3638{
3639	struct xfs_bui_log_format	*src_bui_fmt;
3640	uint				len;
3641
3642	src_bui_fmt = buf->i_addr;
3643	len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
3644
3645	if (buf->i_len == len) {
3646		memcpy(dst_bui_fmt, src_bui_fmt, len);
3647		return 0;
3648	}
3649	XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
3650	return -EFSCORRUPTED;
3651}
3652
3653/*
3654 * This routine is called to create an in-core extent bmap update
3655 * item from the bui format structure which was logged on disk.
3656 * It allocates an in-core bui, copies the extents from the format
3657 * structure into it, and adds the bui to the AIL with the given
3658 * LSN.
3659 */
3660STATIC int
3661xlog_recover_bui_pass2(
3662	struct xlog			*log,
3663	struct xlog_recover_item	*item,
3664	xfs_lsn_t			lsn)
3665{
3666	int				error;
3667	struct xfs_mount		*mp = log->l_mp;
3668	struct xfs_bui_log_item		*buip;
3669	struct xfs_bui_log_format	*bui_formatp;
3670
3671	bui_formatp = item->ri_buf[0].i_addr;
3672
3673	if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
3674		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
3675		return -EFSCORRUPTED;
3676	}
3677	buip = xfs_bui_init(mp);
3678	error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
3679	if (error) {
3680		xfs_bui_item_free(buip);
3681		return error;
3682	}
3683	atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
3684
3685	spin_lock(&log->l_ailp->ail_lock);
3686	/*
3687	 * The RUI has two references. One for the RUD and one for RUI to ensure
3688	 * it makes it into the AIL. Insert the RUI into the AIL directly and
3689	 * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3690	 * AIL lock.
3691	 */
3692	xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
3693	xfs_bui_release(buip);
3694	return 0;
3695}
3696
3697
3698/*
3699 * This routine is called when an BUD format structure is found in a committed
3700 * transaction in the log. Its purpose is to cancel the corresponding BUI if it
3701 * was still in the log. To do this it searches the AIL for the BUI with an id
3702 * equal to that in the BUD format structure. If we find it we drop the BUD
3703 * reference, which removes the BUI from the AIL and frees it.
3704 */
3705STATIC int
3706xlog_recover_bud_pass2(
3707	struct xlog			*log,
3708	struct xlog_recover_item	*item)
3709{
3710	struct xfs_bud_log_format	*bud_formatp;
3711	struct xfs_bui_log_item		*buip = NULL;
3712	struct xfs_log_item		*lip;
3713	uint64_t			bui_id;
3714	struct xfs_ail_cursor		cur;
3715	struct xfs_ail			*ailp = log->l_ailp;
3716
3717	bud_formatp = item->ri_buf[0].i_addr;
3718	if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
3719		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
3720		return -EFSCORRUPTED;
3721	}
3722	bui_id = bud_formatp->bud_bui_id;
3723
3724	/*
3725	 * Search for the BUI with the id in the BUD format structure in the
3726	 * AIL.
3727	 */
3728	spin_lock(&ailp->ail_lock);
3729	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3730	while (lip != NULL) {
3731		if (lip->li_type == XFS_LI_BUI) {
3732			buip = (struct xfs_bui_log_item *)lip;
3733			if (buip->bui_format.bui_id == bui_id) {
3734				/*
3735				 * Drop the BUD reference to the BUI. This
3736				 * removes the BUI from the AIL and frees it.
3737				 */
3738				spin_unlock(&ailp->ail_lock);
3739				xfs_bui_release(buip);
3740				spin_lock(&ailp->ail_lock);
3741				break;
3742			}
3743		}
3744		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3745	}
3746
3747	xfs_trans_ail_cursor_done(&cur);
3748	spin_unlock(&ailp->ail_lock);
3749
3750	return 0;
3751}
3752
3753/*
3754 * This routine is called when an inode create format structure is found in a
3755 * committed transaction in the log.  It's purpose is to initialise the inodes
3756 * being allocated on disk. This requires us to get inode cluster buffers that
3757 * match the range to be initialised, stamped with inode templates and written
3758 * by delayed write so that subsequent modifications will hit the cached buffer
3759 * and only need writing out at the end of recovery.
3760 */
3761STATIC int
3762xlog_recover_do_icreate_pass2(
3763	struct xlog		*log,
3764	struct list_head	*buffer_list,
3765	xlog_recover_item_t	*item)
3766{
3767	struct xfs_mount	*mp = log->l_mp;
3768	struct xfs_icreate_log	*icl;
3769	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
3770	xfs_agnumber_t		agno;
3771	xfs_agblock_t		agbno;
3772	unsigned int		count;
3773	unsigned int		isize;
3774	xfs_agblock_t		length;
3775	int			bb_per_cluster;
3776	int			cancel_count;
3777	int			nbufs;
3778	int			i;
3779
3780	icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3781	if (icl->icl_type != XFS_LI_ICREATE) {
3782		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3783		return -EINVAL;
3784	}
3785
3786	if (icl->icl_size != 1) {
3787		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3788		return -EINVAL;
3789	}
3790
3791	agno = be32_to_cpu(icl->icl_ag);
3792	if (agno >= mp->m_sb.sb_agcount) {
3793		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3794		return -EINVAL;
3795	}
3796	agbno = be32_to_cpu(icl->icl_agbno);
3797	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3798		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3799		return -EINVAL;
3800	}
3801	isize = be32_to_cpu(icl->icl_isize);
3802	if (isize != mp->m_sb.sb_inodesize) {
3803		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3804		return -EINVAL;
3805	}
3806	count = be32_to_cpu(icl->icl_count);
3807	if (!count) {
3808		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3809		return -EINVAL;
3810	}
3811	length = be32_to_cpu(icl->icl_length);
3812	if (!length || length >= mp->m_sb.sb_agblocks) {
3813		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3814		return -EINVAL;
3815	}
3816
3817	/*
3818	 * The inode chunk is either full or sparse and we only support
3819	 * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
3820	 */
3821	if (length != igeo->ialloc_blks &&
3822	    length != igeo->ialloc_min_blks) {
3823		xfs_warn(log->l_mp,
3824			 "%s: unsupported chunk length", __FUNCTION__);
3825		return -EINVAL;
3826	}
3827
3828	/* verify inode count is consistent with extent length */
3829	if ((count >> mp->m_sb.sb_inopblog) != length) {
3830		xfs_warn(log->l_mp,
3831			 "%s: inconsistent inode count and chunk length",
3832			 __FUNCTION__);
3833		return -EINVAL;
3834	}
3835
3836	/*
3837	 * The icreate transaction can cover multiple cluster buffers and these
3838	 * buffers could have been freed and reused. Check the individual
3839	 * buffers for cancellation so we don't overwrite anything written after
3840	 * a cancellation.
3841	 */
3842	bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
3843	nbufs = length / igeo->blocks_per_cluster;
3844	for (i = 0, cancel_count = 0; i < nbufs; i++) {
3845		xfs_daddr_t	daddr;
3846
3847		daddr = XFS_AGB_TO_DADDR(mp, agno,
3848				agbno + i * igeo->blocks_per_cluster);
3849		if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3850			cancel_count++;
3851	}
3852
3853	/*
3854	 * We currently only use icreate for a single allocation at a time. This
3855	 * means we should expect either all or none of the buffers to be
3856	 * cancelled. Be conservative and skip replay if at least one buffer is
3857	 * cancelled, but warn the user that something is awry if the buffers
3858	 * are not consistent.
3859	 *
3860	 * XXX: This must be refined to only skip cancelled clusters once we use
3861	 * icreate for multiple chunk allocations.
3862	 */
3863	ASSERT(!cancel_count || cancel_count == nbufs);
3864	if (cancel_count) {
3865		if (cancel_count != nbufs)
3866			xfs_warn(mp,
3867	"WARNING: partial inode chunk cancellation, skipped icreate.");
3868		trace_xfs_log_recover_icreate_cancel(log, icl);
3869		return 0;
3870	}
3871
3872	trace_xfs_log_recover_icreate_recover(log, icl);
3873	return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3874				     length, be32_to_cpu(icl->icl_gen));
3875}
3876
3877STATIC void
3878xlog_recover_buffer_ra_pass2(
3879	struct xlog                     *log,
3880	struct xlog_recover_item        *item)
3881{
3882	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;
3883	struct xfs_mount		*mp = log->l_mp;
3884
3885	if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3886			buf_f->blf_len, buf_f->blf_flags)) {
3887		return;
3888	}
3889
3890	xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3891				buf_f->blf_len, NULL);
3892}
3893
3894STATIC void
3895xlog_recover_inode_ra_pass2(
3896	struct xlog                     *log,
3897	struct xlog_recover_item        *item)
3898{
3899	struct xfs_inode_log_format	ilf_buf;
3900	struct xfs_inode_log_format	*ilfp;
3901	struct xfs_mount		*mp = log->l_mp;
3902	int			error;
3903
3904	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3905		ilfp = item->ri_buf[0].i_addr;
3906	} else {
3907		ilfp = &ilf_buf;
3908		memset(ilfp, 0, sizeof(*ilfp));
3909		error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3910		if (error)
3911			return;
3912	}
3913
3914	if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3915		return;
3916
3917	xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3918				ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3919}
3920
3921STATIC void
3922xlog_recover_dquot_ra_pass2(
3923	struct xlog			*log,
3924	struct xlog_recover_item	*item)
3925{
3926	struct xfs_mount	*mp = log->l_mp;
3927	struct xfs_disk_dquot	*recddq;
3928	struct xfs_dq_logformat	*dq_f;
3929	uint			type;
3930	int			len;
3931
3932
3933	if (mp->m_qflags == 0)
3934		return;
3935
3936	recddq = item->ri_buf[1].i_addr;
3937	if (recddq == NULL)
3938		return;
3939	if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3940		return;
3941
3942	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3943	ASSERT(type);
3944	if (log->l_quotaoffs_flag & type)
3945		return;
3946
3947	dq_f = item->ri_buf[0].i_addr;
3948	ASSERT(dq_f);
3949	ASSERT(dq_f->qlf_len == 1);
3950
3951	len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
3952	if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
3953		return;
3954
3955	xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
3956			  &xfs_dquot_buf_ra_ops);
3957}
3958
3959STATIC void
3960xlog_recover_ra_pass2(
3961	struct xlog			*log,
3962	struct xlog_recover_item	*item)
3963{
3964	switch (ITEM_TYPE(item)) {
3965	case XFS_LI_BUF:
3966		xlog_recover_buffer_ra_pass2(log, item);
3967		break;
3968	case XFS_LI_INODE:
3969		xlog_recover_inode_ra_pass2(log, item);
3970		break;
3971	case XFS_LI_DQUOT:
3972		xlog_recover_dquot_ra_pass2(log, item);
3973		break;
3974	case XFS_LI_EFI:
3975	case XFS_LI_EFD:
3976	case XFS_LI_QUOTAOFF:
3977	case XFS_LI_RUI:
3978	case XFS_LI_RUD:
3979	case XFS_LI_CUI:
3980	case XFS_LI_CUD:
3981	case XFS_LI_BUI:
3982	case XFS_LI_BUD:
3983	default:
3984		break;
3985	}
3986}
3987
3988STATIC int
3989xlog_recover_commit_pass1(
3990	struct xlog			*log,
3991	struct xlog_recover		*trans,
3992	struct xlog_recover_item	*item)
3993{
3994	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
3995
3996	switch (ITEM_TYPE(item)) {
3997	case XFS_LI_BUF:
3998		return xlog_recover_buffer_pass1(log, item);
3999	case XFS_LI_QUOTAOFF:
4000		return xlog_recover_quotaoff_pass1(log, item);
4001	case XFS_LI_INODE:
4002	case XFS_LI_EFI:
4003	case XFS_LI_EFD:
4004	case XFS_LI_DQUOT:
4005	case XFS_LI_ICREATE:
4006	case XFS_LI_RUI:
4007	case XFS_LI_RUD:
4008	case XFS_LI_CUI:
4009	case XFS_LI_CUD:
4010	case XFS_LI_BUI:
4011	case XFS_LI_BUD:
4012		/* nothing to do in pass 1 */
4013		return 0;
4014	default:
4015		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4016			__func__, ITEM_TYPE(item));
4017		ASSERT(0);
4018		return -EFSCORRUPTED;
4019	}
4020}
4021
4022STATIC int
4023xlog_recover_commit_pass2(
4024	struct xlog			*log,
4025	struct xlog_recover		*trans,
4026	struct list_head		*buffer_list,
4027	struct xlog_recover_item	*item)
4028{
4029	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
4030
4031	switch (ITEM_TYPE(item)) {
4032	case XFS_LI_BUF:
4033		return xlog_recover_buffer_pass2(log, buffer_list, item,
4034						 trans->r_lsn);
4035	case XFS_LI_INODE:
4036		return xlog_recover_inode_pass2(log, buffer_list, item,
4037						 trans->r_lsn);
4038	case XFS_LI_EFI:
4039		return xlog_recover_efi_pass2(log, item, trans->r_lsn);
4040	case XFS_LI_EFD:
4041		return xlog_recover_efd_pass2(log, item);
4042	case XFS_LI_RUI:
4043		return xlog_recover_rui_pass2(log, item, trans->r_lsn);
4044	case XFS_LI_RUD:
4045		return xlog_recover_rud_pass2(log, item);
4046	case XFS_LI_CUI:
4047		return xlog_recover_cui_pass2(log, item, trans->r_lsn);
4048	case XFS_LI_CUD:
4049		return xlog_recover_cud_pass2(log, item);
4050	case XFS_LI_BUI:
4051		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
4052	case XFS_LI_BUD:
4053		return xlog_recover_bud_pass2(log, item);
4054	case XFS_LI_DQUOT:
4055		return xlog_recover_dquot_pass2(log, buffer_list, item,
4056						trans->r_lsn);
4057	case XFS_LI_ICREATE:
4058		return xlog_recover_do_icreate_pass2(log, buffer_list, item);
4059	case XFS_LI_QUOTAOFF:
4060		/* nothing to do in pass2 */
4061		return 0;
4062	default:
4063		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4064			__func__, ITEM_TYPE(item));
4065		ASSERT(0);
4066		return -EFSCORRUPTED;
4067	}
4068}
4069
4070STATIC int
4071xlog_recover_items_pass2(
4072	struct xlog                     *log,
4073	struct xlog_recover             *trans,
4074	struct list_head                *buffer_list,
4075	struct list_head                *item_list)
4076{
4077	struct xlog_recover_item	*item;
4078	int				error = 0;
4079
4080	list_for_each_entry(item, item_list, ri_list) {
4081		error = xlog_recover_commit_pass2(log, trans,
4082					  buffer_list, item);
4083		if (error)
4084			return error;
4085	}
4086
4087	return error;
4088}
4089
4090/*
4091 * Perform the transaction.
4092 *
4093 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
4094 * EFIs and EFDs get queued up by adding entries into the AIL for them.
4095 */
4096STATIC int
4097xlog_recover_commit_trans(
4098	struct xlog		*log,
4099	struct xlog_recover	*trans,
4100	int			pass,
4101	struct list_head	*buffer_list)
4102{
4103	int				error = 0;
4104	int				items_queued = 0;
4105	struct xlog_recover_item	*item;
4106	struct xlog_recover_item	*next;
4107	LIST_HEAD			(ra_list);
4108	LIST_HEAD			(done_list);
4109
4110	#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
4111
4112	hlist_del_init(&trans->r_list);
4113
4114	error = xlog_recover_reorder_trans(log, trans, pass);
4115	if (error)
4116		return error;
4117
4118	list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
4119		switch (pass) {
4120		case XLOG_RECOVER_PASS1:
4121			error = xlog_recover_commit_pass1(log, trans, item);
4122			break;
4123		case XLOG_RECOVER_PASS2:
4124			xlog_recover_ra_pass2(log, item);
4125			list_move_tail(&item->ri_list, &ra_list);
4126			items_queued++;
4127			if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
4128				error = xlog_recover_items_pass2(log, trans,
4129						buffer_list, &ra_list);
4130				list_splice_tail_init(&ra_list, &done_list);
4131				items_queued = 0;
4132			}
4133
4134			break;
4135		default:
4136			ASSERT(0);
4137		}
4138
4139		if (error)
4140			goto out;
4141	}
4142
4143out:
4144	if (!list_empty(&ra_list)) {
4145		if (!error)
4146			error = xlog_recover_items_pass2(log, trans,
4147					buffer_list, &ra_list);
4148		list_splice_tail_init(&ra_list, &done_list);
4149	}
4150
4151	if (!list_empty(&done_list))
4152		list_splice_init(&done_list, &trans->r_itemq);
4153
4154	return error;
4155}
4156
4157STATIC void
4158xlog_recover_add_item(
4159	struct list_head	*head)
4160{
4161	xlog_recover_item_t	*item;
4162
4163	item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
4164	INIT_LIST_HEAD(&item->ri_list);
4165	list_add_tail(&item->ri_list, head);
4166}
4167
4168STATIC int
4169xlog_recover_add_to_cont_trans(
4170	struct xlog		*log,
4171	struct xlog_recover	*trans,
4172	char			*dp,
4173	int			len)
4174{
4175	xlog_recover_item_t	*item;
4176	char			*ptr, *old_ptr;
4177	int			old_len;
4178
4179	/*
4180	 * If the transaction is empty, the header was split across this and the
4181	 * previous record. Copy the rest of the header.
4182	 */
4183	if (list_empty(&trans->r_itemq)) {
4184		ASSERT(len <= sizeof(struct xfs_trans_header));
4185		if (len > sizeof(struct xfs_trans_header)) {
4186			xfs_warn(log->l_mp, "%s: bad header length", __func__);
4187			return -EFSCORRUPTED;
4188		}
4189
4190		xlog_recover_add_item(&trans->r_itemq);
4191		ptr = (char *)&trans->r_theader +
4192				sizeof(struct xfs_trans_header) - len;
4193		memcpy(ptr, dp, len);
4194		return 0;
4195	}
4196
4197	/* take the tail entry */
4198	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
4199
4200	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4201	old_len = item->ri_buf[item->ri_cnt-1].i_len;
4202
4203	ptr = kmem_realloc(old_ptr, len + old_len, 0);
4204	memcpy(&ptr[old_len], dp, len);
4205	item->ri_buf[item->ri_cnt-1].i_len += len;
4206	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
4207	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
4208	return 0;
4209}
4210
/*
 * Add the start of a new region (@dp, @len) to a transaction.
 *
 * The next region to add is the start of a new region.  It could be
 * a whole region or it could be the first part of a new region.  Because
 * of this, the assumption here is that the type and size fields of all
 * format structures fit into the first 32 bits of the structure.
 *
 * This works because all regions must be 32 bit aligned.  Therefore, we
 * either have both fields or we have neither field.  In the case we have
 * neither field, the data part of the region is zero length.  We only have
 * a log_op_header and can throw away the header since a new one will appear
 * later.  If we have at least 4 bytes, then we can determine how many regions
 * will appear in the current log item.
 *
 * Returns 0 on success (including the zero-length case) or -EFSCORRUPTED
 * when the transaction header magic/length is bad, the region count read
 * from the first region is out of range, or the item already holds as many
 * regions as it declared.
 */
STATIC int
xlog_recover_add_to_trans(
	struct xlog		*log,
	struct xlog_recover	*trans,
	char			*dp,
	int			len)
{
	struct xfs_inode_log_format	*in_f;			/* any will do */
	xlog_recover_item_t	*item;
	char			*ptr;

	/* Zero-length data: nothing to record. */
	if (!len)
		return 0;

	/* No items yet, so this data must be the transaction header. */
	if (list_empty(&trans->r_itemq)) {
		/* we need to catch log corruptions here */
		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
			xfs_warn(log->l_mp, "%s: bad header magic number",
				__func__);
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (len > sizeof(struct xfs_trans_header)) {
			xfs_warn(log->l_mp, "%s: bad header length", __func__);
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * The transaction header can be arbitrarily split across op
		 * records. If we don't have the whole thing here, copy what we
		 * do have and handle the rest in the next record.
		 *
		 * The first (empty) item is only added once the header is
		 * complete, so a partial header leaves r_itemq empty.
		 */
		if (len == sizeof(struct xfs_trans_header))
			xlog_recover_add_item(&trans->r_itemq);
		memcpy(&trans->r_theader, dp, len);
		return 0;
	}

	/*
	 * Take a private copy of the region.  From here on every error path
	 * must free ptr; on success it is owned by the item's ri_buf[] slot.
	 */
	ptr = kmem_alloc(len, 0);
	memcpy(ptr, dp, len);
	in_f = (struct xfs_inode_log_format *)ptr;

	/* take the tail entry */
	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
	if (item->ri_total != 0 &&
	     item->ri_total == item->ri_cnt) {
		/* tail item is in use, get a new one */
		xlog_recover_add_item(&trans->r_itemq);
		item = list_entry(trans->r_itemq.prev,
					xlog_recover_item_t, ri_list);
	}

	if (item->ri_total == 0) {		/* first region to be added */
		/*
		 * The first region carries the number of regions in this
		 * item; reject counts outside 1..XLOG_MAX_REGIONS_IN_ITEM.
		 */
		if (in_f->ilf_size == 0 ||
		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
			xfs_warn(log->l_mp,
		"bad number of regions (%d) in inode log format",
				  in_f->ilf_size);
			ASSERT(0);
			kmem_free(ptr);
			return -EFSCORRUPTED;
		}

		/* Size the (zeroed) region vector from the declared count. */
		item->ri_total = in_f->ilf_size;
		item->ri_buf =
			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
				    0);
	}

	/* Guard the ri_buf[] store below against running past ri_total. */
	if (item->ri_total <= item->ri_cnt) {
		xfs_warn(log->l_mp,
	"log item region count (%d) overflowed size (%d)",
				item->ri_cnt, item->ri_total);
		ASSERT(0);
		kmem_free(ptr);
		return -EFSCORRUPTED;
	}

	/* Description region is ri_buf[0] */
	item->ri_buf[item->ri_cnt].i_addr = ptr;
	item->ri_buf[item->ri_cnt].i_len  = len;
	item->ri_cnt++;
	trace_xfs_log_recover_item_add(log, trans, item, 0);
	return 0;
}
4310
4311/*
4312 * Free up any resources allocated by the transaction
4313 *
4314 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
4315 */
4316STATIC void
4317xlog_recover_free_trans(
4318	struct xlog_recover	*trans)
4319{
4320	xlog_recover_item_t	*item, *n;
4321	int			i;
4322
4323	hlist_del_init(&trans->r_list);
4324
4325	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
4326		/* Free the regions in the item. */
4327		list_del(&item->ri_list);
4328		for (i = 0; i < item->ri_cnt; i++)
4329			kmem_free(item->ri_buf[i].i_addr);
4330		/* Free the item itself */
4331		kmem_free(item->ri_buf);
4332		kmem_free(item);
4333	}
4334	/* Free the transaction recover structure */
4335	kmem_free(trans);
4336}
4337
4338/*
4339 * On error or completion, trans is freed.
4340 */
4341STATIC int
4342xlog_recovery_process_trans(
4343	struct xlog		*log,
4344	struct xlog_recover	*trans,
4345	char			*dp,
4346	unsigned int		len,
4347	unsigned int		flags,
4348	int			pass,
4349	struct list_head	*buffer_list)
4350{
4351	int			error = 0;
4352	bool			freeit = false;
4353
4354	/* mask off ophdr transaction container flags */
4355	flags &= ~XLOG_END_TRANS;
4356	if (flags & XLOG_WAS_CONT_TRANS)
4357		flags &= ~XLOG_CONTINUE_TRANS;
4358
4359	/*
4360	 * Callees must not free the trans structure. We'll decide if we need to
4361	 * free it or not based on the operation being done and it's result.
4362	 */
4363	switch (flags) {
4364	/* expected flag values */
4365	case 0:
4366	case XLOG_CONTINUE_TRANS:
4367		error = xlog_recover_add_to_trans(log, trans, dp, len);
4368		break;
4369	case XLOG_WAS_CONT_TRANS:
4370		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
4371		break;
4372	case XLOG_COMMIT_TRANS:
4373		error = xlog_recover_commit_trans(log, trans, pass,
4374						  buffer_list);
4375		/* success or fail, we are now done with this transaction. */
4376		freeit = true;
4377		break;
4378
4379	/* unexpected flag values */
4380	case XLOG_UNMOUNT_TRANS:
4381		/* just skip trans */
4382		xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
4383		freeit = true;
4384		break;
4385	case XLOG_START_TRANS:
4386	default:
4387		xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
4388		ASSERT(0);
4389		error = -EFSCORRUPTED;
4390		break;
4391	}
4392	if (error || freeit)
4393		xlog_recover_free_trans(trans);
4394	return error;
4395}
4396
4397/*
4398 * Lookup the transaction recovery structure associated with the ID in the
4399 * current ophdr. If the transaction doesn't exist and the start flag is set in
4400 * the ophdr, then allocate a new transaction for future ID matches to find.
4401 * Either way, return what we found during the lookup - an existing transaction
4402 * or nothing.
4403 */
4404STATIC struct xlog_recover *
4405xlog_recover_ophdr_to_trans(
4406	struct hlist_head	rhash[],
4407	struct xlog_rec_header	*rhead,
4408	struct xlog_op_header	*ohead)
4409{
4410	struct xlog_recover	*trans;
4411	xlog_tid_t		tid;
4412	struct hlist_head	*rhp;
4413
4414	tid = be32_to_cpu(ohead->oh_tid);
4415	rhp = &rhash[XLOG_RHASH(tid)];
4416	hlist_for_each_entry(trans, rhp, r_list) {
4417		if (trans->r_log_tid == tid)
4418			return trans;
4419	}
4420
4421	/*
4422	 * skip over non-start transaction headers - we could be
4423	 * processing slack space before the next transaction starts
4424	 */
4425	if (!(ohead->oh_flags & XLOG_START_TRANS))
4426		return NULL;
4427
4428	ASSERT(be32_to_cpu(ohead->oh_len) == 0);
4429
4430	/*
4431	 * This is a new transaction so allocate a new recovery container to
4432	 * hold the recovery ops that will follow.
4433	 */
4434	trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
4435	trans->r_log_tid = tid;
4436	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4437	INIT_LIST_HEAD(&trans->r_itemq);
4438	INIT_HLIST_NODE(&trans->r_list);
4439	hlist_add_head(&trans->r_list, rhp);
4440
4441	/*
4442	 * Nothing more to do for this ophdr. Items to be added to this new
4443	 * transaction will be in subsequent ophdr containers.
4444	 */
4445	return NULL;
4446}
4447
4448STATIC int
4449xlog_recover_process_ophdr(
4450	struct xlog		*log,
4451	struct hlist_head	rhash[],
4452	struct xlog_rec_header	*rhead,
4453	struct xlog_op_header	*ohead,
4454	char			*dp,
4455	char			*end,
4456	int			pass,
4457	struct list_head	*buffer_list)
4458{
4459	struct xlog_recover	*trans;
4460	unsigned int		len;
4461	int			error;
4462
4463	/* Do we understand who wrote this op? */
4464	if (ohead->oh_clientid != XFS_TRANSACTION &&
4465	    ohead->oh_clientid != XFS_LOG) {
4466		xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
4467			__func__, ohead->oh_clientid);
4468		ASSERT(0);
4469		return -EFSCORRUPTED;
4470	}
4471
4472	/*
4473	 * Check the ophdr contains all the data it is supposed to contain.
4474	 */
4475	len = be32_to_cpu(ohead->oh_len);
4476	if (dp + len > end) {
4477		xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
4478		WARN_ON(1);
4479		return -EFSCORRUPTED;
4480	}
4481
4482	trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
4483	if (!trans) {
4484		/* nothing to do, so skip over this ophdr */
4485		return 0;
4486	}
4487
4488	/*
4489	 * The recovered buffer queue is drained only once we know that all
4490	 * recovery items for the current LSN have been processed. This is
4491	 * required because:
4492	 *
4493	 * - Buffer write submission updates the metadata LSN of the buffer.
4494	 * - Log recovery skips items with a metadata LSN >= the current LSN of
4495	 *   the recovery item.
4496	 * - Separate recovery items against the same metadata buffer can share
4497	 *   a current LSN. I.e., consider that the LSN of a recovery item is
4498	 *   defined as the starting LSN of the first record in which its
4499	 *   transaction appears, that a record can hold multiple transactions,
4500	 *   and/or that a transaction can span multiple records.
4501	 *
4502	 * In other words, we are allowed to submit a buffer from log recovery
4503	 * once per current LSN. Otherwise, we may incorrectly skip recovery
4504	 * items and cause corruption.
4505	 *
4506	 * We don't know up front whether buffers are updated multiple times per
4507	 * LSN. Therefore, track the current LSN of each commit log record as it
4508	 * is processed and drain the queue when it changes. Use commit records
4509	 * because they are ordered correctly by the logging code.
4510	 */
4511	if (log->l_recovery_lsn != trans->r_lsn &&
4512	    ohead->oh_flags & XLOG_COMMIT_TRANS) {
4513		error = xfs_buf_delwri_submit(buffer_list);
4514		if (error)
4515			return error;
4516		log->l_recovery_lsn = trans->r_lsn;
4517	}
4518
4519	return xlog_recovery_process_trans(log, trans, dp, len,
4520					   ohead->oh_flags, pass, buffer_list);
4521}
4522
4523/*
4524 * There are two valid states of the r_state field.  0 indicates that the
4525 * transaction structure is in a normal state.  We have either seen the
4526 * start of the transaction or the last operation we added was not a partial
4527 * operation.  If the last operation we added to the transaction was a
4528 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
4529 *
4530 * NOTE: skip LRs with 0 data length.
4531 */
4532STATIC int
4533xlog_recover_process_data(
4534	struct xlog		*log,
4535	struct hlist_head	rhash[],
4536	struct xlog_rec_header	*rhead,
4537	char			*dp,
4538	int			pass,
4539	struct list_head	*buffer_list)
4540{
4541	struct xlog_op_header	*ohead;
4542	char			*end;
4543	int			num_logops;
4544	int			error;
4545
4546	end = dp + be32_to_cpu(rhead->h_len);
4547	num_logops = be32_to_cpu(rhead->h_num_logops);
4548
4549	/* check the log format matches our own - else we can't recover */
4550	if (xlog_header_check_recover(log->l_mp, rhead))
4551		return -EIO;
4552
4553	trace_xfs_log_recover_record(log, rhead, pass);
4554	while ((dp < end) && num_logops) {
4555
4556		ohead = (struct xlog_op_header *)dp;
4557		dp += sizeof(*ohead);
4558		ASSERT(dp <= end);
4559
4560		/* errors will abort recovery */
4561		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
4562						   dp, end, pass, buffer_list);
4563		if (error)
4564			return error;
4565
4566		dp += be32_to_cpu(ohead->oh_len);
4567		num_logops--;
4568	}
4569	return 0;
4570}
4571
4572/* Recover the EFI if necessary. */
4573STATIC int
4574xlog_recover_process_efi(
4575	struct xfs_mount		*mp,
4576	struct xfs_ail			*ailp,
4577	struct xfs_log_item		*lip)
4578{
4579	struct xfs_efi_log_item		*efip;
4580	int				error;
4581
4582	/*
4583	 * Skip EFIs that we've already processed.
4584	 */
4585	efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4586	if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
4587		return 0;
4588
4589	spin_unlock(&ailp->ail_lock);
4590	error = xfs_efi_recover(mp, efip);
4591	spin_lock(&ailp->ail_lock);
4592
4593	return error;
4594}
4595
4596/* Release the EFI since we're cancelling everything. */
4597STATIC void
4598xlog_recover_cancel_efi(
4599	struct xfs_mount		*mp,
4600	struct xfs_ail			*ailp,
4601	struct xfs_log_item		*lip)
4602{
4603	struct xfs_efi_log_item		*efip;
4604
4605	efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4606
4607	spin_unlock(&ailp->ail_lock);
4608	xfs_efi_release(efip);
4609	spin_lock(&ailp->ail_lock);
4610}
4611
4612/* Recover the RUI if necessary. */
4613STATIC int
4614xlog_recover_process_rui(
4615	struct xfs_mount		*mp,
4616	struct xfs_ail			*ailp,
4617	struct xfs_log_item		*lip)
4618{
4619	struct xfs_rui_log_item		*ruip;
4620	int				error;
4621
4622	/*
4623	 * Skip RUIs that we've already processed.
4624	 */
4625	ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4626	if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
4627		return 0;
4628
4629	spin_unlock(&ailp->ail_lock);
4630	error = xfs_rui_recover(mp, ruip);
4631	spin_lock(&ailp->ail_lock);
4632
4633	return error;
4634}
4635
4636/* Release the RUI since we're cancelling everything. */
4637STATIC void
4638xlog_recover_cancel_rui(
4639	struct xfs_mount		*mp,
4640	struct xfs_ail			*ailp,
4641	struct xfs_log_item		*lip)
4642{
4643	struct xfs_rui_log_item		*ruip;
4644
4645	ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4646
4647	spin_unlock(&ailp->ail_lock);
4648	xfs_rui_release(ruip);
4649	spin_lock(&ailp->ail_lock);
4650}
4651
4652/* Recover the CUI if necessary. */
4653STATIC int
4654xlog_recover_process_cui(
4655	struct xfs_trans		*parent_tp,
4656	struct xfs_ail			*ailp,
4657	struct xfs_log_item		*lip)
4658{
4659	struct xfs_cui_log_item		*cuip;
4660	int				error;
4661
4662	/*
4663	 * Skip CUIs that we've already processed.
4664	 */
4665	cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4666	if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
4667		return 0;
4668
4669	spin_unlock(&ailp->ail_lock);
4670	error = xfs_cui_recover(parent_tp, cuip);
4671	spin_lock(&ailp->ail_lock);
4672
4673	return error;
4674}
4675
4676/* Release the CUI since we're cancelling everything. */
4677STATIC void
4678xlog_recover_cancel_cui(
4679	struct xfs_mount		*mp,
4680	struct xfs_ail			*ailp,
4681	struct xfs_log_item		*lip)
4682{
4683	struct xfs_cui_log_item		*cuip;
4684
4685	cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4686
4687	spin_unlock(&ailp->ail_lock);
4688	xfs_cui_release(cuip);
4689	spin_lock(&ailp->ail_lock);
4690}
4691
4692/* Recover the BUI if necessary. */
4693STATIC int
4694xlog_recover_process_bui(
4695	struct xfs_trans		*parent_tp,
4696	struct xfs_ail			*ailp,
4697	struct xfs_log_item		*lip)
4698{
4699	struct xfs_bui_log_item		*buip;
4700	int				error;
4701
4702	/*
4703	 * Skip BUIs that we've already processed.
4704	 */
4705	buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4706	if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
4707		return 0;
4708
4709	spin_unlock(&ailp->ail_lock);
4710	error = xfs_bui_recover(parent_tp, buip);
4711	spin_lock(&ailp->ail_lock);
4712
4713	return error;
4714}
4715
4716/* Release the BUI since we're cancelling everything. */
4717STATIC void
4718xlog_recover_cancel_bui(
4719	struct xfs_mount		*mp,
4720	struct xfs_ail			*ailp,
4721	struct xfs_log_item		*lip)
4722{
4723	struct xfs_bui_log_item		*buip;
4724
4725	buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4726
4727	spin_unlock(&ailp->ail_lock);
4728	xfs_bui_release(buip);
4729	spin_lock(&ailp->ail_lock);
4730}
4731
4732/* Is this log item a deferred action intent? */
4733static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4734{
4735	switch (lip->li_type) {
4736	case XFS_LI_EFI:
4737	case XFS_LI_RUI:
4738	case XFS_LI_CUI:
4739	case XFS_LI_BUI:
4740		return true;
4741	default:
4742		return false;
4743	}
4744}
4745
4746/* Take all the collected deferred ops and finish them in order. */
4747static int
4748xlog_finish_defer_ops(
4749	struct xfs_trans	*parent_tp)
4750{
4751	struct xfs_mount	*mp = parent_tp->t_mountp;
4752	struct xfs_trans	*tp;
4753	int64_t			freeblks;
4754	uint			resblks;
4755	int			error;
4756
4757	/*
4758	 * We're finishing the defer_ops that accumulated as a result of
4759	 * recovering unfinished intent items during log recovery.  We
4760	 * reserve an itruncate transaction because it is the largest
4761	 * permanent transaction type.  Since we're the only user of the fs
4762	 * right now, take 93% (15/16) of the available free blocks.  Use
4763	 * weird math to avoid a 64-bit division.
4764	 */
4765	freeblks = percpu_counter_sum(&mp->m_fdblocks);
4766	if (freeblks <= 0)
4767		return -ENOSPC;
4768	resblks = min_t(int64_t, UINT_MAX, freeblks);
4769	resblks = (resblks * 15) >> 4;
4770	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
4771			0, XFS_TRANS_RESERVE, &tp);
4772	if (error)
4773		return error;
4774	/* transfer all collected dfops to this transaction */
4775	xfs_defer_move(tp, parent_tp);
4776
4777	return xfs_trans_commit(tp);
4778}
4779
/*
 * When this is called, all of the log intent items which did not have
 * corresponding log done items should be in the AIL.  What we do now
 * is update the data structures associated with each one.
 *
 * Since we process the log intent items in normal transactions, they
 * will be removed at some point after the commit.  This prevents us
 * from just walking down the list processing each one.  We'll use a
 * flag in the intent item to skip those that we've already processed
 * and use the AIL iteration mechanism's generation count to try to
 * speed this up at least a bit.
 *
 * When we start, we know that the intents are the only things in the
 * AIL.  As we process them, however, other items are added to the
 * AIL.
 *
 * Returns 0 on success, or the first error from allocating the container
 * transaction, from an intent recovery handler, or from finishing the
 * collected deferred ops.
 */
STATIC int
xlog_recover_process_intents(
	struct xlog		*log)
{
	struct xfs_trans	*parent_tp;
	struct xfs_ail_cursor	cur;
	struct xfs_log_item	*lip;
	struct xfs_ail		*ailp;
	int			error;
#if defined(DEBUG) || defined(XFS_WARN)
	/* Only referenced from the ASSERT in the loop below. */
	xfs_lsn_t		last_lsn;
#endif

	/*
	 * The intent recovery handlers commit transactions to complete recovery
	 * for individual intents, but any new deferred operations that are
	 * queued during that process are held off until the very end. The
	 * purpose of this transaction is to serve as a container for deferred
	 * operations. Each intent recovery handler must transfer dfops here
	 * before its local transaction commits, and we'll finish the entire
	 * list below.
	 */
	error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
	if (error)
		return error;

	/* Walk the AIL from its first item with the AIL lock held. */
	ailp = log->l_ailp;
	spin_lock(&ailp->ail_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
	last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
	while (lip != NULL) {
		/*
		 * We're done when we see something other than an intent.
		 * There should be no intents left in the AIL now.
		 */
		if (!xlog_item_is_intent(lip)) {
#ifdef DEBUG
			/* Debug builds verify the rest of the AIL as well. */
			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
				ASSERT(!xlog_item_is_intent(lip));
#endif
			break;
		}

		/*
		 * We should never see a redo item with a LSN higher than
		 * the last transaction we found in the log at the start
		 * of recovery.
		 */
		ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);

		/*
		 * NOTE: If your intent processing routine can create more
		 * deferred ops, you /must/ attach them to the dfops in this
		 * routine or else those subsequent intents will get
		 * replayed in the wrong order!
		 *
		 * Each handler below drops and retakes ail_lock around the
		 * actual recovery call.
		 */
		switch (lip->li_type) {
		case XFS_LI_EFI:
			error = xlog_recover_process_efi(log->l_mp, ailp, lip);
			break;
		case XFS_LI_RUI:
			error = xlog_recover_process_rui(log->l_mp, ailp, lip);
			break;
		case XFS_LI_CUI:
			error = xlog_recover_process_cui(parent_tp, ailp, lip);
			break;
		case XFS_LI_BUI:
			error = xlog_recover_process_bui(parent_tp, ailp, lip);
			break;
		}
		if (error)
			goto out;
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
out:
	xfs_trans_ail_cursor_done(&cur);
	spin_unlock(&ailp->ail_lock);
	/* Only finish the collected deferred ops if every intent recovered. */
	if (!error)
		error = xlog_finish_defer_ops(parent_tp);
	/* The container transaction is cancelled on success and on failure. */
	xfs_trans_cancel(parent_tp);

	return error;
}
4881
4882/*
4883 * A cancel occurs when the mount has failed and we're bailing out.
4884 * Release all pending log intent items so they don't pin the AIL.
4885 */
4886STATIC void
4887xlog_recover_cancel_intents(
4888	struct xlog		*log)
4889{
4890	struct xfs_log_item	*lip;
4891	struct xfs_ail_cursor	cur;
4892	struct xfs_ail		*ailp;
4893
4894	ailp = log->l_ailp;
4895	spin_lock(&ailp->ail_lock);
4896	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4897	while (lip != NULL) {
4898		/*
4899		 * We're done when we see something other than an intent.
4900		 * There should be no intents left in the AIL now.
4901		 */
4902		if (!xlog_item_is_intent(lip)) {
4903#ifdef DEBUG
4904			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4905				ASSERT(!xlog_item_is_intent(lip));
4906#endif
4907			break;
4908		}
4909
4910		switch (lip->li_type) {
4911		case XFS_LI_EFI:
4912			xlog_recover_cancel_efi(log->l_mp, ailp, lip);
4913			break;
4914		case XFS_LI_RUI:
4915			xlog_recover_cancel_rui(log->l_mp, ailp, lip);
4916			break;
4917		case XFS_LI_CUI:
4918			xlog_recover_cancel_cui(log->l_mp, ailp, lip);
4919			break;
4920		case XFS_LI_BUI:
4921			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
4922			break;
4923		}
4924
4925		lip = xfs_trans_ail_cursor_next(ailp, &cur);
4926	}
4927
4928	xfs_trans_ail_cursor_done(&cur);
4929	spin_unlock(&ailp->ail_lock);
4930}
4931
4932/*
4933 * This routine performs a transaction to null out a bad inode pointer
4934 * in an agi unlinked inode hash bucket.
4935 */
4936STATIC void
4937xlog_recover_clear_agi_bucket(
4938	xfs_mount_t	*mp,
4939	xfs_agnumber_t	agno,
4940	int		bucket)
4941{
4942	xfs_trans_t	*tp;
4943	xfs_agi_t	*agi;
4944	xfs_buf_t	*agibp;
4945	int		offset;
4946	int		error;
4947
4948	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
4949	if (error)
4950		goto out_error;
4951
4952	error = xfs_read_agi(mp, tp, agno, &agibp);
4953	if (error)
4954		goto out_abort;
4955
4956	agi = XFS_BUF_TO_AGI(agibp);
4957	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
4958	offset = offsetof(xfs_agi_t, agi_unlinked) +
4959		 (sizeof(xfs_agino_t) * bucket);
4960	xfs_trans_log_buf(tp, agibp, offset,
4961			  (offset + sizeof(xfs_agino_t) - 1));
4962
4963	error = xfs_trans_commit(tp);
4964	if (error)
4965		goto out_error;
4966	return;
4967
4968out_abort:
4969	xfs_trans_cancel(tp);
4970out_error:
4971	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
4972	return;
4973}
4974
4975STATIC xfs_agino_t
4976xlog_recover_process_one_iunlink(
4977	struct xfs_mount		*mp,
4978	xfs_agnumber_t			agno,
4979	xfs_agino_t			agino,
4980	int				bucket)
4981{
4982	struct xfs_buf			*ibp;
4983	struct xfs_dinode		*dip;
4984	struct xfs_inode		*ip;
4985	xfs_ino_t			ino;
4986	int				error;
4987
4988	ino = XFS_AGINO_TO_INO(mp, agno, agino);
4989	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
4990	if (error)
4991		goto fail;
4992
4993	/*
4994	 * Get the on disk inode to find the next inode in the bucket.
4995	 */
4996	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
4997	if (error)
4998		goto fail_iput;
4999
5000	xfs_iflags_clear(ip, XFS_IRECOVERY);
5001	ASSERT(VFS_I(ip)->i_nlink == 0);
5002	ASSERT(VFS_I(ip)->i_mode != 0);
5003
5004	/* setup for the next pass */
5005	agino = be32_to_cpu(dip->di_next_unlinked);
5006	xfs_buf_relse(ibp);
5007
5008	/*
5009	 * Prevent any DMAPI event from being sent when the reference on
5010	 * the inode is dropped.
5011	 */
5012	ip->i_d.di_dmevmask = 0;
5013
5014	xfs_irele(ip);
5015	return agino;
5016
5017 fail_iput:
5018	xfs_irele(ip);
5019 fail:
5020	/*
5021	 * We can't read in the inode this bucket points to, or this inode
5022	 * is messed up.  Just ditch this bucket of inodes.  We will lose
5023	 * some inodes and space, but at least we won't hang.
5024	 *
5025	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
5026	 * clear the inode pointer in the bucket.
5027	 */
5028	xlog_recover_clear_agi_bucket(mp, agno, bucket);
5029	return NULLAGINO;
5030}
5031
5032/*
5033 * Recover AGI unlinked lists
5034 *
5035 * This is called during recovery to process any inodes which we unlinked but
5036 * not freed when the system crashed.  These inodes will be on the lists in the
5037 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5038 * any inodes found on the lists. Each inode is removed from the lists when it
5039 * has been fully truncated and is freed. The freeing of the inode and its
5040 * removal from the list must be atomic.
5041 *
5042 * If everything we touch in the agi processing loop is already in memory, this
5043 * loop can hold the cpu for a long time. It runs without lock contention,
5044 * memory allocation contention, the need wait for IO, etc, and so will run
5045 * until we either run out of inodes to process, run low on memory or we run out
5046 * of log space.
5047 *
5048 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
5049 * and can prevent other filesytem work (such as CIL pushes) from running. This
5050 * can lead to deadlocks if the recovery process runs out of log reservation
5051 * space. Hence we need to yield the CPU when there is other kernel work
5052 * scheduled on this CPU to ensure other scheduled work can run without undue
5053 * latency.
5054 */
5055STATIC void
5056xlog_recover_process_iunlinks(
5057	struct xlog	*log)
5058{
5059	xfs_mount_t	*mp;
5060	xfs_agnumber_t	agno;
5061	xfs_agi_t	*agi;
5062	xfs_buf_t	*agibp;
5063	xfs_agino_t	agino;
5064	int		bucket;
5065	int		error;
5066
5067	mp = log->l_mp;
5068
5069	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
5070		/*
5071		 * Find the agi for this ag.
5072		 */
5073		error = xfs_read_agi(mp, NULL, agno, &agibp);
5074		if (error) {
5075			/*
5076			 * AGI is b0rked. Don't process it.
5077			 *
5078			 * We should probably mark the filesystem as corrupt
5079			 * after we've recovered all the ag's we can....
5080			 */
5081			continue;
5082		}
5083		/*
5084		 * Unlock the buffer so that it can be acquired in the normal
5085		 * course of the transaction to truncate and free each inode.
5086		 * Because we are not racing with anyone else here for the AGI
5087		 * buffer, we don't even need to hold it locked to read the
5088		 * initial unlinked bucket entries out of the buffer. We keep
5089		 * buffer reference though, so that it stays pinned in memory
5090		 * while we need the buffer.
5091		 */
5092		agi = XFS_BUF_TO_AGI(agibp);
5093		xfs_buf_unlock(agibp);
5094
5095		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
5096			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
5097			while (agino != NULLAGINO) {
5098				agino = xlog_recover_process_one_iunlink(mp,
5099							agno, agino, bucket);
5100				cond_resched();
5101			}
5102		}
5103		xfs_buf_rele(agibp);
5104	}
5105}
5106
5107STATIC void
5108xlog_unpack_data(
5109	struct xlog_rec_header	*rhead,
5110	char			*dp,
5111	struct xlog		*log)
5112{
5113	int			i, j, k;
5114
5115	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
5116		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
5117		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
5118		dp += BBSIZE;
5119	}
5120
5121	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
5122		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
5123		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
5124			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
5125			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
5126			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
5127			dp += BBSIZE;
5128		}
5129	}
5130}
5131
5132/*
5133 * CRC check, unpack and process a log record.
5134 */
5135STATIC int
5136xlog_recover_process(
5137	struct xlog		*log,
5138	struct hlist_head	rhash[],
5139	struct xlog_rec_header	*rhead,
5140	char			*dp,
5141	int			pass,
5142	struct list_head	*buffer_list)
5143{
5144	__le32			old_crc = rhead->h_crc;
5145	__le32			crc;
5146
5147	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
5148
5149	/*
5150	 * Nothing else to do if this is a CRC verification pass. Just return
5151	 * if this a record with a non-zero crc. Unfortunately, mkfs always
5152	 * sets old_crc to 0 so we must consider this valid even on v5 supers.
5153	 * Otherwise, return EFSBADCRC on failure so the callers up the stack
5154	 * know precisely what failed.
5155	 */
5156	if (pass == XLOG_RECOVER_CRCPASS) {
5157		if (old_crc && crc != old_crc)
5158			return -EFSBADCRC;
5159		return 0;
5160	}
5161
5162	/*
5163	 * We're in the normal recovery path. Issue a warning if and only if the
5164	 * CRC in the header is non-zero. This is an advisory warning and the
5165	 * zero CRC check prevents warnings from being emitted when upgrading
5166	 * the kernel from one that does not add CRCs by default.
5167	 */
5168	if (crc != old_crc) {
5169		if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
5170			xfs_alert(log->l_mp,
5171		"log record CRC mismatch: found 0x%x, expected 0x%x.",
5172					le32_to_cpu(old_crc),
5173					le32_to_cpu(crc));
5174			xfs_hex_dump(dp, 32);
5175		}
5176
5177		/*
5178		 * If the filesystem is CRC enabled, this mismatch becomes a
5179		 * fatal log corruption failure.
5180		 */
5181		if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
5182			XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
5183			return -EFSCORRUPTED;
5184		}
5185	}
5186
5187	xlog_unpack_data(rhead, dp, log);
5188
5189	return xlog_recover_process_data(log, rhash, rhead, dp, pass,
5190					 buffer_list);
5191}
5192
5193STATIC int
5194xlog_valid_rec_header(
5195	struct xlog		*log,
5196	struct xlog_rec_header	*rhead,
5197	xfs_daddr_t		blkno)
5198{
5199	int			hlen;
5200
5201	if (XFS_IS_CORRUPT(log->l_mp,
5202			   rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
5203		return -EFSCORRUPTED;
5204	if (XFS_IS_CORRUPT(log->l_mp,
5205			   (!rhead->h_version ||
5206			   (be32_to_cpu(rhead->h_version) &
5207			    (~XLOG_VERSION_OKBITS))))) {
5208		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
5209			__func__, be32_to_cpu(rhead->h_version));
5210		return -EFSCORRUPTED;
5211	}
5212
5213	/* LR body must have data or it wouldn't have been written */
5214	hlen = be32_to_cpu(rhead->h_len);
5215	if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX))
5216		return -EFSCORRUPTED;
5217	if (XFS_IS_CORRUPT(log->l_mp,
5218			   blkno > log->l_logBBsize || blkno > INT_MAX))
5219		return -EFSCORRUPTED;
5220	return 0;
5221}
5222
5223/*
5224 * Read the log from tail to head and process the log records found.
5225 * Handle the two cases where the tail and head are in the same cycle
5226 * and where the active portion of the log wraps around the end of
5227 * the physical log separately.  The pass parameter is passed through
5228 * to the routines called to process the data and is not looked at
5229 * here.
5230 */
5231STATIC int
5232xlog_do_recovery_pass(
5233	struct xlog		*log,
5234	xfs_daddr_t		head_blk,
5235	xfs_daddr_t		tail_blk,
5236	int			pass,
5237	xfs_daddr_t		*first_bad)	/* out: first bad log rec */
5238{
5239	xlog_rec_header_t	*rhead;
5240	xfs_daddr_t		blk_no, rblk_no;
5241	xfs_daddr_t		rhead_blk;
5242	char			*offset;
5243	char			*hbp, *dbp;
5244	int			error = 0, h_size, h_len;
5245	int			error2 = 0;
5246	int			bblks, split_bblks;
5247	int			hblks, split_hblks, wrapped_hblks;
5248	int			i;
5249	struct hlist_head	rhash[XLOG_RHASH_SIZE];
5250	LIST_HEAD		(buffer_list);
5251
5252	ASSERT(head_blk != tail_blk);
5253	blk_no = rhead_blk = tail_blk;
5254
5255	for (i = 0; i < XLOG_RHASH_SIZE; i++)
5256		INIT_HLIST_HEAD(&rhash[i]);
5257
5258	/*
5259	 * Read the header of the tail block and get the iclog buffer size from
5260	 * h_size.  Use this to tell how many sectors make up the log header.
5261	 */
5262	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
5263		/*
5264		 * When using variable length iclogs, read first sector of
5265		 * iclog header and extract the header size from it.  Get a
5266		 * new hbp that is the correct size.
5267		 */
5268		hbp = xlog_alloc_buffer(log, 1);
5269		if (!hbp)
5270			return -ENOMEM;
5271
5272		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
5273		if (error)
5274			goto bread_err1;
5275
5276		rhead = (xlog_rec_header_t *)offset;
5277		error = xlog_valid_rec_header(log, rhead, tail_blk);
5278		if (error)
5279			goto bread_err1;
5280
5281		/*
5282		 * xfsprogs has a bug where record length is based on lsunit but
5283		 * h_size (iclog size) is hardcoded to 32k. Now that we
5284		 * unconditionally CRC verify the unmount record, this means the
5285		 * log buffer can be too small for the record and cause an
5286		 * overrun.
5287		 *
5288		 * Detect this condition here. Use lsunit for the buffer size as
5289		 * long as this looks like the mkfs case. Otherwise, return an
5290		 * error to avoid a buffer overrun.
5291		 */
5292		h_size = be32_to_cpu(rhead->h_size);
5293		h_len = be32_to_cpu(rhead->h_len);
5294		if (h_len > h_size) {
5295			if (h_len <= log->l_mp->m_logbsize &&
5296			    be32_to_cpu(rhead->h_num_logops) == 1) {
5297				xfs_warn(log->l_mp,
5298		"invalid iclog size (%d bytes), using lsunit (%d bytes)",
5299					 h_size, log->l_mp->m_logbsize);
5300				h_size = log->l_mp->m_logbsize;
5301			} else {
5302				XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
5303						log->l_mp);
5304				error = -EFSCORRUPTED;
5305				goto bread_err1;
5306			}
5307		}
5308
5309		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
5310		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
5311			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
5312			if (h_size % XLOG_HEADER_CYCLE_SIZE)
5313				hblks++;
5314			kmem_free(hbp);
5315			hbp = xlog_alloc_buffer(log, hblks);
5316		} else {
5317			hblks = 1;
5318		}
5319	} else {
5320		ASSERT(log->l_sectBBsize == 1);
5321		hblks = 1;
5322		hbp = xlog_alloc_buffer(log, 1);
5323		h_size = XLOG_BIG_RECORD_BSIZE;
5324	}
5325
5326	if (!hbp)
5327		return -ENOMEM;
5328	dbp = xlog_alloc_buffer(log, BTOBB(h_size));
5329	if (!dbp) {
5330		kmem_free(hbp);
5331		return -ENOMEM;
5332	}
5333
5334	memset(rhash, 0, sizeof(rhash));
5335	if (tail_blk > head_blk) {
5336		/*
5337		 * Perform recovery around the end of the physical log.
5338		 * When the head is not on the same cycle number as the tail,
5339		 * we can't do a sequential recovery.
5340		 */
5341		while (blk_no < log->l_logBBsize) {
5342			/*
5343			 * Check for header wrapping around physical end-of-log
5344			 */
5345			offset = hbp;
5346			split_hblks = 0;
5347			wrapped_hblks = 0;
5348			if (blk_no + hblks <= log->l_logBBsize) {
5349				/* Read header in one read */
5350				error = xlog_bread(log, blk_no, hblks, hbp,
5351						   &offset);
5352				if (error)
5353					goto bread_err2;
5354			} else {
5355				/* This LR is split across physical log end */
5356				if (blk_no != log->l_logBBsize) {
5357					/* some data before physical log end */
5358					ASSERT(blk_no <= INT_MAX);
5359					split_hblks = log->l_logBBsize - (int)blk_no;
5360					ASSERT(split_hblks > 0);
5361					error = xlog_bread(log, blk_no,
5362							   split_hblks, hbp,
5363							   &offset);
5364					if (error)
5365						goto bread_err2;
5366				}
5367
5368				/*
5369				 * Note: this black magic still works with
5370				 * large sector sizes (non-512) only because:
5371				 * - we increased the buffer size originally
5372				 *   by 1 sector giving us enough extra space
5373				 *   for the second read;
5374				 * - the log start is guaranteed to be sector
5375				 *   aligned;
5376				 * - we read the log end (LR header start)
5377				 *   _first_, then the log start (LR header end)
5378				 *   - order is important.
5379				 */
5380				wrapped_hblks = hblks - split_hblks;
5381				error = xlog_bread_noalign(log, 0,
5382						wrapped_hblks,
5383						offset + BBTOB(split_hblks));
5384				if (error)
5385					goto bread_err2;
5386			}
5387			rhead = (xlog_rec_header_t *)offset;
5388			error = xlog_valid_rec_header(log, rhead,
5389						split_hblks ? blk_no : 0);
5390			if (error)
5391				goto bread_err2;
5392
5393			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
5394			blk_no += hblks;
5395
5396			/*
5397			 * Read the log record data in multiple reads if it
5398			 * wraps around the end of the log. Note that if the
5399			 * header already wrapped, blk_no could point past the
5400			 * end of the log. The record data is contiguous in
5401			 * that case.
5402			 */
5403			if (blk_no + bblks <= log->l_logBBsize ||
5404			    blk_no >= log->l_logBBsize) {
5405				rblk_no = xlog_wrap_logbno(log, blk_no);
5406				error = xlog_bread(log, rblk_no, bblks, dbp,
5407						   &offset);
5408				if (error)
5409					goto bread_err2;
5410			} else {
5411				/* This log record is split across the
5412				 * physical end of log */
5413				offset = dbp;
5414				split_bblks = 0;
5415				if (blk_no != log->l_logBBsize) {
5416					/* some data is before the physical
5417					 * end of log */
5418					ASSERT(!wrapped_hblks);
5419					ASSERT(blk_no <= INT_MAX);
5420					split_bblks =
5421						log->l_logBBsize - (int)blk_no;
5422					ASSERT(split_bblks > 0);
5423					error = xlog_bread(log, blk_no,
5424							split_bblks, dbp,
5425							&offset);
5426					if (error)
5427						goto bread_err2;
5428				}
5429
5430				/*
5431				 * Note: this black magic still works with
5432				 * large sector sizes (non-512) only because:
5433				 * - we increased the buffer size originally
5434				 *   by 1 sector giving us enough extra space
5435				 *   for the second read;
5436				 * - the log start is guaranteed to be sector
5437				 *   aligned;
5438				 * - we read the log end (LR header start)
5439				 *   _first_, then the log start (LR header end)
5440				 *   - order is important.
5441				 */
5442				error = xlog_bread_noalign(log, 0,
5443						bblks - split_bblks,
5444						offset + BBTOB(split_bblks));
5445				if (error)
5446					goto bread_err2;
5447			}
5448
5449			error = xlog_recover_process(log, rhash, rhead, offset,
5450						     pass, &buffer_list);
5451			if (error)
5452				goto bread_err2;
5453
5454			blk_no += bblks;
5455			rhead_blk = blk_no;
5456		}
5457
5458		ASSERT(blk_no >= log->l_logBBsize);
5459		blk_no -= log->l_logBBsize;
5460		rhead_blk = blk_no;
5461	}
5462
5463	/* read first part of physical log */
5464	while (blk_no < head_blk) {
5465		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
5466		if (error)
5467			goto bread_err2;
5468
5469		rhead = (xlog_rec_header_t *)offset;
5470		error = xlog_valid_rec_header(log, rhead, blk_no);
5471		if (error)
5472			goto bread_err2;
5473
5474		/* blocks in data section */
5475		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
5476		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
5477				   &offset);
5478		if (error)
5479			goto bread_err2;
5480
5481		error = xlog_recover_process(log, rhash, rhead, offset, pass,
5482					     &buffer_list);
5483		if (error)
5484			goto bread_err2;
5485
5486		blk_no += bblks + hblks;
5487		rhead_blk = blk_no;
5488	}
5489
5490 bread_err2:
5491	kmem_free(dbp);
5492 bread_err1:
5493	kmem_free(hbp);
5494
5495	/*
5496	 * Submit buffers that have been added from the last record processed,
5497	 * regardless of error status.
5498	 */
5499	if (!list_empty(&buffer_list))
5500		error2 = xfs_buf_delwri_submit(&buffer_list);
5501
5502	if (error && first_bad)
5503		*first_bad = rhead_blk;
5504
5505	/*
5506	 * Transactions are freed at commit time but transactions without commit
5507	 * records on disk are never committed. Free any that may be left in the
5508	 * hash table.
5509	 */
5510	for (i = 0; i < XLOG_RHASH_SIZE; i++) {
5511		struct hlist_node	*tmp;
5512		struct xlog_recover	*trans;
5513
5514		hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
5515			xlog_recover_free_trans(trans);
5516	}
5517
5518	return error ? error : error2;
5519}
5520
5521/*
5522 * Do the recovery of the log.  We actually do this in two phases.
5523 * The two passes are necessary in order to implement the function
5524 * of cancelling a record written into the log.  The first pass
5525 * determines those things which have been cancelled, and the
5526 * second pass replays log items normally except for those which
5527 * have been cancelled.  The handling of the replay and cancellations
5528 * takes place in the log item type specific routines.
5529 *
5530 * The table of items which have cancel records in the log is allocated
5531 * and freed at this level, since only here do we know when all of
5532 * the log recovery has been completed.
5533 */
5534STATIC int
5535xlog_do_log_recovery(
5536	struct xlog	*log,
5537	xfs_daddr_t	head_blk,
5538	xfs_daddr_t	tail_blk)
5539{
5540	int		error, i;
5541
5542	ASSERT(head_blk != tail_blk);
5543
5544	/*
5545	 * First do a pass to find all of the cancelled buf log items.
5546	 * Store them in the buf_cancel_table for use in the second pass.
5547	 */
5548	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5549						 sizeof(struct list_head),
5550						 0);
5551	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5552		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5553
5554	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5555				      XLOG_RECOVER_PASS1, NULL);
5556	if (error != 0) {
5557		kmem_free(log->l_buf_cancel_table);
5558		log->l_buf_cancel_table = NULL;
5559		return error;
5560	}
5561	/*
5562	 * Then do a second pass to actually recover the items in the log.
5563	 * When it is complete free the table of buf cancel items.
5564	 */
5565	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5566				      XLOG_RECOVER_PASS2, NULL);
5567#ifdef DEBUG
5568	if (!error) {
5569		int	i;
5570
5571		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5572			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
5573	}
5574#endif	/* DEBUG */
5575
5576	kmem_free(log->l_buf_cancel_table);
5577	log->l_buf_cancel_table = NULL;
5578
5579	return error;
5580}
5581
5582/*
5583 * Do the actual recovery
5584 */
5585STATIC int
5586xlog_do_recover(
5587	struct xlog	*log,
5588	xfs_daddr_t	head_blk,
5589	xfs_daddr_t	tail_blk)
5590{
5591	struct xfs_mount *mp = log->l_mp;
5592	int		error;
5593	xfs_buf_t	*bp;
5594	xfs_sb_t	*sbp;
5595
5596	trace_xfs_log_recover(log, head_blk, tail_blk);
5597
5598	/*
5599	 * First replay the images in the log.
5600	 */
5601	error = xlog_do_log_recovery(log, head_blk, tail_blk);
5602	if (error)
5603		return error;
5604
5605	/*
5606	 * If IO errors happened during recovery, bail out.
5607	 */
5608	if (XFS_FORCED_SHUTDOWN(mp)) {
5609		return -EIO;
5610	}
5611
5612	/*
5613	 * We now update the tail_lsn since much of the recovery has completed
5614	 * and there may be space available to use.  If there were no extent
5615	 * or iunlinks, we can free up the entire log and set the tail_lsn to
5616	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
5617	 * lsn of the last known good LR on disk.  If there are extent frees
5618	 * or iunlinks they will have some entries in the AIL; so we look at
5619	 * the AIL to determine how to set the tail_lsn.
5620	 */
5621	xlog_assign_tail_lsn(mp);
5622
5623	/*
5624	 * Now that we've finished replaying all buffer and inode
5625	 * updates, re-read in the superblock and reverify it.
5626	 */
5627	bp = xfs_getsb(mp);
5628	bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
5629	ASSERT(!(bp->b_flags & XBF_WRITE));
5630	bp->b_flags |= XBF_READ;
5631	bp->b_ops = &xfs_sb_buf_ops;
5632
5633	error = xfs_buf_submit(bp);
5634	if (error) {
5635		if (!XFS_FORCED_SHUTDOWN(mp)) {
5636			xfs_buf_ioerror_alert(bp, __func__);
5637			ASSERT(0);
5638		}
5639		xfs_buf_relse(bp);
5640		return error;
5641	}
5642
5643	/* Convert superblock from on-disk format */
5644	sbp = &mp->m_sb;
5645	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
5646	xfs_buf_relse(bp);
5647
5648	/* re-initialise in-core superblock and geometry structures */
5649	xfs_reinit_percpu_counters(mp);
5650	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
5651	if (error) {
5652		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
5653		return error;
5654	}
5655	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
5656
5657	xlog_recover_check_summary(log);
5658
5659	/* Normal transactions can now occur */
5660	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
5661	return 0;
5662}
5663
5664/*
5665 * Perform recovery and re-initialize some log variables in xlog_find_tail.
5666 *
5667 * Return error or zero.
5668 */
5669int
5670xlog_recover(
5671	struct xlog	*log)
5672{
5673	xfs_daddr_t	head_blk, tail_blk;
5674	int		error;
5675
5676	/* find the tail of the log */
5677	error = xlog_find_tail(log, &head_blk, &tail_blk);
5678	if (error)
5679		return error;
5680
5681	/*
5682	 * The superblock was read before the log was available and thus the LSN
5683	 * could not be verified. Check the superblock LSN against the current
5684	 * LSN now that it's known.
5685	 */
5686	if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
5687	    !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
5688		return -EINVAL;
5689
5690	if (tail_blk != head_blk) {
5691		/* There used to be a comment here:
5692		 *
5693		 * disallow recovery on read-only mounts.  note -- mount
5694		 * checks for ENOSPC and turns it into an intelligent
5695		 * error message.
5696		 * ...but this is no longer true.  Now, unless you specify
5697		 * NORECOVERY (in which case this function would never be
5698		 * called), we just go ahead and recover.  We do this all
5699		 * under the vfs layer, so we can get away with it unless
5700		 * the device itself is read-only, in which case we fail.
5701		 */
5702		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
5703			return error;
5704		}
5705
5706		/*
5707		 * Version 5 superblock log feature mask validation. We know the
5708		 * log is dirty so check if there are any unknown log features
5709		 * in what we need to recover. If there are unknown features
5710		 * (e.g. unsupported transactions, then simply reject the
5711		 * attempt at recovery before touching anything.
5712		 */
5713		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
5714		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
5715					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
5716			xfs_warn(log->l_mp,
5717"Superblock has unknown incompatible log features (0x%x) enabled.",
5718				(log->l_mp->m_sb.sb_features_log_incompat &
5719					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
5720			xfs_warn(log->l_mp,
5721"The log can not be fully and/or safely recovered by this kernel.");
5722			xfs_warn(log->l_mp,
5723"Please recover the log on a kernel that supports the unknown features.");
5724			return -EINVAL;
5725		}
5726
5727		/*
5728		 * Delay log recovery if the debug hook is set. This is debug
5729		 * instrumention to coordinate simulation of I/O failures with
5730		 * log recovery.
5731		 */
5732		if (xfs_globals.log_recovery_delay) {
5733			xfs_notice(log->l_mp,
5734				"Delaying log recovery for %d seconds.",
5735				xfs_globals.log_recovery_delay);
5736			msleep(xfs_globals.log_recovery_delay * 1000);
5737		}
5738
5739		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
5740				log->l_mp->m_logname ? log->l_mp->m_logname
5741						     : "internal");
5742
5743		error = xlog_do_recover(log, head_blk, tail_blk);
5744		log->l_flags |= XLOG_RECOVERY_NEEDED;
5745	}
5746	return error;
5747}
5748
5749/*
5750 * In the first part of recovery we replay inodes and buffers and build
5751 * up the list of extent free items which need to be processed.  Here
5752 * we process the extent free items and clean up the on disk unlinked
5753 * inode lists.  This is separated from the first part of recovery so
5754 * that the root and real-time bitmap inodes can be read in from disk in
5755 * between the two stages.  This is necessary so that we can free space
5756 * in the real-time portion of the file system.
5757 */
5758int
5759xlog_recover_finish(
5760	struct xlog	*log)
5761{
5762	/*
5763	 * Now we're ready to do the transactions needed for the
5764	 * rest of recovery.  Start with completing all the extent
5765	 * free intent records and then process the unlinked inode
5766	 * lists.  At this point, we essentially run in normal mode
5767	 * except that we're still performing recovery actions
5768	 * rather than accepting new requests.
5769	 */
5770	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
5771		int	error;
5772		error = xlog_recover_process_intents(log);
5773		if (error) {
5774			xfs_alert(log->l_mp, "Failed to recover intents");
5775			return error;
5776		}
5777
5778		/*
5779		 * Sync the log to get all the intents out of the AIL.
5780		 * This isn't absolutely necessary, but it helps in
5781		 * case the unlink transactions would have problems
5782		 * pushing the intents out of the way.
5783		 */
5784		xfs_log_force(log->l_mp, XFS_LOG_SYNC);
5785
5786		xlog_recover_process_iunlinks(log);
5787
5788		xlog_recover_check_summary(log);
5789
5790		xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
5791				log->l_mp->m_logname ? log->l_mp->m_logname
5792						     : "internal");
5793		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
5794	} else {
5795		xfs_info(log->l_mp, "Ending clean mount");
5796	}
5797	return 0;
5798}
5799
5800void
5801xlog_recover_cancel(
5802	struct xlog	*log)
5803{
5804	if (log->l_flags & XLOG_RECOVERY_NEEDED)
5805		xlog_recover_cancel_intents(log);
5806}
5807
#if defined(DEBUG)
/*
 * Read the AGF and AGI of every allocation group and total up their free
 * block, inode and free inode counters, alerting about any header that cannot
 * be read.
 *
 * NOTE(review): the totals are accumulated but nothing in this function
 * compares them with the superblock counters or reports them.
 */
STATIC void
xlog_recover_check_summary(
	struct xlog	*log)
{
	xfs_mount_t	*mp = log->l_mp;
	xfs_agnumber_t	agno;
	xfs_buf_t	*agfbp;
	xfs_buf_t	*agibp;
	uint64_t	freeblks = 0LL;
	uint64_t	itotal = 0LL;
	uint64_t	ifree = 0LL;
	int		error;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/* Free space: blocks counted in the AGF plus its free list. */
		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
		if (!error) {
			xfs_agf_t	*agfp = XFS_BUF_TO_AGF(agfbp);

			freeblks += be32_to_cpu(agfp->agf_freeblks) +
				    be32_to_cpu(agfp->agf_flcount);
			xfs_buf_relse(agfbp);
		} else {
			xfs_alert(mp, "%s agf read failed agno %d error %d",
						__func__, agno, error);
		}

		/* Inode counts come from the AGI. */
		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (!error) {
			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);

			itotal += be32_to_cpu(agi->agi_count);
			ifree += be32_to_cpu(agi->agi_freecount);
			xfs_buf_relse(agibp);
		} else {
			xfs_alert(mp, "%s agi read failed agno %d error %d",
						__func__, agno, error);
		}
	}
}
#endif /* DEBUG */
Configure Feed

Configure Feed