// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"


struct kmem_cache	*xfs_buf_item_cache;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

static void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int	count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
			GFP_KERNEL | __GFP_NOFAIL);
}

static void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

static void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
static void
xfs_buf_item_relse(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf	*bp = bip->bli_buf;

	trace_xfs_buf_item_relse(bp, _RET_IP_);

	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
	ASSERT(atomic_read(&bip->bli_refcount) == 0);

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct kvec	*iovec)
{
	struct xfs_buf_log_format *blfp = iovec->iov_base;
	char	*bmp_end;
	char	*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
		return false;

	item_end = (char *)iovec->iov_base + iovec->iov_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
 * in a single iovec.
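 *
 * For example (illustrative): a dirty bitmap of 0b11100011 contains two
 * contiguous runs of set bits, so that segment would consume three iovecs
 * (one for the format structure plus one per run) and 5 * XFS_BLF_CHUNK
 * bytes of logged data in addition to the format structure itself.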
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_buf_log_format *blfp,
	uint	offset,
	int	*nvecs,
	int	*nbytes)
{
	int	first_bit;
	int	nbits;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there. It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * Compute the worst case log item overhead for an invalidated buffer with the
 * given map count and block size.
 */
unsigned int
xfs_buf_inval_log_space(
	unsigned int	map_count,
	unsigned int	blocksize)
{
	unsigned int	chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
	unsigned int	bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
	unsigned int	ret =
		offsetof(struct xfs_buf_log_format, blf_data_map) +
			(bitmap_size * sizeof_field(struct xfs_buf_log_format,
						blf_data_map[0]));

	return ret * map_count;
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int	*nvecs,
	int	*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	int	i;
	int	bytes;
	uint	offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it.
		 * It is not being included in the transaction commit, so no
		 * vectors are used at all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf	*bp,
	uint	offset,
	int	first_bit,
	uint	nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint	offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf	*bp = bip->bli_buf;
	uint	base_size;
	int	first_bit;
	uint	nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark
		 * the size as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there. It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item. It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint	offset = 0;
	int	i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					&bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation. If unpins race
 * to drop the final BLI reference and only the BLI owns a reference to the
 * buffer, then the loser of the race can have the buffer freed from under it
 * (e.g. on shutdown). Taking a buffer reference per pin count ensures the life
 * cycle of the buffer extends for as long as we hold the buffer pin reference
 * in xfs_buf_item_unpin().
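 *
 * In short, each pin takes a buffer hold, a BLI reference and a buffer pin
 * count, and xfs_buf_item_unpin() is later responsible for resolving all
 * three.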
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * For a stale BLI, process all the necessary completions that must be
 * performed when the final BLI reference goes away. The buffer will be
 * referenced and locked here - we return to the caller with the buffer still
 * referenced and locked for them to finalise processing of the buffer.
 */
static void
xfs_buf_item_finish_stale(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf	*bp = bip->bli_buf;
	struct xfs_log_item	*lip = &bip->bli_item;

	ASSERT(bip->bli_flags & XFS_BLI_STALE);
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(bp->b_flags & XBF_STALE);
	ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
	ASSERT(list_empty(&lip->li_trans));
	ASSERT(!bp->b_transp);

	if (bip->bli_flags & XFS_BLI_STALE_INODE) {
		xfs_buf_item_done(bp);
		xfs_buf_inode_iodone(bp);
		ASSERT(list_empty(&bp->b_li_list));
		return;
	}

	/*
	 * We may or may not be on the AIL here, xfs_trans_ail_delete() will do
	 * the right thing regardless of the situation in which we are called.
	 */
	xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
	xfs_buf_item_relse(bip);
	ASSERT(bp->b_log_item == NULL);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin(). We enter this
 * function with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin operation.
 * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 * are done processing the buffer state. In the case of an abort (remove =
 * true) then we re-use the current pin reference as the IO reference we hand
 * off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int	remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	int	stale = bip->bli_flags & XFS_BLI_STALE;
	int	freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was marked
		 * stale so we own both lock and reference exclusively here. We
		 * do not need the pin reference any more, so drop it now so
		 * that we only have one reference to drop once item completion
		 * processing is complete.
		 */
		xfs_buf_rele(bp);
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left for
	 * us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	uint	rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone
		 * else issues a log force to unpin the stale buffer. Check for
		 * the race condition here so xfsaild recognizes the buffer is
		 * pinned and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	"Failing async write on buffer block 0x%llx. Retrying async write.",
				(long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 */
void
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{

	ASSERT(xfs_buf_islocked(bip->bli_buf));

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return;

	/* If the BLI is in the AIL, then it is still dirty and in use */
	if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
		ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
		return;
	}

	/*
	 * In shutdown conditions, we can be asked to free a dirty BLI that
	 * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
	 * instead of inserting it into the AIL at checkpoint IO completion. If
	 * there's another bli reference (e.g. a btree cursor holds a clean
	 * reference) and it is released via xfs_trans_brelse(), we can get here
	 * with that aborted, dirty BLI. In this case, it is safe to free the
	 * dirty BLI immediately, as it is not in the AIL and there are no
	 * other references to it.
	 *
	 * We should never get here with a stale BLI via that path as
	 * xfs_trans_brelse() specifically holds onto stale buffers rather than
	 * releasing them.
	 */
	ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
	       test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
	xfs_buf_item_relse(bip);
}

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 *
 * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
 * perform a completion abort of any objects attached to the buffer for IO
 * tracking purposes. This generally only happens in shutdown situations,
 * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
 * completion processing. However, because transaction completion can race with
 * checkpoint completion during a shutdown, this release context may end up
 * being the last active reference to the BLI and so needs to perform this
 * cleanup.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	bool	hold = bip->bli_flags & XFS_BLI_HOLD;
	bool	stale = bip->bli_flags & XFS_BLI_STALE;
	bool	aborted = test_bit(XFS_LI_ABORTED,
				   &lip->li_flags);
	bool	dirty = bip->bli_flags & XFS_BLI_DIRTY;
#if defined(DEBUG) || defined(XFS_WARN)
	bool	ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif

	trace_xfs_buf_item_release(bip);

	ASSERT(xfs_buf_islocked(bp));

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/* If there are other references, then we have nothing to do. */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		goto out_release;

	/*
	 * Stale buffer completion frees the BLI, unlocks and releases the
	 * buffer. Neither the BLI nor the buffer is safe to reference after
	 * this call, so there's nothing more we need to do here.
	 *
	 * If we get here with a stale buffer and references to the BLI remain,
	 * we must not unlock the buffer as the last BLI reference owns lock
	 * context, not us.
	 */
	if (stale) {
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		ASSERT(!hold);
		return;
	}

	/*
	 * Dirty or clean, aborted items are done and need to be removed from
	 * the AIL and released. This frees the BLI, but leaves the buffer
	 * locked and referenced.
	 */
	if (aborted || xlog_is_shutdown(lip->li_log)) {
		ASSERT(list_empty(&bip->bli_buf->b_li_list));
		xfs_buf_item_done(bp);
		goto out_release;
	}

	/*
	 * Clean, unreferenced BLIs can be immediately freed, leaving the buffer
	 * locked and referenced.
	 *
	 * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
	 */
	if (!dirty)
		xfs_buf_item_relse(bip);
	else
		ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));

	/* Not safe to reference the BLI from here */
out_release:
	/*
	 * If we get here with a stale buffer, we must not unlock the
	 * buffer as the last BLI reference owns lock context, not us.
	 */
	if (stale || hold)
		return;
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t	seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters. For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery. If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log. We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t	lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int	chunks;
	int	map_size;
	int	i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
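	 *
	 * As an illustrative example (assuming the usual 128 byte
	 * XFS_BLF_CHUNK and 32 bit bitmap words): a single-map 4096 byte
	 * buffer divides into 32 chunks, which fit in one bitmap word, so
	 * map_size is 1.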
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			xfs_buf_item_free_format(bip);
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}


/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint	first,
	uint	last,
	uint	*map)
{
	uint	first_bit;
	uint	last_bit;
	uint	bits_to_set;
	uint	bits_set;
	uint	word_num;
	uint	*wordp;
	uint	bit;
	uint	end_bit;
	uint	mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here. That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far. End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint	first,
	uint	last)
{
	int	i;
	uint	start;
	uint	end;
	struct xfs_buf	*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
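	 *
	 * For example (illustrative): for a two-map buffer with 4096 bytes
	 * per map, logging bytes 4000-4200 dirties bytes 4000-4095 of the
	 * first segment and bytes 0-104 of the second, each passed to
	 * xfs_buf_item_log_segment() as segment-relative offsets.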
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int	i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

void
xfs_buf_item_done(
	struct xfs_buf	*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp->b_log_item);
}