// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"


struct kmem_cache	*xfs_buf_item_cache;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

static void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int	count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
			GFP_KERNEL | __GFP_NOFAIL);
}

static void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

static void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
static void
xfs_buf_item_relse(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf	*bp = bip->bli_buf;

	trace_xfs_buf_item_relse(bp, _RET_IP_);

	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
	ASSERT(atomic_read(&bip->bli_refcount) == 0);

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct kvec	*iovec)
{
	struct xfs_buf_log_format *blfp = iovec->iov_base;
	char	*bmp_end;
	char	*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
		return false;

	item_end = (char *)iovec->iov_base + iovec->iov_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
 * in a single iovec.
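 *
 * For example (illustrative): a dirty bitmap of 0b11100011 contains two
 * contiguous runs of set bits, so that segment would consume three iovecs
 * (one for the format structure plus one per run) and 5 * XFS_BLF_CHUNK
 * bytes of logged data in addition to the format structure itself.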
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_buf_log_format *blfp,
	uint	offset,
	int	*nvecs,
	int	*nbytes)
{
	int	first_bit;
	int	nbits;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there. It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * Compute the worst case log item overhead for an invalidated buffer with the
 * given map count and block size.
 */
unsigned int
xfs_buf_inval_log_space(
	unsigned int	map_count,
	unsigned int	blocksize)
{
	unsigned int	chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
	unsigned int	bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
	unsigned int	ret =
		offsetof(struct xfs_buf_log_format, blf_data_map) +
			(bitmap_size * sizeof_field(struct xfs_buf_log_format,
						blf_data_map[0]));

	return ret * map_count;
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int	*nvecs,
	int	*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	int	i;
	int	bytes;
	uint	offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it.
		 * It is not being included in the transaction commit, so no
		 * vectors are used at all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf	*bp,
	uint	offset,
	int	first_bit,
	uint	nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint	offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf	*bp = bip->bli_buf;
	uint	base_size;
	int	first_bit;
	uint	nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark
		 * the size as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there. It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item. It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint	offset = 0;
	int	i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					&bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation. If unpins race
 * to drop the final BLI reference and only the BLI owns a reference to the
 * buffer, then the loser of the race can have the buffer freed from under it
 * (e.g. on shutdown). Taking a buffer reference per pin count ensures the life
 * cycle of the buffer extends for as long as we hold the buffer pin reference
 * in xfs_buf_item_unpin().
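 *
 * In short, each pin takes a buffer hold, a BLI reference and a buffer pin
 * count, and xfs_buf_item_unpin() is later responsible for resolving all
 * three.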
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * For a stale BLI, process all the necessary completions that must be
 * performed when the final BLI reference goes away. The buffer will be
 * referenced and locked here - we return to the caller with the buffer still
 * referenced and locked for them to finalise processing of the buffer.
 */
static void
xfs_buf_item_finish_stale(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf	*bp = bip->bli_buf;
	struct xfs_log_item	*lip = &bip->bli_item;

	ASSERT(bip->bli_flags & XFS_BLI_STALE);
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(bp->b_flags & XBF_STALE);
	ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
	ASSERT(list_empty(&lip->li_trans));
	ASSERT(!bp->b_transp);

	if (bip->bli_flags & XFS_BLI_STALE_INODE) {
		xfs_buf_item_done(bp);
		xfs_buf_inode_iodone(bp);
		ASSERT(list_empty(&bp->b_li_list));
		return;
	}

	/*
	 * We may or may not be on the AIL here, xfs_trans_ail_delete() will do
	 * the right thing regardless of the situation in which we are called.
	 */
	xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
	xfs_buf_item_relse(bip);
	ASSERT(bp->b_log_item == NULL);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin(). We enter this
 * function with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin operation.
 * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 * are done processing the buffer state. In the case of an abort (remove =
 * true) then we re-use the current pin reference as the IO reference we hand
 * off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int	remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	int	stale = bip->bli_flags & XFS_BLI_STALE;
	int	freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was marked
		 * stale so we own both lock and reference exclusively here. We
		 * do not need the pin reference any more, so drop it now so
		 * that we only have one reference to drop once item completion
		 * processing is complete.
		 */
		xfs_buf_rele(bp);
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left for
	 * us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	uint	rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone
		 * else issues a log force to unpin the stale buffer. Check for
		 * the race condition here so xfsaild recognizes the buffer is
		 * pinned and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	"Failing async write on buffer block 0x%llx. Retrying async write.",
				(long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 */
void
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{

	ASSERT(xfs_buf_islocked(bip->bli_buf));

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return;

	/* If the BLI is in the AIL, then it is still dirty and in use */
	if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
		ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
		return;
	}

	/*
	 * In shutdown conditions, we can be asked to free a dirty BLI that
	 * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
	 * instead of inserting it into the AIL at checkpoint IO completion. If
	 * there's another bli reference (e.g. a btree cursor holds a clean
	 * reference) and it is released via xfs_trans_brelse(), we can get here
	 * with that aborted, dirty BLI. In this case, it is safe to free the
	 * dirty BLI immediately, as it is not in the AIL and there are no
	 * other references to it.
	 *
	 * We should never get here with a stale BLI via that path as
	 * xfs_trans_brelse() specifically holds onto stale buffers rather than
	 * releasing them.
	 */
	ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
	       test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
	xfs_buf_item_relse(bip);
}

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 *
 * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
 * perform a completion abort of any objects attached to the buffer for IO
 * tracking purposes. This generally only happens in shutdown situations,
 * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
 * completion processing. However, because transaction completion can race with
 * checkpoint completion during a shutdown, this release context may end up
 * being the last active reference to the BLI and so needs to perform this
 * cleanup.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	bool	hold = bip->bli_flags & XFS_BLI_HOLD;
	bool	stale = bip->bli_flags & XFS_BLI_STALE;
	bool	aborted = test_bit(XFS_LI_ABORTED,
				   &lip->li_flags);
	bool	dirty = bip->bli_flags & XFS_BLI_DIRTY;
#if defined(DEBUG) || defined(XFS_WARN)
	bool	ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif

	trace_xfs_buf_item_release(bip);

	ASSERT(xfs_buf_islocked(bp));

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/* If there are other references, then we have nothing to do. */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		goto out_release;

	/*
	 * Stale buffer completion frees the BLI, unlocks and releases the
	 * buffer. Neither the BLI nor the buffer is safe to reference after
	 * this call, so there's nothing more we need to do here.
	 *
	 * If we get here with a stale buffer and references to the BLI remain,
	 * we must not unlock the buffer as the last BLI reference owns lock
	 * context, not us.
	 */
	if (stale) {
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		ASSERT(!hold);
		return;
	}

	/*
	 * Dirty or clean, aborted items are done and need to be removed from
	 * the AIL and released. This frees the BLI, but leaves the buffer
	 * locked and referenced.
	 */
	if (aborted || xlog_is_shutdown(lip->li_log)) {
		ASSERT(list_empty(&bip->bli_buf->b_li_list));
		xfs_buf_item_done(bp);
		goto out_release;
	}

	/*
	 * Clean, unreferenced BLIs can be immediately freed, leaving the buffer
	 * locked and referenced.
	 *
	 * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
	 */
	if (!dirty)
		xfs_buf_item_relse(bip);
	else
		ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));

	/* Not safe to reference the BLI from here */
out_release:
	/*
	 * If we get here with a stale buffer, we must not unlock the
	 * buffer as the last BLI reference owns lock context, not us.
	 */
	if (stale || hold)
		return;
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t	seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters. For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery. If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log. We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t	lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf	*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int	chunks;
	int	map_size;
	int	i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
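	 *
	 * As an illustrative example (assuming the usual 128 byte
	 * XFS_BLF_CHUNK and 32 bit bitmap words): a single-map 4096 byte
	 * buffer divides into 32 chunks, which fit in one bitmap word, so
	 * map_size is 1.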
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			xfs_buf_item_free_format(bip);
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}


/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint	first,
	uint	last,
	uint	*map)
{
	uint	first_bit;
	uint	last_bit;
	uint	bits_to_set;
	uint	bits_set;
	uint	word_num;
	uint	*wordp;
	uint	bit;
	uint	end_bit;
	uint	mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here. That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far. End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint	first,
	uint	last)
{
	int	i;
	uint	start;
	uint	end;
	struct xfs_buf	*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
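	 *
	 * For example (illustrative): for a two-map buffer with 4096 bytes
	 * per map, logging bytes 4000-4200 dirties bytes 4000-4095 of the
	 * first segment and bytes 0-104 of the second, each passed to
	 * xfs_buf_item_log_segment() as segment-relative offsets.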
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int	i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

void
xfs_buf_item_done(
	struct xfs_buf	*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp->b_log_item);
}