Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v2.6.39-rc2 (1068 lines, 28 kB)
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade-off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * Check for a stale RCU-freed inode.
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway. If it has been reallocated and is still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return ENOENT;

	if (is_bad_inode(inode)) {
		IRELE(ip);
		return ENOENT;
	}

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return ENOENT;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong
			 * AG due to RCU freeing and reallocation, only update
			 * the index if it lies in this AG. It was a race that
			 * led us to see this inode, so another lookup from
			 * the same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = execute(batch[i], pag, flags);
			IRELE(batch[i]);
			if (error == EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == EFSCORRUPTED)
			break;

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}
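/*
 * A typical caller hands xfs_inode_ag_iterator() a per-inode callback and
 * the sync flags it wants applied, e.g. (as xfs_sync_data() does below):
 *
 *	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
 *
 * The iterator walks every cached inode in every AG and leaves the
 * per-inode policy entirely to the callback.
 */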
STATIC int
xfs_sync_inode_data(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	struct address_space	*mapping = inode->i_mapping;
	int			error = 0;

	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		goto out_wait;

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
		if (flags & SYNC_TRYLOCK)
			goto out_wait;
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}

	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
				0 : XBF_ASYNC, FI_NONE);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

 out_wait:
	if (flags & SYNC_WAIT)
		xfs_ioend_wait(ip);
	return error;
}

STATIC int
xfs_sync_inode_attr(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
{
	int			error = 0;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_inode_clean(ip))
		goto out_unlock;
	if (!xfs_iflock_nowait(ip)) {
		if (!(flags & SYNC_WAIT))
			goto out_unlock;
		xfs_iflock(ip);
	}

	if (xfs_inode_clean(ip)) {
		xfs_ifunlock(ip);
		goto out_unlock;
	}

	error = xfs_iflush(ip, flags);

 out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

/*
 * Write out pagecache data for the whole filesystem.
 */
STATIC int
xfs_sync_data(
	struct xfs_mount	*mp,
	int			flags)
{
	int			error;

	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);

	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
	if (error)
		return XFS_ERROR(error);

	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
	return 0;
}

/*
 * Write out inode metadata (attributes) for the whole filesystem.
 */
STATIC int
xfs_sync_attr(
	struct xfs_mount	*mp,
	int			flags)
{
	ASSERT((flags & ~SYNC_WAIT) == 0);

	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
}

STATIC int
xfs_sync_fsdata(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*bp;

	/*
	 * If the buffer is pinned then push on the log so we won't get stuck
	 * waiting in the write for someone, maybe ourselves, to flush the log.
	 *
	 * Even though we just pushed the log above, we did not have the
	 * superblock buffer locked at that point so it can become pinned in
	 * between there and here.
	 */
	bp = xfs_getsb(mp, 0);
	if (XFS_BUF_ISPINNED(bp))
		xfs_log_force(mp, 0);

	return xfs_bwrite(mp, bp);
}
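/*
 * The SYNC_* flags used throughout this file control blocking behaviour:
 * SYNC_TRYLOCK means skip an inode rather than block on a contended lock,
 * and SYNC_WAIT means wait for the writeback or flush to complete before
 * returning. Callers combine them to get a non-blocking pre-pass followed
 * by a blocking pass, as xfs_quiesce_data() does below.
 */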
/*
 * When remounting a filesystem read-only or freezing the filesystem, we have
 * two phases to execute. The first phase is syncing the data before we
 * quiesce the filesystem, and the second is flushing all the inodes out after
 * we've waited for all the transactions created by the first phase to
 * complete. The second phase ensures that the inodes are written to their
 * location on disk rather than just existing in transactions in the log. This
 * means after a quiesce there is no log replay required to write the inodes
 * to disk (this is the main difference between a sync and a quiesce).
 */
/*
 * First stage of freeze - no writers will make progress now we are here,
 * so we flush delwri and delalloc buffers here, then wait for all I/O to
 * complete. Data is frozen at that point. Metadata is not frozen,
 * transactions can still occur here so don't bother flushing the buftarg
 * because it'll just get dirty again.
 */
int
xfs_quiesce_data(
	struct xfs_mount	*mp)
{
	int			error, error2 = 0;

	/* push non-blocking */
	xfs_sync_data(mp, 0);
	xfs_qm_sync(mp, SYNC_TRYLOCK);

	/* push and block till complete */
	xfs_sync_data(mp, SYNC_WAIT);
	xfs_qm_sync(mp, SYNC_WAIT);

	/* write superblock and hoover up shutdown errors */
	error = xfs_sync_fsdata(mp);

	/* make sure all delwri buffers are written out */
	xfs_flush_buftarg(mp->m_ddev_targp, 1);

	/* mark the log as covered if needed */
	if (xfs_log_need_covered(mp))
		error2 = xfs_fs_log_dummy(mp);

	/* flush data-only devices */
	if (mp->m_rtdev_targp)
		XFS_bflush(mp->m_rtdev_targp);

	return error ? error : error2;
}

STATIC void
xfs_quiesce_fs(
	struct xfs_mount	*mp)
{
	int	count = 0, pincount;

	xfs_reclaim_inodes(mp, 0);
	xfs_flush_buftarg(mp->m_ddev_targp, 0);

	/*
	 * This loop must run at least twice. The first instance of the loop
	 * will flush most metadata, but that will generate more metadata
	 * (typically directory updates), which then must be flushed and
	 * logged before we can write the unmount record. We also do sync
	 * reclaim of inodes to catch any that the above delwri flush skipped.
	 */
	do {
		xfs_reclaim_inodes(mp, SYNC_WAIT);
		xfs_sync_attr(mp, SYNC_WAIT);
		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
		if (!pincount) {
			delay(50);
			count++;
		}
	} while (count < 2);
}

/*
 * Second stage of a quiesce. The data is already synced, now we have to take
 * care of the metadata. New transactions are already blocked, so we need to
 * wait for any remaining transactions to drain out before proceeding.
 */
void
xfs_quiesce_attr(
	struct xfs_mount	*mp)
{
	int	error = 0;

	/* wait for all modifications to complete */
	while (atomic_read(&mp->m_active_trans) > 0)
		delay(100);

	/* flush inodes and push all remaining buffers out to disk */
	xfs_quiesce_fs(mp);

	/*
	 * Just warn here till VFS can correctly support
	 * read-only remount without racing.
	 */
	WARN_ON(atomic_read(&mp->m_active_trans) != 0);

	/* Push the superblock and write an unmount record */
	error = xfs_log_sbcount(mp, 1);
	if (error)
		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
				"Frozen image may not be consistent.");
	xfs_log_unmount_write(mp);
	xfs_unmountfs_writesb(mp);
}

/*
 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
 * Doing this has two advantages:
 * - It saves on stack space, which is tight in certain situations
 * - It can be used (with care) as a mechanism to avoid deadlocks.
 * Flushing while allocating in a full filesystem requires both.
 */
STATIC void
xfs_syncd_queue_work(
	struct xfs_mount *mp,
	void		*data,
	void		(*syncer)(struct xfs_mount *, void *),
	struct completion *completion)
{
	struct xfs_sync_work *work;

	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
	INIT_LIST_HEAD(&work->w_list);
	work->w_syncer = syncer;
	work->w_data = data;
	work->w_mount = mp;
	work->w_completion = completion;
	spin_lock(&mp->m_sync_lock);
	list_add_tail(&work->w_list, &mp->m_sync_list);
	spin_unlock(&mp->m_sync_lock);
	wake_up_process(mp->m_sync_task);
}
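/*
 * Callers that need to wait for the queued work pass a completion and block
 * on it, e.g. (as xfs_flush_inodes() does below; my_syncer and data stand in
 * for whatever callback and argument the caller supplies):
 *
 *	DECLARE_COMPLETION_ONSTACK(completion);
 *
 *	xfs_syncd_queue_work(mp, data, my_syncer, &completion);
 *	wait_for_completion(&completion);
 *
 * Passing a NULL completion makes the work fire-and-forget.
 */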
/*
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations. At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room...
 */
STATIC void
xfs_flush_inodes_work(
	struct xfs_mount *mp,
	void		*arg)
{
	struct inode	*inode = arg;
	xfs_sync_data(mp, SYNC_TRYLOCK);
	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
	iput(inode);
}

void
xfs_flush_inodes(
	xfs_inode_t	*ip)
{
	struct inode	*inode = VFS_I(ip);
	DECLARE_COMPLETION_ONSTACK(completion);

	igrab(inode);
	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
	wait_for_completion(&completion);
	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
}

/*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas. We might need to cover the log to indicate that the
 * filesystem is idle and not frozen.
 */
STATIC void
xfs_sync_worker(
	struct xfs_mount *mp,
	void		*unused)
{
	int		error;

	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
		/* dgc: errors ignored here */
		if (mp->m_super->s_frozen == SB_UNFROZEN &&
		    xfs_log_need_covered(mp))
			error = xfs_fs_log_dummy(mp);
		else
			xfs_log_force(mp, 0);
		xfs_reclaim_inodes(mp, 0);
		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
	}
	mp->m_sync_seq++;
	wake_up(&mp->m_wait_single_sync_task);
}

STATIC int
xfssyncd(
	void			*arg)
{
	struct xfs_mount	*mp = arg;
	long			timeleft;
	xfs_sync_work_t		*work, *n;
	LIST_HEAD		(tmp);

	set_freezable();
	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
	for (;;) {
		if (list_empty(&mp->m_sync_list))
			timeleft = schedule_timeout_interruptible(timeleft);
		/* swsusp */
		try_to_freeze();
		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
			break;

		spin_lock(&mp->m_sync_lock);
		/*
		 * We can get woken by laptop mode, to do a sync -
		 * that's the (only!) case where the list would be
		 * empty with time remaining.
		 */
542 */ 543 if (!timeleft || list_empty(&mp->m_sync_list)) { 544 if (!timeleft) 545 timeleft = xfs_syncd_centisecs * 546 msecs_to_jiffies(10); 547 INIT_LIST_HEAD(&mp->m_sync_work.w_list); 548 list_add_tail(&mp->m_sync_work.w_list, 549 &mp->m_sync_list); 550 } 551 list_splice_init(&mp->m_sync_list, &tmp); 552 spin_unlock(&mp->m_sync_lock); 553 554 list_for_each_entry_safe(work, n, &tmp, w_list) { 555 (*work->w_syncer)(mp, work->w_data); 556 list_del(&work->w_list); 557 if (work == &mp->m_sync_work) 558 continue; 559 if (work->w_completion) 560 complete(work->w_completion); 561 kmem_free(work); 562 } 563 } 564 565 return 0; 566} 567 568int 569xfs_syncd_init( 570 struct xfs_mount *mp) 571{ 572 mp->m_sync_work.w_syncer = xfs_sync_worker; 573 mp->m_sync_work.w_mount = mp; 574 mp->m_sync_work.w_completion = NULL; 575 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 576 if (IS_ERR(mp->m_sync_task)) 577 return -PTR_ERR(mp->m_sync_task); 578 return 0; 579} 580 581void 582xfs_syncd_stop( 583 struct xfs_mount *mp) 584{ 585 kthread_stop(mp->m_sync_task); 586} 587 588void 589__xfs_inode_set_reclaim_tag( 590 struct xfs_perag *pag, 591 struct xfs_inode *ip) 592{ 593 radix_tree_tag_set(&pag->pag_ici_root, 594 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 595 XFS_ICI_RECLAIM_TAG); 596 597 if (!pag->pag_ici_reclaimable) { 598 /* propagate the reclaim tag up into the perag radix tree */ 599 spin_lock(&ip->i_mount->m_perag_lock); 600 radix_tree_tag_set(&ip->i_mount->m_perag_tree, 601 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 602 XFS_ICI_RECLAIM_TAG); 603 spin_unlock(&ip->i_mount->m_perag_lock); 604 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 605 -1, _RET_IP_); 606 } 607 pag->pag_ici_reclaimable++; 608} 609 610/* 611 * We set the inode flag atomically with the radix tree tag. 612 * Once we get tag lookups on the radix tree, this inode flag 613 * can go away. 614 */ 615void 616xfs_inode_set_reclaim_tag( 617 xfs_inode_t *ip) 618{ 619 struct xfs_mount *mp = ip->i_mount; 620 struct xfs_perag *pag; 621 622 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 spin_lock(&pag->pag_ici_lock); 624 spin_lock(&ip->i_flags_lock); 625 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&pag->pag_ici_lock); 629 xfs_perag_put(pag); 630} 631 632STATIC void 633__xfs_inode_clear_reclaim( 634 xfs_perag_t *pag, 635 xfs_inode_t *ip) 636{ 637 pag->pag_ici_reclaimable--; 638 if (!pag->pag_ici_reclaimable) { 639 /* clear the reclaim tag from the perag radix tree */ 640 spin_lock(&ip->i_mount->m_perag_lock); 641 radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 642 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 643 XFS_ICI_RECLAIM_TAG); 644 spin_unlock(&ip->i_mount->m_perag_lock); 645 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, 646 -1, _RET_IP_); 647 } 648} 649 650void 651__xfs_inode_clear_reclaim_tag( 652 xfs_mount_t *mp, 653 xfs_perag_t *pag, 654 xfs_inode_t *ip) 655{ 656 radix_tree_tag_clear(&pag->pag_ici_root, 657 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 658 __xfs_inode_clear_reclaim(pag, ip); 659} 660 661/* 662 * Grab the inode for reclaim exclusively. 663 * Return 0 if we grabbed it, non-zero otherwise. 
664 */ 665STATIC int 666xfs_reclaim_inode_grab( 667 struct xfs_inode *ip, 668 int flags) 669{ 670 ASSERT(rcu_read_lock_held()); 671 672 /* quick check for stale RCU freed inode */ 673 if (!ip->i_ino) 674 return 1; 675 676 /* 677 * do some unlocked checks first to avoid unnecessary lock traffic. 678 * The first is a flush lock check, the second is a already in reclaim 679 * check. Only do these checks if we are not going to block on locks. 680 */ 681 if ((flags & SYNC_TRYLOCK) && 682 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { 683 return 1; 684 } 685 686 /* 687 * The radix tree lock here protects a thread in xfs_iget from racing 688 * with us starting reclaim on the inode. Once we have the 689 * XFS_IRECLAIM flag set it will not touch us. 690 * 691 * Due to RCU lookup, we may find inodes that have been freed and only 692 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 693 * aren't candidates for reclaim at all, so we must check the 694 * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 695 */ 696 spin_lock(&ip->i_flags_lock); 697 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 698 __xfs_iflags_test(ip, XFS_IRECLAIM)) { 699 /* not a reclaim candidate. */ 700 spin_unlock(&ip->i_flags_lock); 701 return 1; 702 } 703 __xfs_iflags_set(ip, XFS_IRECLAIM); 704 spin_unlock(&ip->i_flags_lock); 705 return 0; 706} 707 708/* 709 * Inodes in different states need to be treated differently, and the return 710 * value of xfs_iflush is not sufficient to get this right. The following table 711 * lists the inode states and the reclaim actions necessary for non-blocking 712 * reclaim: 713 * 714 * 715 * inode state iflush ret required action 716 * --------------- ---------- --------------- 717 * bad - reclaim 718 * shutdown EIO unpin and reclaim 719 * clean, unpinned 0 reclaim 720 * stale, unpinned 0 reclaim 721 * clean, pinned(*) 0 requeue 722 * stale, pinned EAGAIN requeue 723 * dirty, delwri ok 0 requeue 724 * dirty, delwri blocked EAGAIN requeue 725 * dirty, sync flush 0 reclaim 726 * 727 * (*) dgc: I don't think the clean, pinned state is possible but it gets 728 * handled anyway given the order of checks implemented. 729 * 730 * As can be seen from the table, the return value of xfs_iflush() is not 731 * sufficient to correctly decide the reclaim action here. The checks in 732 * xfs_iflush() might look like duplicates, but they are not. 733 * 734 * Also, because we get the flush lock first, we know that any inode that has 735 * been flushed delwri has had the flush completed by the time we check that 736 * the inode is clean. The clean inode check needs to be done before flushing 737 * the inode delwri otherwise we would loop forever requeuing clean inodes as 738 * we cannot tell apart a successful delwri flush and a clean inode from the 739 * return value of xfs_iflush(). 740 * 741 * Note that because the inode is flushed delayed write by background 742 * writeback, the flush lock may already be held here and waiting on it can 743 * result in very long latencies. Hence for sync reclaims, where we wait on the 744 * flush lock, the caller should push out delayed write inodes first before 745 * trying to reclaim them to minimise the amount of time spent waiting. For 746 * background relaim, we just requeue the inode for the next pass. 
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, delwri	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, delwri	=> flush and requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	int	error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT)) {
			xfs_ifunlock(ip);
			goto out;
		}
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
	 * reclaim as we can deadlock with inode cluster removal.
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here. As a result,
	 * doing a blocking xfs_itobp() to get the cluster buffer will result
	 * in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
	 * just unlock the inode, back off and try again. Hopefully the next
	 * pass through will see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
	if (sync_mode & SYNC_WAIT) {
		if (error == EAGAIN) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			/* backoff longer than in xfs_ifree_cluster */
			delay(2);
			goto restart;
		}
		xfs_iflock(ip);
		goto reclaim;
	}

	/*
	 * When we have to flush an inode but don't have SYNC_WAIT set, we
	 * flush the inode out using a delwri buffer and wait for the next
	 * call into reclaim to find it in a clean state instead of waiting for
	 * it now. We also don't return errors here - if the error is transient
	 * then the next reclaim pass will flush the inode, and if the error
	 * is permanent then the next sync reclaim will reclaim the inode and
	 * pass on the error.
	 */
	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_warn(ip->i_mount,
			"inode 0x%llx background reclaim flush failed with %d",
			(long long)ip->i_ino, error);
	}
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and xfssyncd never goes back to the idle
	 * state. Instead, return 0 to let the next scheduled background reclaim
	 * attempt to reclaim the inode again.
	 */
	return 0;

reclaim:
	xfs_ifunlock(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	__xfs_inode_clear_reclaim(pag, ip);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it. We get
	 * both the ilock and the iolock because the code may need to drop the
	 * ilock one but will still hold the iolock.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	xfs_inode_free(ip);
	return error;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during the filesystem unmount reclaim walk will leak all
 * the unreclaimed inodes.
 */
int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * If we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (trylock && skipped && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return XFS_ERROR(last_error);
}

int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Shrinker infrastructure.
 */
static int
xfs_reclaim_inode_shrink(
	struct shrinker	*shrink,
	int		nr_to_scan,
	gfp_t		gfp_mask)
{
	struct xfs_mount *mp;
	struct xfs_perag *pag;
	xfs_agnumber_t	ag;
	int		reclaimable;

	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;

		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
		/* terminate if we don't exhaust the scan */
		if (nr_to_scan > 0)
			return -1;
	}

	reclaimable = 0;
	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

void
xfs_inode_shrinker_register(
	struct xfs_mount	*mp)
{
	mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
	mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
	register_shrinker(&mp->m_inode_shrink);
}

void
xfs_inode_shrinker_unregister(
	struct xfs_mount	*mp)
{
	unregister_shrinker(&mp->m_inode_shrink);
}
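/*
 * Note on the shrinker contract (as of this kernel): the VM calls the
 * ->shrink callback with nr_to_scan == 0 to ask how many objects are
 * reclaimable, and with a non-zero count to ask for that much reclaim.
 * Returning -1 tells the VM to back off, which we do when the allocation
 * context cannot recurse into the filesystem (!__GFP_FS) or when the scan
 * was cut short.
 */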