Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.0-rc2, 1084 lines, 29 kB
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway.  If it has been reallocated and is still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return ENOENT;

	if (is_bad_inode(inode)) {
		IRELE(ip);
		return ENOENT;
	}

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return ENOENT;
}

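/*
 * Walk all in-memory inodes in a single AG in XFS_LOOKUP_BATCH-sized chunks
 * and call @execute on every inode we can grab. An EAGAIN return from
 * @execute marks the inode as skipped and the whole walk is restarted once
 * the current pass finishes; other errors are remembered and returned.
 */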
STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = execute(batch[i], pag, flags);
			IRELE(batch[i]);
			if (error == EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == EFSCORRUPTED)
			break;

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}

STATIC int
xfs_sync_inode_data(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	struct address_space	*mapping = inode->i_mapping;
	int			error = 0;

	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		goto out_wait;

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
		if (flags & SYNC_TRYLOCK)
			goto out_wait;
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}

	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
				0 : XBF_ASYNC, FI_NONE);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

 out_wait:
	if (flags & SYNC_WAIT)
		xfs_ioend_wait(ip);
	return error;
}

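/*
 * Write back the dirty inode core. Without SYNC_WAIT this is best effort:
 * if the inode is already locked for flushing, or the flush cannot be
 * started immediately, we simply skip it and let a later pass catch it.
 */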
STATIC int
xfs_sync_inode_attr(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
{
	int			error = 0;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_inode_clean(ip))
		goto out_unlock;
	if (!xfs_iflock_nowait(ip)) {
		if (!(flags & SYNC_WAIT))
			goto out_unlock;
		xfs_iflock(ip);
	}

	if (xfs_inode_clean(ip)) {
		xfs_ifunlock(ip);
		goto out_unlock;
	}

	error = xfs_iflush(ip, flags);

	/*
	 * We don't want to try again on non-blocking flushes that can't run
	 * again immediately. If an inode really must be written, then that's
	 * what the SYNC_WAIT flag is for.
	 */
	if (error == EAGAIN) {
		ASSERT(!(flags & SYNC_WAIT));
		error = 0;
	}

 out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

/*
 * Write out pagecache data for the whole filesystem.
 */
STATIC int
xfs_sync_data(
	struct xfs_mount	*mp,
	int			flags)
{
	int			error;

	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);

	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
	if (error)
		return XFS_ERROR(error);

	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
	return 0;
}

/*
 * Write out inode metadata (attributes) for the whole filesystem.
 */
STATIC int
xfs_sync_attr(
	struct xfs_mount	*mp,
	int			flags)
{
	ASSERT((flags & ~SYNC_WAIT) == 0);

	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
}

STATIC int
xfs_sync_fsdata(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*bp;

	/*
	 * If the buffer is pinned then push on the log so we won't get stuck
	 * waiting in the write for someone, maybe ourselves, to flush the log.
	 *
	 * Even though we just pushed the log above, we did not have the
	 * superblock buffer locked at that point so it can become pinned in
	 * between there and here.
	 */
	bp = xfs_getsb(mp, 0);
	if (XFS_BUF_ISPINNED(bp))
		xfs_log_force(mp, 0);

	return xfs_bwrite(mp, bp);
}

/*
 * When remounting a filesystem read-only or freezing the filesystem, we have
 * two phases to execute. This first phase is syncing the data before we
 * quiesce the filesystem, and the second is flushing all the inodes out after
 * we've waited for all the transactions created by the first phase to
 * complete. The second phase ensures that the inodes are written to their
 * location on disk rather than just existing in transactions in the log. This
 * means after a quiesce there is no log replay required to write the inodes to
 * disk (this is the main difference between a sync and a quiesce).
 */
/*
 * First stage of freeze - no writers will make progress now we are here,
 * so we flush delwri and delalloc buffers here, then wait for all I/O to
 * complete.  Data is frozen at that point. Metadata is not frozen,
 * transactions can still occur here so don't bother flushing the buftarg
 * because it'll just get dirty again.
 */
int
xfs_quiesce_data(
	struct xfs_mount	*mp)
{
	int			error, error2 = 0;

	/* push non-blocking */
	xfs_sync_data(mp, 0);
	xfs_qm_sync(mp, SYNC_TRYLOCK);

	/* push and block till complete */
	xfs_sync_data(mp, SYNC_WAIT);
	xfs_qm_sync(mp, SYNC_WAIT);

	/* write superblock and hoover up shutdown errors */
	error = xfs_sync_fsdata(mp);

	/* make sure all delwri buffers are written out */
	xfs_flush_buftarg(mp->m_ddev_targp, 1);

	/* mark the log as covered if needed */
	if (xfs_log_need_covered(mp))
		error2 = xfs_fs_log_dummy(mp);

	/* flush data-only devices */
	if (mp->m_rtdev_targp)
		XFS_bflush(mp->m_rtdev_targp);

	return error ? error : error2;
}

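/*
 * Reclaim and flush everything that is still dirty in memory before the
 * caller writes the unmount record. Flushing metadata typically dirties
 * more metadata (directory updates and the like), which is why the loop
 * below must run at least twice.
 */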
STATIC void
xfs_quiesce_fs(
	struct xfs_mount	*mp)
{
	int	count = 0, pincount;

	xfs_reclaim_inodes(mp, 0);
	xfs_flush_buftarg(mp->m_ddev_targp, 0);

	/*
	 * This loop must run at least twice. The first instance of the loop
	 * will flush most meta data but that will generate more meta data
	 * (typically directory updates), which then must be flushed and
	 * logged before we can write the unmount record. We also do sync
	 * reclaim of inodes to catch any that the above delwri flush skipped.
	 */
	do {
		xfs_reclaim_inodes(mp, SYNC_WAIT);
		xfs_sync_attr(mp, SYNC_WAIT);
		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
		if (!pincount) {
			delay(50);
			count++;
		}
	} while (count < 2);
}

/*
 * Second stage of a quiesce. The data is already synced, now we have to take
 * care of the metadata. New transactions are already blocked, so we need to
 * wait for any remaining transactions to drain out before proceeding.
 */
void
xfs_quiesce_attr(
	struct xfs_mount	*mp)
{
	int	error = 0;

	/* wait for all modifications to complete */
	while (atomic_read(&mp->m_active_trans) > 0)
		delay(100);

	/* flush inodes and push all remaining buffers out to disk */
	xfs_quiesce_fs(mp);

	/*
	 * Just warn here till VFS can correctly support
	 * read-only remount without racing.
	 */
	WARN_ON(atomic_read(&mp->m_active_trans) != 0);

	/* Push the superblock and write an unmount record */
	error = xfs_log_sbcount(mp, 1);
	if (error)
		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
				"Frozen image may not be consistent.");
	xfs_log_unmount_write(mp);
	xfs_unmountfs_writesb(mp);
}

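/*
 * Schedule the next run of the periodic sync worker. The interval is the
 * xfs_syncd_centisecs tunable, converted to jiffies.
 */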
static void
xfs_syncd_queue_sync(
	struct xfs_mount	*mp)
{
	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
				msecs_to_jiffies(xfs_syncd_centisecs * 10));
}

/*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
 * filesystem is idle and not frozen.
 */
STATIC void
xfs_sync_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_sync_work);
	int		error;

	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
		/* dgc: errors ignored here */
		if (mp->m_super->s_frozen == SB_UNFROZEN &&
		    xfs_log_need_covered(mp))
			error = xfs_fs_log_dummy(mp);
		else
			xfs_log_force(mp, 0);
		error = xfs_qm_sync(mp, SYNC_TRYLOCK);

		/* start pushing all the metadata that is currently dirty */
		xfs_ail_push_all(mp->m_ail);
	}

	/* queue us up again */
	xfs_syncd_queue_sync(mp);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs syncd work default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_syncd_queue_reclaim(
	struct xfs_mount	*mp)
{

	/*
	 * We can have inodes enter reclaim after we've shut down the syncd
	 * workqueue during unmount, so don't allow reclaim work to be queued
	 * during unmount.
	 */
	if (!(mp->m_super->s_flags & MS_ACTIVE))
		return;

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
STATIC void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_syncd_queue_reclaim(mp);
}

/*
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room.
 *
 * Queue a new data flush if there isn't one already in progress and
 * wait for completion of the flush. This means that we only ever have one
 * inode flush in progress no matter how many ENOSPC events are occurring and
 * so will prevent the system from bogging down due to every concurrent
 * ENOSPC event scanning all the active inodes in the system for writeback.
 */
void
xfs_flush_inodes(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	queue_work(xfs_syncd_wq, &mp->m_flush_work);
	flush_work_sync(&mp->m_flush_work);
}

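/*
 * Worker for the ENOSPC data flush above: a non-blocking pass over all dirty
 * data first, then a blocking pass to wait for the writeback to complete.
 */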
STATIC void
xfs_flush_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(work,
					struct xfs_mount, m_flush_work);

	xfs_sync_data(mp, SYNC_TRYLOCK);
	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
}

int
xfs_syncd_init(
	struct xfs_mount	*mp)
{
	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);

	xfs_syncd_queue_sync(mp);
	xfs_syncd_queue_reclaim(mp);

	return 0;
}

void
xfs_syncd_stop(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_sync_work);
	cancel_delayed_work_sync(&mp->m_reclaim_work);
	cancel_work_sync(&mp->m_flush_work);
}

void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);

	if (!pag->pag_ici_reclaimable) {
		/* propagate the reclaim tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* schedule periodic background inode reclaim */
		xfs_syncd_queue_reclaim(ip->i_mount);

		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
	pag->pag_ici_reclaimable++;
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	xfs_inode_t	*ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	__xfs_inode_set_reclaim_tag(pag, ip);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
__xfs_inode_clear_reclaim(
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	pag->pag_ici_reclaimable--;
	if (!pag->pag_ici_reclaimable) {
		/* clear the reclaim tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);
		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
}

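/*
 * Clear the reclaim tag for an inode in the per-AG radix tree and drop the
 * per-AG reclaimable count, clearing the perag tree tag when the count
 * reaches zero.
 */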
void
__xfs_inode_clear_reclaim_tag(
	xfs_mount_t	*mp,
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
	__xfs_inode_clear_reclaim(pag, ip);
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * do some unlocked checks first to avoid unnecessary lock traffic.
	 * The first is a flush lock check, the second is an already-in-reclaim
	 * check. Only do these checks if we are not going to block on locks.
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
		return 1;
	}

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check that
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently, and the return
 * value of xfs_iflush is not sufficient to get this right. The following table
 * lists the inode states and the reclaim actions necessary for non-blocking
 * reclaim:
 *
 *
 *	inode state		iflush ret	required action
 *	---------------		----------	---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, delwri ok	0		requeue
 *	dirty, delwri blocked	EAGAIN		requeue
 *	dirty, sync flush	0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * As can be seen from the table, the return value of xfs_iflush() is not
 * sufficient to correctly decide the reclaim action here. The checks in
 * xfs_iflush() might look like duplicates, but they are not.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean. The clean inode check needs to be done before flushing
 * the inode delwri otherwise we would loop forever requeuing clean inodes as
 * we cannot tell apart a successful delwri flush and a clean inode from the
 * return value of xfs_iflush().
 *
 * Note that because the inode is flushed delayed write by background
 * writeback, the flush lock may already be held here and waiting on it can
 * result in very long latencies. Hence for sync reclaims, where we wait on the
 * flush lock, the caller should push out delayed write inodes first before
 * trying to reclaim them to minimise the amount of time spent waiting. For
 * background reclaim, we just requeue the inode for the next pass.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, delwri	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, delwri	=> flush and requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	int	error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT)) {
			xfs_ifunlock(ip);
			goto out;
		}
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
	 * reclaim as we can deadlock with inode cluster removal.
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here. As a result,
	 * doing a blocking xfs_itobp() to get the cluster buffer will result
	 * in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
	 * just unlock the inode, back off and try again. Hopefully the next
	 * pass through will see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
	if (sync_mode & SYNC_WAIT) {
		if (error == EAGAIN) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			/* backoff longer than in xfs_ifree_cluster */
			delay(2);
			goto restart;
		}
		xfs_iflock(ip);
		goto reclaim;
	}

	/*
	 * When we have to flush an inode but don't have SYNC_WAIT set, we
	 * flush the inode out using a delwri buffer and wait for the next
	 * call into reclaim to find it in a clean state instead of waiting for
	 * it now. We also don't return errors here - if the error is transient
	 * then the next reclaim pass will flush the inode, and if the error
	 * is permanent then the next sync reclaim will reclaim the inode and
	 * pass on the error.
	 */
	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_warn(ip->i_mount,
			"inode 0x%llx background reclaim flush failed with %d",
			(long long)ip->i_ino, error);
	}
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and xfssyncd never goes back to the idle
	 * state. Instead, return 0 to let the next scheduled background reclaim
	 * attempt to reclaim the inode again.
	 */
	return 0;

reclaim:
	xfs_ifunlock(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode lifetime early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	__xfs_inode_clear_reclaim(pag, ip);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.  We get
	 * both the ilock and the iolock because the code may need to drop the
	 * ilock one but will still hold the iolock.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	xfs_inode_free(ip);
	return error;

}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during a filesystem unmount reclaim walk will leak all the
 * unreclaimed inodes.
 */
int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * if we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (trylock && skipped && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return XFS_ERROR(last_error);
}

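/*
 * Try to reclaim all tagged inodes in the filesystem in one pass; @mode
 * carries the SYNC_TRYLOCK/SYNC_WAIT flags passed down to the per-AG walk.
 */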
int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Inode cache shrinker.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we throttle the speed of reclaim by doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
 */
static int
xfs_reclaim_inode_shrink(
	struct shrinker	*shrink,
	struct shrink_control *sc)
{
	struct xfs_mount *mp;
	struct xfs_perag *pag;
	xfs_agnumber_t	ag;
	int		reclaimable;
	int		nr_to_scan = sc->nr_to_scan;
	gfp_t		gfp_mask = sc->gfp_mask;

	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
	if (nr_to_scan) {
		/* kick background reclaimer and push the AIL */
		xfs_syncd_queue_reclaim(mp);
		xfs_ail_push_all(mp->m_ail);

		if (!(gfp_mask & __GFP_FS))
			return -1;

		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
					&nr_to_scan);
		/* terminate if we don't exhaust the scan */
		if (nr_to_scan > 0)
			return -1;
	}

	reclaimable = 0;
	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

void
xfs_inode_shrinker_register(
	struct xfs_mount	*mp)
{
	mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
	mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
	register_shrinker(&mp->m_inode_shrink);
}

void
xfs_inode_shrinker_unregister(
	struct xfs_mount	*mp)
{
	unregister_shrinker(&mp->m_inode_shrink);
}