Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v3.2 1110 lines 30 kB view raw
1/* 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18#include "xfs.h" 19#include "xfs_fs.h" 20#include "xfs_types.h" 21#include "xfs_bit.h" 22#include "xfs_log.h" 23#include "xfs_inum.h" 24#include "xfs_trans.h" 25#include "xfs_trans_priv.h" 26#include "xfs_sb.h" 27#include "xfs_ag.h" 28#include "xfs_mount.h" 29#include "xfs_bmap_btree.h" 30#include "xfs_inode.h" 31#include "xfs_dinode.h" 32#include "xfs_error.h" 33#include "xfs_filestream.h" 34#include "xfs_vnodeops.h" 35#include "xfs_inode_item.h" 36#include "xfs_quota.h" 37#include "xfs_trace.h" 38#include "xfs_fsops.h" 39 40#include <linux/kthread.h> 41#include <linux/freezer.h> 42 43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 44 45/* 46 * The inode lookup is done in batches to keep the amount of lock traffic and 47 * radix tree lookups to a minimum. The batch size is a trade off between 48 * lookup reduction and stack usage. This is in the reclaim path, so we can't 49 * be too greedy. 50 */ 51#define XFS_LOOKUP_BATCH 32 52 53STATIC int 54xfs_inode_ag_walk_grab( 55 struct xfs_inode *ip) 56{ 57 struct inode *inode = VFS_I(ip); 58 59 ASSERT(rcu_read_lock_held()); 60 61 /* 62 * check for stale RCU freed inode 63 * 64 * If the inode has been reallocated, it doesn't matter if it's not in 65 * the AG we are walking - we are walking for writeback, so if it 66 * passes all the "valid inode" checks and is dirty, then we'll write 67 * it back anyway. If it has been reallocated and still being 68 * initialised, the XFS_INEW check below will catch it. 69 */ 70 spin_lock(&ip->i_flags_lock); 71 if (!ip->i_ino) 72 goto out_unlock_noent; 73 74 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ 75 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) 76 goto out_unlock_noent; 77 spin_unlock(&ip->i_flags_lock); 78 79 /* nothing to sync during shutdown */ 80 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 81 return EFSCORRUPTED; 82 83 /* If we can't grab the inode, it must on it's way to reclaim. */ 84 if (!igrab(inode)) 85 return ENOENT; 86 87 if (is_bad_inode(inode)) { 88 IRELE(ip); 89 return ENOENT; 90 } 91 92 /* inode is valid */ 93 return 0; 94 95out_unlock_noent: 96 spin_unlock(&ip->i_flags_lock); 97 return ENOENT; 98} 99 100STATIC int 101xfs_inode_ag_walk( 102 struct xfs_mount *mp, 103 struct xfs_perag *pag, 104 int (*execute)(struct xfs_inode *ip, 105 struct xfs_perag *pag, int flags), 106 int flags) 107{ 108 uint32_t first_index; 109 int last_error = 0; 110 int skipped; 111 int done; 112 int nr_found; 113 114restart: 115 done = 0; 116 skipped = 0; 117 first_index = 0; 118 nr_found = 0; 119 do { 120 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 121 int error = 0; 122 int i; 123 124 rcu_read_lock(); 125 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 126 (void **)batch, first_index, 127 XFS_LOOKUP_BATCH); 128 if (!nr_found) { 129 rcu_read_unlock(); 130 break; 131 } 132 133 /* 134 * Grab the inodes before we drop the lock. if we found 135 * nothing, nr == 0 and the loop will be skipped. 136 */ 137 for (i = 0; i < nr_found; i++) { 138 struct xfs_inode *ip = batch[i]; 139 140 if (done || xfs_inode_ag_walk_grab(ip)) 141 batch[i] = NULL; 142 143 /* 144 * Update the index for the next lookup. Catch 145 * overflows into the next AG range which can occur if 146 * we have inodes in the last block of the AG and we 147 * are currently pointing to the last inode. 148 * 149 * Because we may see inodes that are from the wrong AG 150 * due to RCU freeing and reallocation, only update the 151 * index if it lies in this AG. It was a race that lead 152 * us to see this inode, so another lookup from the 153 * same index will not find it again. 154 */ 155 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 156 continue; 157 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 158 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 159 done = 1; 160 } 161 162 /* unlock now we've grabbed the inodes. */ 163 rcu_read_unlock(); 164 165 for (i = 0; i < nr_found; i++) { 166 if (!batch[i]) 167 continue; 168 error = execute(batch[i], pag, flags); 169 IRELE(batch[i]); 170 if (error == EAGAIN) { 171 skipped++; 172 continue; 173 } 174 if (error && last_error != EFSCORRUPTED) 175 last_error = error; 176 } 177 178 /* bail out if the filesystem is corrupted. */ 179 if (error == EFSCORRUPTED) 180 break; 181 182 cond_resched(); 183 184 } while (nr_found && !done); 185 186 if (skipped) { 187 delay(1); 188 goto restart; 189 } 190 return last_error; 191} 192 193int 194xfs_inode_ag_iterator( 195 struct xfs_mount *mp, 196 int (*execute)(struct xfs_inode *ip, 197 struct xfs_perag *pag, int flags), 198 int flags) 199{ 200 struct xfs_perag *pag; 201 int error = 0; 202 int last_error = 0; 203 xfs_agnumber_t ag; 204 205 ag = 0; 206 while ((pag = xfs_perag_get(mp, ag))) { 207 ag = pag->pag_agno + 1; 208 error = xfs_inode_ag_walk(mp, pag, execute, flags); 209 xfs_perag_put(pag); 210 if (error) { 211 last_error = error; 212 if (error == EFSCORRUPTED) 213 break; 214 } 215 } 216 return XFS_ERROR(last_error); 217} 218 219STATIC int 220xfs_sync_inode_data( 221 struct xfs_inode *ip, 222 struct xfs_perag *pag, 223 int flags) 224{ 225 struct inode *inode = VFS_I(ip); 226 struct address_space *mapping = inode->i_mapping; 227 int error = 0; 228 229 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 230 return 0; 231 232 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { 233 if (flags & SYNC_TRYLOCK) 234 return 0; 235 xfs_ilock(ip, XFS_IOLOCK_SHARED); 236 } 237 238 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 239 0 : XBF_ASYNC, FI_NONE); 240 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 241 return error; 242} 243 244STATIC int 245xfs_sync_inode_attr( 246 struct xfs_inode *ip, 247 struct xfs_perag *pag, 248 int flags) 249{ 250 int error = 0; 251 252 xfs_ilock(ip, XFS_ILOCK_SHARED); 253 if (xfs_inode_clean(ip)) 254 goto out_unlock; 255 if (!xfs_iflock_nowait(ip)) { 256 if (!(flags & SYNC_WAIT)) 257 goto out_unlock; 258 xfs_iflock(ip); 259 } 260 261 if (xfs_inode_clean(ip)) { 262 xfs_ifunlock(ip); 263 goto out_unlock; 264 } 265 266 error = xfs_iflush(ip, flags); 267 268 /* 269 * We don't want to try again on non-blocking flushes that can't run 270 * again immediately. If an inode really must be written, then that's 271 * what the SYNC_WAIT flag is for. 272 */ 273 if (error == EAGAIN) { 274 ASSERT(!(flags & SYNC_WAIT)); 275 error = 0; 276 } 277 278 out_unlock: 279 xfs_iunlock(ip, XFS_ILOCK_SHARED); 280 return error; 281} 282 283/* 284 * Write out pagecache data for the whole filesystem. 285 */ 286STATIC int 287xfs_sync_data( 288 struct xfs_mount *mp, 289 int flags) 290{ 291 int error; 292 293 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 294 295 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); 296 if (error) 297 return XFS_ERROR(error); 298 299 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); 300 return 0; 301} 302 303/* 304 * Write out inode metadata (attributes) for the whole filesystem. 305 */ 306STATIC int 307xfs_sync_attr( 308 struct xfs_mount *mp, 309 int flags) 310{ 311 ASSERT((flags & ~SYNC_WAIT) == 0); 312 313 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); 314} 315 316STATIC int 317xfs_sync_fsdata( 318 struct xfs_mount *mp) 319{ 320 struct xfs_buf *bp; 321 int error; 322 323 /* 324 * If the buffer is pinned then push on the log so we won't get stuck 325 * waiting in the write for someone, maybe ourselves, to flush the log. 326 * 327 * Even though we just pushed the log above, we did not have the 328 * superblock buffer locked at that point so it can become pinned in 329 * between there and here. 330 */ 331 bp = xfs_getsb(mp, 0); 332 if (xfs_buf_ispinned(bp)) 333 xfs_log_force(mp, 0); 334 error = xfs_bwrite(bp); 335 xfs_buf_relse(bp); 336 return error; 337} 338 339int 340xfs_log_dirty_inode( 341 struct xfs_inode *ip, 342 struct xfs_perag *pag, 343 int flags) 344{ 345 struct xfs_mount *mp = ip->i_mount; 346 struct xfs_trans *tp; 347 int error; 348 349 if (!ip->i_update_core) 350 return 0; 351 352 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 353 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); 354 if (error) { 355 xfs_trans_cancel(tp, 0); 356 return error; 357 } 358 359 xfs_ilock(ip, XFS_ILOCK_EXCL); 360 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 361 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 362 return xfs_trans_commit(tp, 0); 363} 364 365/* 366 * When remounting a filesystem read-only or freezing the filesystem, we have 367 * two phases to execute. This first phase is syncing the data before we 368 * quiesce the filesystem, and the second is flushing all the inodes out after 369 * we've waited for all the transactions created by the first phase to 370 * complete. The second phase ensures that the inodes are written to their 371 * location on disk rather than just existing in transactions in the log. This 372 * means after a quiesce there is no log replay required to write the inodes to 373 * disk (this is the main difference between a sync and a quiesce). 374 */ 375/* 376 * First stage of freeze - no writers will make progress now we are here, 377 * so we flush delwri and delalloc buffers here, then wait for all I/O to 378 * complete. Data is frozen at that point. Metadata is not frozen, 379 * transactions can still occur here so don't bother flushing the buftarg 380 * because it'll just get dirty again. 381 */ 382int 383xfs_quiesce_data( 384 struct xfs_mount *mp) 385{ 386 int error, error2 = 0; 387 388 /* 389 * Log all pending size and timestamp updates. The vfs writeback 390 * code is supposed to do this, but due to its overagressive 391 * livelock detection it will skip inodes where appending writes 392 * were written out in the first non-blocking sync phase if their 393 * completion took long enough that it happened after taking the 394 * timestamp for the cut-off in the blocking phase. 395 */ 396 xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); 397 398 xfs_qm_sync(mp, SYNC_TRYLOCK); 399 xfs_qm_sync(mp, SYNC_WAIT); 400 401 /* force out the newly dirtied log buffers */ 402 xfs_log_force(mp, XFS_LOG_SYNC); 403 404 /* write superblock and hoover up shutdown errors */ 405 error = xfs_sync_fsdata(mp); 406 407 /* make sure all delwri buffers are written out */ 408 xfs_flush_buftarg(mp->m_ddev_targp, 1); 409 410 /* mark the log as covered if needed */ 411 if (xfs_log_need_covered(mp)) 412 error2 = xfs_fs_log_dummy(mp); 413 414 /* flush data-only devices */ 415 if (mp->m_rtdev_targp) 416 xfs_flush_buftarg(mp->m_rtdev_targp, 1); 417 418 return error ? error : error2; 419} 420 421STATIC void 422xfs_quiesce_fs( 423 struct xfs_mount *mp) 424{ 425 int count = 0, pincount; 426 427 xfs_reclaim_inodes(mp, 0); 428 xfs_flush_buftarg(mp->m_ddev_targp, 0); 429 430 /* 431 * This loop must run at least twice. The first instance of the loop 432 * will flush most meta data but that will generate more meta data 433 * (typically directory updates). Which then must be flushed and 434 * logged before we can write the unmount record. We also so sync 435 * reclaim of inodes to catch any that the above delwri flush skipped. 436 */ 437 do { 438 xfs_reclaim_inodes(mp, SYNC_WAIT); 439 xfs_sync_attr(mp, SYNC_WAIT); 440 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 441 if (!pincount) { 442 delay(50); 443 count++; 444 } 445 } while (count < 2); 446} 447 448/* 449 * Second stage of a quiesce. The data is already synced, now we have to take 450 * care of the metadata. New transactions are already blocked, so we need to 451 * wait for any remaining transactions to drain out before proceeding. 452 */ 453void 454xfs_quiesce_attr( 455 struct xfs_mount *mp) 456{ 457 int error = 0; 458 459 /* wait for all modifications to complete */ 460 while (atomic_read(&mp->m_active_trans) > 0) 461 delay(100); 462 463 /* flush inodes and push all remaining buffers out to disk */ 464 xfs_quiesce_fs(mp); 465 466 /* 467 * Just warn here till VFS can correctly support 468 * read-only remount without racing. 469 */ 470 WARN_ON(atomic_read(&mp->m_active_trans) != 0); 471 472 /* Push the superblock and write an unmount record */ 473 error = xfs_log_sbcount(mp); 474 if (error) 475 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " 476 "Frozen image may not be consistent."); 477 xfs_log_unmount_write(mp); 478 xfs_unmountfs_writesb(mp); 479} 480 481static void 482xfs_syncd_queue_sync( 483 struct xfs_mount *mp) 484{ 485 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, 486 msecs_to_jiffies(xfs_syncd_centisecs * 10)); 487} 488 489/* 490 * Every sync period we need to unpin all items, reclaim inodes and sync 491 * disk quotas. We might need to cover the log to indicate that the 492 * filesystem is idle and not frozen. 493 */ 494STATIC void 495xfs_sync_worker( 496 struct work_struct *work) 497{ 498 struct xfs_mount *mp = container_of(to_delayed_work(work), 499 struct xfs_mount, m_sync_work); 500 int error; 501 502 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 503 /* dgc: errors ignored here */ 504 if (mp->m_super->s_frozen == SB_UNFROZEN && 505 xfs_log_need_covered(mp)) 506 error = xfs_fs_log_dummy(mp); 507 else 508 xfs_log_force(mp, 0); 509 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 510 511 /* start pushing all the metadata that is currently dirty */ 512 xfs_ail_push_all(mp->m_ail); 513 } 514 515 /* queue us up again */ 516 xfs_syncd_queue_sync(mp); 517} 518 519/* 520 * Queue a new inode reclaim pass if there are reclaimable inodes and there 521 * isn't a reclaim pass already in progress. By default it runs every 5s based 522 * on the xfs syncd work default of 30s. Perhaps this should have it's own 523 * tunable, but that can be done if this method proves to be ineffective or too 524 * aggressive. 525 */ 526static void 527xfs_syncd_queue_reclaim( 528 struct xfs_mount *mp) 529{ 530 531 /* 532 * We can have inodes enter reclaim after we've shut down the syncd 533 * workqueue during unmount, so don't allow reclaim work to be queued 534 * during unmount. 535 */ 536 if (!(mp->m_super->s_flags & MS_ACTIVE)) 537 return; 538 539 rcu_read_lock(); 540 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 541 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 542 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 543 } 544 rcu_read_unlock(); 545} 546 547/* 548 * This is a fast pass over the inode cache to try to get reclaim moving on as 549 * many inodes as possible in a short period of time. It kicks itself every few 550 * seconds, as well as being kicked by the inode cache shrinker when memory 551 * goes low. It scans as quickly as possible avoiding locked inodes or those 552 * already being flushed, and once done schedules a future pass. 553 */ 554STATIC void 555xfs_reclaim_worker( 556 struct work_struct *work) 557{ 558 struct xfs_mount *mp = container_of(to_delayed_work(work), 559 struct xfs_mount, m_reclaim_work); 560 561 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 562 xfs_syncd_queue_reclaim(mp); 563} 564 565/* 566 * Flush delayed allocate data, attempting to free up reserved space 567 * from existing allocations. At this point a new allocation attempt 568 * has failed with ENOSPC and we are in the process of scratching our 569 * heads, looking about for more room. 570 * 571 * Queue a new data flush if there isn't one already in progress and 572 * wait for completion of the flush. This means that we only ever have one 573 * inode flush in progress no matter how many ENOSPC events are occurring and 574 * so will prevent the system from bogging down due to every concurrent 575 * ENOSPC event scanning all the active inodes in the system for writeback. 576 */ 577void 578xfs_flush_inodes( 579 struct xfs_inode *ip) 580{ 581 struct xfs_mount *mp = ip->i_mount; 582 583 queue_work(xfs_syncd_wq, &mp->m_flush_work); 584 flush_work_sync(&mp->m_flush_work); 585} 586 587STATIC void 588xfs_flush_worker( 589 struct work_struct *work) 590{ 591 struct xfs_mount *mp = container_of(work, 592 struct xfs_mount, m_flush_work); 593 594 xfs_sync_data(mp, SYNC_TRYLOCK); 595 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); 596} 597 598int 599xfs_syncd_init( 600 struct xfs_mount *mp) 601{ 602 INIT_WORK(&mp->m_flush_work, xfs_flush_worker); 603 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); 604 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); 605 606 xfs_syncd_queue_sync(mp); 607 xfs_syncd_queue_reclaim(mp); 608 609 return 0; 610} 611 612void 613xfs_syncd_stop( 614 struct xfs_mount *mp) 615{ 616 cancel_delayed_work_sync(&mp->m_sync_work); 617 cancel_delayed_work_sync(&mp->m_reclaim_work); 618 cancel_work_sync(&mp->m_flush_work); 619} 620 621void 622__xfs_inode_set_reclaim_tag( 623 struct xfs_perag *pag, 624 struct xfs_inode *ip) 625{ 626 radix_tree_tag_set(&pag->pag_ici_root, 627 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 628 XFS_ICI_RECLAIM_TAG); 629 630 if (!pag->pag_ici_reclaimable) { 631 /* propagate the reclaim tag up into the perag radix tree */ 632 spin_lock(&ip->i_mount->m_perag_lock); 633 radix_tree_tag_set(&ip->i_mount->m_perag_tree, 634 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 635 XFS_ICI_RECLAIM_TAG); 636 spin_unlock(&ip->i_mount->m_perag_lock); 637 638 /* schedule periodic background inode reclaim */ 639 xfs_syncd_queue_reclaim(ip->i_mount); 640 641 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 642 -1, _RET_IP_); 643 } 644 pag->pag_ici_reclaimable++; 645} 646 647/* 648 * We set the inode flag atomically with the radix tree tag. 649 * Once we get tag lookups on the radix tree, this inode flag 650 * can go away. 651 */ 652void 653xfs_inode_set_reclaim_tag( 654 xfs_inode_t *ip) 655{ 656 struct xfs_mount *mp = ip->i_mount; 657 struct xfs_perag *pag; 658 659 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 660 spin_lock(&pag->pag_ici_lock); 661 spin_lock(&ip->i_flags_lock); 662 __xfs_inode_set_reclaim_tag(pag, ip); 663 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 664 spin_unlock(&ip->i_flags_lock); 665 spin_unlock(&pag->pag_ici_lock); 666 xfs_perag_put(pag); 667} 668 669STATIC void 670__xfs_inode_clear_reclaim( 671 xfs_perag_t *pag, 672 xfs_inode_t *ip) 673{ 674 pag->pag_ici_reclaimable--; 675 if (!pag->pag_ici_reclaimable) { 676 /* clear the reclaim tag from the perag radix tree */ 677 spin_lock(&ip->i_mount->m_perag_lock); 678 radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 679 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 680 XFS_ICI_RECLAIM_TAG); 681 spin_unlock(&ip->i_mount->m_perag_lock); 682 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, 683 -1, _RET_IP_); 684 } 685} 686 687void 688__xfs_inode_clear_reclaim_tag( 689 xfs_mount_t *mp, 690 xfs_perag_t *pag, 691 xfs_inode_t *ip) 692{ 693 radix_tree_tag_clear(&pag->pag_ici_root, 694 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 695 __xfs_inode_clear_reclaim(pag, ip); 696} 697 698/* 699 * Grab the inode for reclaim exclusively. 700 * Return 0 if we grabbed it, non-zero otherwise. 701 */ 702STATIC int 703xfs_reclaim_inode_grab( 704 struct xfs_inode *ip, 705 int flags) 706{ 707 ASSERT(rcu_read_lock_held()); 708 709 /* quick check for stale RCU freed inode */ 710 if (!ip->i_ino) 711 return 1; 712 713 /* 714 * do some unlocked checks first to avoid unnecessary lock traffic. 715 * The first is a flush lock check, the second is a already in reclaim 716 * check. Only do these checks if we are not going to block on locks. 717 */ 718 if ((flags & SYNC_TRYLOCK) && 719 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { 720 return 1; 721 } 722 723 /* 724 * The radix tree lock here protects a thread in xfs_iget from racing 725 * with us starting reclaim on the inode. Once we have the 726 * XFS_IRECLAIM flag set it will not touch us. 727 * 728 * Due to RCU lookup, we may find inodes that have been freed and only 729 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 730 * aren't candidates for reclaim at all, so we must check the 731 * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 732 */ 733 spin_lock(&ip->i_flags_lock); 734 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 735 __xfs_iflags_test(ip, XFS_IRECLAIM)) { 736 /* not a reclaim candidate. */ 737 spin_unlock(&ip->i_flags_lock); 738 return 1; 739 } 740 __xfs_iflags_set(ip, XFS_IRECLAIM); 741 spin_unlock(&ip->i_flags_lock); 742 return 0; 743} 744 745/* 746 * Inodes in different states need to be treated differently, and the return 747 * value of xfs_iflush is not sufficient to get this right. The following table 748 * lists the inode states and the reclaim actions necessary for non-blocking 749 * reclaim: 750 * 751 * 752 * inode state iflush ret required action 753 * --------------- ---------- --------------- 754 * bad - reclaim 755 * shutdown EIO unpin and reclaim 756 * clean, unpinned 0 reclaim 757 * stale, unpinned 0 reclaim 758 * clean, pinned(*) 0 requeue 759 * stale, pinned EAGAIN requeue 760 * dirty, delwri ok 0 requeue 761 * dirty, delwri blocked EAGAIN requeue 762 * dirty, sync flush 0 reclaim 763 * 764 * (*) dgc: I don't think the clean, pinned state is possible but it gets 765 * handled anyway given the order of checks implemented. 766 * 767 * As can be seen from the table, the return value of xfs_iflush() is not 768 * sufficient to correctly decide the reclaim action here. The checks in 769 * xfs_iflush() might look like duplicates, but they are not. 770 * 771 * Also, because we get the flush lock first, we know that any inode that has 772 * been flushed delwri has had the flush completed by the time we check that 773 * the inode is clean. The clean inode check needs to be done before flushing 774 * the inode delwri otherwise we would loop forever requeuing clean inodes as 775 * we cannot tell apart a successful delwri flush and a clean inode from the 776 * return value of xfs_iflush(). 777 * 778 * Note that because the inode is flushed delayed write by background 779 * writeback, the flush lock may already be held here and waiting on it can 780 * result in very long latencies. Hence for sync reclaims, where we wait on the 781 * flush lock, the caller should push out delayed write inodes first before 782 * trying to reclaim them to minimise the amount of time spent waiting. For 783 * background relaim, we just requeue the inode for the next pass. 784 * 785 * Hence the order of actions after gaining the locks should be: 786 * bad => reclaim 787 * shutdown => unpin and reclaim 788 * pinned, delwri => requeue 789 * pinned, sync => unpin 790 * stale => reclaim 791 * clean => reclaim 792 * dirty, delwri => flush and requeue 793 * dirty, sync => flush, wait and reclaim 794 */ 795STATIC int 796xfs_reclaim_inode( 797 struct xfs_inode *ip, 798 struct xfs_perag *pag, 799 int sync_mode) 800{ 801 int error; 802 803restart: 804 error = 0; 805 xfs_ilock(ip, XFS_ILOCK_EXCL); 806 if (!xfs_iflock_nowait(ip)) { 807 if (!(sync_mode & SYNC_WAIT)) 808 goto out; 809 810 /* 811 * If we only have a single dirty inode in a cluster there is 812 * a fair chance that the AIL push may have pushed it into 813 * the buffer, but xfsbufd won't touch it until 30 seconds 814 * from now, and thus we will lock up here. 815 * 816 * Promote the inode buffer to the front of the delwri list 817 * and wake up xfsbufd now. 818 */ 819 xfs_promote_inode(ip); 820 xfs_iflock(ip); 821 } 822 823 if (is_bad_inode(VFS_I(ip))) 824 goto reclaim; 825 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 826 xfs_iunpin_wait(ip); 827 goto reclaim; 828 } 829 if (xfs_ipincount(ip)) { 830 if (!(sync_mode & SYNC_WAIT)) { 831 xfs_ifunlock(ip); 832 goto out; 833 } 834 xfs_iunpin_wait(ip); 835 } 836 if (xfs_iflags_test(ip, XFS_ISTALE)) 837 goto reclaim; 838 if (xfs_inode_clean(ip)) 839 goto reclaim; 840 841 /* 842 * Now we have an inode that needs flushing. 843 * 844 * We do a nonblocking flush here even if we are doing a SYNC_WAIT 845 * reclaim as we can deadlock with inode cluster removal. 846 * xfs_ifree_cluster() can lock the inode buffer before it locks the 847 * ip->i_lock, and we are doing the exact opposite here. As a result, 848 * doing a blocking xfs_itobp() to get the cluster buffer will result 849 * in an ABBA deadlock with xfs_ifree_cluster(). 850 * 851 * As xfs_ifree_cluser() must gather all inodes that are active in the 852 * cache to mark them stale, if we hit this case we don't actually want 853 * to do IO here - we want the inode marked stale so we can simply 854 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, 855 * just unlock the inode, back off and try again. Hopefully the next 856 * pass through will see the stale flag set on the inode. 857 */ 858 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); 859 if (sync_mode & SYNC_WAIT) { 860 if (error == EAGAIN) { 861 xfs_iunlock(ip, XFS_ILOCK_EXCL); 862 /* backoff longer than in xfs_ifree_cluster */ 863 delay(2); 864 goto restart; 865 } 866 xfs_iflock(ip); 867 goto reclaim; 868 } 869 870 /* 871 * When we have to flush an inode but don't have SYNC_WAIT set, we 872 * flush the inode out using a delwri buffer and wait for the next 873 * call into reclaim to find it in a clean state instead of waiting for 874 * it now. We also don't return errors here - if the error is transient 875 * then the next reclaim pass will flush the inode, and if the error 876 * is permanent then the next sync reclaim will reclaim the inode and 877 * pass on the error. 878 */ 879 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 880 xfs_warn(ip->i_mount, 881 "inode 0x%llx background reclaim flush failed with %d", 882 (long long)ip->i_ino, error); 883 } 884out: 885 xfs_iflags_clear(ip, XFS_IRECLAIM); 886 xfs_iunlock(ip, XFS_ILOCK_EXCL); 887 /* 888 * We could return EAGAIN here to make reclaim rescan the inode tree in 889 * a short while. However, this just burns CPU time scanning the tree 890 * waiting for IO to complete and xfssyncd never goes back to the idle 891 * state. Instead, return 0 to let the next scheduled background reclaim 892 * attempt to reclaim the inode again. 893 */ 894 return 0; 895 896reclaim: 897 xfs_ifunlock(ip); 898 xfs_iunlock(ip, XFS_ILOCK_EXCL); 899 900 XFS_STATS_INC(xs_ig_reclaims); 901 /* 902 * Remove the inode from the per-AG radix tree. 903 * 904 * Because radix_tree_delete won't complain even if the item was never 905 * added to the tree assert that it's been there before to catch 906 * problems with the inode life time early on. 907 */ 908 spin_lock(&pag->pag_ici_lock); 909 if (!radix_tree_delete(&pag->pag_ici_root, 910 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 911 ASSERT(0); 912 __xfs_inode_clear_reclaim(pag, ip); 913 spin_unlock(&pag->pag_ici_lock); 914 915 /* 916 * Here we do an (almost) spurious inode lock in order to coordinate 917 * with inode cache radix tree lookups. This is because the lookup 918 * can reference the inodes in the cache without taking references. 919 * 920 * We make that OK here by ensuring that we wait until the inode is 921 * unlocked after the lookup before we go ahead and free it. We get 922 * both the ilock and the iolock because the code may need to drop the 923 * ilock one but will still hold the iolock. 924 */ 925 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 926 xfs_qm_dqdetach(ip); 927 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 928 929 xfs_inode_free(ip); 930 return error; 931 932} 933 934/* 935 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 936 * corrupted, we still want to try to reclaim all the inodes. If we don't, 937 * then a shut down during filesystem unmount reclaim walk leak all the 938 * unreclaimed inodes. 939 */ 940int 941xfs_reclaim_inodes_ag( 942 struct xfs_mount *mp, 943 int flags, 944 int *nr_to_scan) 945{ 946 struct xfs_perag *pag; 947 int error = 0; 948 int last_error = 0; 949 xfs_agnumber_t ag; 950 int trylock = flags & SYNC_TRYLOCK; 951 int skipped; 952 953restart: 954 ag = 0; 955 skipped = 0; 956 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 957 unsigned long first_index = 0; 958 int done = 0; 959 int nr_found = 0; 960 961 ag = pag->pag_agno + 1; 962 963 if (trylock) { 964 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 965 skipped++; 966 xfs_perag_put(pag); 967 continue; 968 } 969 first_index = pag->pag_ici_reclaim_cursor; 970 } else 971 mutex_lock(&pag->pag_ici_reclaim_lock); 972 973 do { 974 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 975 int i; 976 977 rcu_read_lock(); 978 nr_found = radix_tree_gang_lookup_tag( 979 &pag->pag_ici_root, 980 (void **)batch, first_index, 981 XFS_LOOKUP_BATCH, 982 XFS_ICI_RECLAIM_TAG); 983 if (!nr_found) { 984 done = 1; 985 rcu_read_unlock(); 986 break; 987 } 988 989 /* 990 * Grab the inodes before we drop the lock. if we found 991 * nothing, nr == 0 and the loop will be skipped. 992 */ 993 for (i = 0; i < nr_found; i++) { 994 struct xfs_inode *ip = batch[i]; 995 996 if (done || xfs_reclaim_inode_grab(ip, flags)) 997 batch[i] = NULL; 998 999 /* 1000 * Update the index for the next lookup. Catch 1001 * overflows into the next AG range which can 1002 * occur if we have inodes in the last block of 1003 * the AG and we are currently pointing to the 1004 * last inode. 1005 * 1006 * Because we may see inodes that are from the 1007 * wrong AG due to RCU freeing and 1008 * reallocation, only update the index if it 1009 * lies in this AG. It was a race that lead us 1010 * to see this inode, so another lookup from 1011 * the same index will not find it again. 1012 */ 1013 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != 1014 pag->pag_agno) 1015 continue; 1016 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 1017 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 1018 done = 1; 1019 } 1020 1021 /* unlock now we've grabbed the inodes. */ 1022 rcu_read_unlock(); 1023 1024 for (i = 0; i < nr_found; i++) { 1025 if (!batch[i]) 1026 continue; 1027 error = xfs_reclaim_inode(batch[i], pag, flags); 1028 if (error && last_error != EFSCORRUPTED) 1029 last_error = error; 1030 } 1031 1032 *nr_to_scan -= XFS_LOOKUP_BATCH; 1033 1034 cond_resched(); 1035 1036 } while (nr_found && !done && *nr_to_scan > 0); 1037 1038 if (trylock && !done) 1039 pag->pag_ici_reclaim_cursor = first_index; 1040 else 1041 pag->pag_ici_reclaim_cursor = 0; 1042 mutex_unlock(&pag->pag_ici_reclaim_lock); 1043 xfs_perag_put(pag); 1044 } 1045 1046 /* 1047 * if we skipped any AG, and we still have scan count remaining, do 1048 * another pass this time using blocking reclaim semantics (i.e 1049 * waiting on the reclaim locks and ignoring the reclaim cursors). This 1050 * ensure that when we get more reclaimers than AGs we block rather 1051 * than spin trying to execute reclaim. 1052 */ 1053 if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 1054 trylock = 0; 1055 goto restart; 1056 } 1057 return XFS_ERROR(last_error); 1058} 1059 1060int 1061xfs_reclaim_inodes( 1062 xfs_mount_t *mp, 1063 int mode) 1064{ 1065 int nr_to_scan = INT_MAX; 1066 1067 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 1068} 1069 1070/* 1071 * Scan a certain number of inodes for reclaim. 1072 * 1073 * When called we make sure that there is a background (fast) inode reclaim in 1074 * progress, while we will throttle the speed of reclaim via doing synchronous 1075 * reclaim of inodes. That means if we come across dirty inodes, we wait for 1076 * them to be cleaned, which we hope will not be very long due to the 1077 * background walker having already kicked the IO off on those dirty inodes. 1078 */ 1079void 1080xfs_reclaim_inodes_nr( 1081 struct xfs_mount *mp, 1082 int nr_to_scan) 1083{ 1084 /* kick background reclaimer and push the AIL */ 1085 xfs_syncd_queue_reclaim(mp); 1086 xfs_ail_push_all(mp->m_ail); 1087 1088 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1089} 1090 1091/* 1092 * Return the number of reclaimable inodes in the filesystem for 1093 * the shrinker to determine how much to reclaim. 1094 */ 1095int 1096xfs_reclaim_inodes_count( 1097 struct xfs_mount *mp) 1098{ 1099 struct xfs_perag *pag; 1100 xfs_agnumber_t ag = 0; 1101 int reclaimable = 0; 1102 1103 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 1104 ag = pag->pag_agno + 1; 1105 reclaimable += pag->pag_ici_reclaimable; 1106 xfs_perag_put(pag); 1107 } 1108 return reclaimable; 1109} 1110