Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.5 · 964 lines · 26 kB
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway. If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return ENOENT;

	if (is_bad_inode(inode)) {
		IRELE(ip);
		return ENOENT;
	}

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return ENOENT;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now that we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = execute(batch[i], pag, flags);
			IRELE(batch[i]);
			if (error == EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}

STATIC int
xfs_sync_inode_data(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	struct address_space	*mapping = inode->i_mapping;
	int			error = 0;

	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
		if (flags & SYNC_TRYLOCK)
			return 0;
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}

	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
				0 : XBF_ASYNC, FI_NONE);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return error;
}

/*
 * Write out pagecache data for the whole filesystem.
 */
STATIC int
xfs_sync_data(
	struct xfs_mount	*mp,
	int			flags)
{
	int			error;

	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);

	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
	if (error)
		return XFS_ERROR(error);

	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
	return 0;
}

STATIC int
xfs_sync_fsdata(
	struct xfs_mount	*mp)
{
	struct xfs_buf		*bp;
	int			error;

	/*
	 * If the buffer is pinned then push on the log so we won't get stuck
	 * waiting in the write for someone, maybe ourselves, to flush the log.
	 *
	 * Even though we just pushed the log above, we did not have the
	 * superblock buffer locked at that point so it can become pinned in
	 * between there and here.
	 */
	bp = xfs_getsb(mp, 0);
	if (xfs_buf_ispinned(bp))
		xfs_log_force(mp, 0);
	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	return error;
}

/*
 * When remounting a filesystem read-only or freezing the filesystem, we have
 * two phases to execute. This first phase is syncing the data before we
 * quiesce the filesystem, and the second is flushing all the inodes out after
 * we've waited for all the transactions created by the first phase to
 * complete. The second phase ensures that the inodes are written to their
 * location on disk rather than just existing in transactions in the log. This
 * means after a quiesce there is no log replay required to write the inodes to
 * disk (this is the main difference between a sync and a quiesce).
 */
/*
 * First stage of freeze - no writers will make progress now we are here,
 * so we flush delwri and delalloc buffers here, then wait for all I/O to
 * complete.  Data is frozen at that point. Metadata is not frozen,
 * transactions can still occur here so don't bother emptying the AIL
 * because it'll just get dirty again.
 */
int
xfs_quiesce_data(
	struct xfs_mount	*mp)
{
	int			error, error2 = 0;

	/* force out the log */
	xfs_log_force(mp, XFS_LOG_SYNC);

	/* write superblock and hoover up shutdown errors */
	error = xfs_sync_fsdata(mp);

	/* mark the log as covered if needed */
	if (xfs_log_need_covered(mp))
		error2 = xfs_fs_log_dummy(mp);

	return error ? error : error2;
}

/*
 * Second stage of a quiesce. The data is already synced, now we have to take
 * care of the metadata. New transactions are already blocked, so we need to
 * wait for any remaining transactions to drain out before proceeding.
 */
void
xfs_quiesce_attr(
	struct xfs_mount	*mp)
{
	int			error = 0;

	/* wait for all modifications to complete */
	while (atomic_read(&mp->m_active_trans) > 0)
		delay(100);

	/* reclaim inodes to do any IO before the freeze completes */
	xfs_reclaim_inodes(mp, 0);
	xfs_reclaim_inodes(mp, SYNC_WAIT);

	/* flush all pending changes from the AIL */
	xfs_ail_push_all_sync(mp->m_ail);

	/*
	 * Just warn here till VFS can correctly support
	 * read-only remount without racing.
	 */
	WARN_ON(atomic_read(&mp->m_active_trans) != 0);

	/* Push the superblock and write an unmount record */
	error = xfs_log_sbcount(mp);
	if (error)
		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
				"Frozen image may not be consistent.");
	xfs_log_unmount_write(mp);

	/*
	 * At this point we might have modified the superblock again and thus
	 * added an item to the AIL, thus flush it again.
	 */
	xfs_ail_push_all_sync(mp->m_ail);
}

static void
xfs_syncd_queue_sync(
	struct xfs_mount	*mp)
{
	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
				msecs_to_jiffies(xfs_syncd_centisecs * 10));
}

/*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
 * filesystem is idle and not frozen.
 */
STATIC void
xfs_sync_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_sync_work);
	int		error;

	/*
	 * We shouldn't write/force the log if we are in the mount/unmount
	 * process or on a read-only filesystem. The workqueue still needs to
	 * be active in both cases, however, because it is used for inode
	 * reclaim during these times. Use the MS_ACTIVE flag to avoid doing
	 * anything during mount. Doing work during unmount is avoided by
	 * calling cancel_delayed_work_sync on this work queue before tearing
	 * down the ail and the log in xfs_log_unmount.
	 */
	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
		/* dgc: errors ignored here */
		if (mp->m_super->s_frozen == SB_UNFROZEN &&
		    xfs_log_need_covered(mp))
			error = xfs_fs_log_dummy(mp);
		else
			xfs_log_force(mp, 0);

		/* start pushing all the metadata that is currently
		 * dirty */
		xfs_ail_push_all(mp->m_ail);
	}

	/* queue us up again */
	xfs_syncd_queue_sync(mp);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs syncd work default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_syncd_queue_reclaim(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time.  It kicks itself every
 * few seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
STATIC void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_syncd_queue_reclaim(mp);
}

/*
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room.
 *
 * Queue a new data flush if there isn't one already in progress and
 * wait for completion of the flush. This means that we only ever have one
 * inode flush in progress no matter how many ENOSPC events are occurring and
 * so will prevent the system from bogging down due to every concurrent
 * ENOSPC event scanning all the active inodes in the system for writeback.
 */
void
xfs_flush_inodes(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	queue_work(xfs_syncd_wq, &mp->m_flush_work);
	flush_work_sync(&mp->m_flush_work);
}

STATIC void
xfs_flush_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(work,
					struct xfs_mount, m_flush_work);

	xfs_sync_data(mp, SYNC_TRYLOCK);
	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
}

int
xfs_syncd_init(
	struct xfs_mount	*mp)
{
	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);

	xfs_syncd_queue_sync(mp);

	return 0;
}

void
xfs_syncd_stop(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_sync_work);
	cancel_delayed_work_sync(&mp->m_reclaim_work);
	cancel_work_sync(&mp->m_flush_work);
}

void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);

	if (!pag->pag_ici_reclaimable) {
		/* propagate the reclaim tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* schedule periodic background inode reclaim */
		xfs_syncd_queue_reclaim(ip->i_mount);

		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
	pag->pag_ici_reclaimable++;
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	xfs_inode_t	*ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	__xfs_inode_set_reclaim_tag(pag, ip);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
__xfs_inode_clear_reclaim(
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	pag->pag_ici_reclaimable--;
	if (!pag->pag_ici_reclaimable) {
		/* clear the reclaim tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);
		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
}

void
__xfs_inode_clear_reclaim_tag(
	xfs_mount_t	*mp,
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
		XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
	__xfs_inode_clear_reclaim(pag, ip);
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * If we are asked for non-blocking operation, do unlocked checks to
	 * see if the inode already is being flushed or in reclaim to avoid
	 * lock traffic.
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check that
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state		iflush ret	required action
 *	---------------		----------	---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting.  For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, async	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, async	=> requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	struct xfs_buf		*bp = NULL;
	int			error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip, false);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out_ifunlock;
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/*
	 * Never flush out dirty data during non-blocking reclaim, as it would
	 * just contend with AIL pushing trying to do the same job.
	 */
	if (!(sync_mode & SYNC_WAIT))
		goto out_ifunlock;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * Note that xfs_iflush will never block on the inode buffer lock, as
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
	 * doing a blocking xfs_itobp() to get the cluster buffer would result
	 * in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
	 * inode, back off and try again.  Hopefully the next pass through will
	 * see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, &bp);
	if (error == EAGAIN) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		/* backoff longer than in xfs_ifree_cluster */
		delay(2);
		goto restart;
	}

	if (!error) {
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
	}

	xfs_iflock(ip);
reclaim:
	xfs_ifunlock(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	__xfs_inode_clear_reclaim(pag, ip);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	xfs_inode_free(ip);
	return error;

out_ifunlock:
	xfs_ifunlock(ip);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and xfssyncd never goes back to the idle
	 * state. Instead, return 0 to let the next scheduled background
	 * reclaim attempt to reclaim the inode again.
	 */
	return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shut down during the filesystem unmount reclaim walk will leak all
 * the unreclaimed inodes.
 */
int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now that we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

			cond_resched();

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * If we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return XFS_ERROR(last_error);
}

int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we will throttle the speed of reclaim via doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
 */
void
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_syncd_queue_reclaim(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}