Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux

Pull writeback tree from Wu Fengguang:
"Mainly from Jan Kara to avoid iput() in the flusher threads."

* tag 'writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
writeback: Avoid iput() from flusher thread
vfs: Rename end_writeback() to clear_inode()
vfs: Move waiting for inode writeback from end_writeback() to evict_inode()
writeback: Refactor writeback_single_inode()
writeback: Remove wb->list_lock from writeback_single_inode()
writeback: Separate inode requeueing after writeback
writeback: Move I_DIRTY_PAGES handling
writeback: Move requeueing when I_SYNC set to writeback_sb_inodes()
writeback: Move clearing of I_SYNC into inode_sync_complete()
writeback: initialize global_dirty_limit
fs: remove 8 bytes of padding from struct writeback_control on 64 bit builds
mm: page-writeback.c: local functions should not be exposed globally

+327 -228
+7 -9
Documentation/filesystems/porting
··· 297 297 be used instead. It gets called whenever the inode is evicted, whether it has 298 298 remaining links or not. Caller does *not* evict the pagecache or inode-associated 299 299 metadata buffers; getting rid of those is responsibility of method, as it had 300 - been for ->delete_inode(). 300 + been for ->delete_inode(). Caller makes sure async writeback cannot be running 301 + for the inode while (or after) ->evict_inode() is called. 301 302 302 303 ->drop_inode() returns int now; it's called on final iput() with 303 304 inode->i_lock held and it returns true if filesystems wants the inode to be ··· 307 306 simply of return 1. Note that all actual eviction work is done by caller after 308 307 ->drop_inode() returns. 309 308 310 - clear_inode() is gone; use end_writeback() instead. As before, it must 311 - be called exactly once on each call of ->evict_inode() (as it used to be for 312 - each call of ->delete_inode()). Unlike before, if you are using inode-associated 313 - metadata buffers (i.e. mark_buffer_dirty_inode()), it's your responsibility to 314 - call invalidate_inode_buffers() before end_writeback(). 315 - No async writeback (and thus no calls of ->write_inode()) will happen 316 - after end_writeback() returns, so actions that should not overlap with ->write_inode() 317 - (e.g. freeing on-disk inode if i_nlink is 0) ought to be done after that call. 309 + As before, clear_inode() must be called exactly once on each call of 310 + ->evict_inode() (as it used to be for each call of ->delete_inode()). Unlike 311 + before, if you are using inode-associated metadata buffers (i.e. 312 + mark_buffer_dirty_inode()), it's your responsibility to call 313 + invalidate_inode_buffers() before clear_inode(). 318 314 319 315 NOTE: checking i_nlink in the beginning of ->write_inode() and bailing out 320 316 if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput()
+1 -1
arch/powerpc/platforms/cell/spufs/inode.c
··· 151 151 spufs_evict_inode(struct inode *inode) 152 152 { 153 153 struct spufs_inode_info *ei = SPUFS_I(inode); 154 - end_writeback(inode); 154 + clear_inode(inode); 155 155 if (ei->i_ctx) 156 156 put_spu_context(ei->i_ctx); 157 157 if (ei->i_gang)
+1 -1
arch/s390/hypfs/inode.c
··· 115 115 116 116 static void hypfs_evict_inode(struct inode *inode) 117 117 { 118 - end_writeback(inode); 118 + clear_inode(inode); 119 119 kfree(inode->i_private); 120 120 } 121 121
+1 -1
fs/9p/vfs_inode.c
··· 448 448 struct v9fs_inode *v9inode = V9FS_I(inode); 449 449 450 450 truncate_inode_pages(inode->i_mapping, 0); 451 - end_writeback(inode); 451 + clear_inode(inode); 452 452 filemap_fdatawrite(inode->i_mapping); 453 453 454 454 #ifdef CONFIG_9P_FSCACHE
+1 -1
fs/affs/inode.c
··· 264 264 } 265 265 266 266 invalidate_inode_buffers(inode); 267 - end_writeback(inode); 267 + clear_inode(inode); 268 268 affs_free_prealloc(inode); 269 269 cache_page = (unsigned long)AFFS_I(inode)->i_lc; 270 270 if (cache_page) {
+1 -1
fs/afs/inode.c
··· 423 423 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); 424 424 425 425 truncate_inode_pages(&inode->i_data, 0); 426 - end_writeback(inode); 426 + clear_inode(inode); 427 427 428 428 afs_give_up_callback(vnode); 429 429
+1 -1
fs/autofs4/inode.c
··· 100 100 101 101 static void autofs4_evict_inode(struct inode *inode) 102 102 { 103 - end_writeback(inode); 103 + clear_inode(inode); 104 104 kfree(inode->i_private); 105 105 } 106 106
+1 -1
fs/bfs/inode.c
··· 174 174 175 175 truncate_inode_pages(&inode->i_data, 0); 176 176 invalidate_inode_buffers(inode); 177 - end_writeback(inode); 177 + clear_inode(inode); 178 178 179 179 if (inode->i_nlink) 180 180 return;
+1 -1
fs/binfmt_misc.c
··· 505 505 506 506 static void bm_evict_inode(struct inode *inode) 507 507 { 508 - end_writeback(inode); 508 + clear_inode(inode); 509 509 kfree(inode->i_private); 510 510 } 511 511
+1 -1
fs/block_dev.c
··· 487 487 struct list_head *p; 488 488 truncate_inode_pages(&inode->i_data, 0); 489 489 invalidate_inode_buffers(inode); /* is it needed here? */ 490 - end_writeback(inode); 490 + clear_inode(inode); 491 491 spin_lock(&bdev_lock); 492 492 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 493 493 __bd_forget(list_entry(p, struct inode, i_devices));
+1 -1
fs/btrfs/inode.c
··· 3756 3756 btrfs_end_transaction(trans, root); 3757 3757 btrfs_btree_balance_dirty(root, nr); 3758 3758 no_delete: 3759 - end_writeback(inode); 3759 + clear_inode(inode); 3760 3760 return; 3761 3761 } 3762 3762
+1 -1
fs/cifs/cifsfs.c
··· 272 272 cifs_evict_inode(struct inode *inode) 273 273 { 274 274 truncate_inode_pages(&inode->i_data, 0); 275 - end_writeback(inode); 275 + clear_inode(inode); 276 276 cifs_fscache_release_inode_cookie(inode); 277 277 } 278 278
+1 -1
fs/coda/inode.c
··· 244 244 static void coda_evict_inode(struct inode *inode) 245 245 { 246 246 truncate_inode_pages(&inode->i_data, 0); 247 - end_writeback(inode); 247 + clear_inode(inode); 248 248 coda_cache_clear_inode(inode); 249 249 } 250 250
+1 -1
fs/ecryptfs/super.c
··· 133 133 static void ecryptfs_evict_inode(struct inode *inode) 134 134 { 135 135 truncate_inode_pages(&inode->i_data, 0); 136 - end_writeback(inode); 136 + clear_inode(inode); 137 137 iput(ecryptfs_inode_to_lower(inode)); 138 138 } 139 139
+2 -2
fs/exofs/inode.c
··· 1473 1473 goto no_delete; 1474 1474 1475 1475 inode->i_size = 0; 1476 - end_writeback(inode); 1476 + clear_inode(inode); 1477 1477 1478 1478 /* if we are deleting an obj that hasn't been created yet, wait. 1479 1479 * This also makes sure that create_done cannot be called with an ··· 1503 1503 return; 1504 1504 1505 1505 no_delete: 1506 - end_writeback(inode); 1506 + clear_inode(inode); 1507 1507 }
+1 -1
fs/ext2/inode.c
··· 90 90 } 91 91 92 92 invalidate_inode_buffers(inode); 93 - end_writeback(inode); 93 + clear_inode(inode); 94 94 95 95 ext2_discard_reservation(inode); 96 96 rsv = EXT2_I(inode)->i_block_alloc_info;
+3 -3
fs/ext3/inode.c
··· 272 272 if (ext3_mark_inode_dirty(handle, inode)) { 273 273 /* If that failed, just dquot_drop() and be done with that */ 274 274 dquot_drop(inode); 275 - end_writeback(inode); 275 + clear_inode(inode); 276 276 } else { 277 277 ext3_xattr_delete_inode(handle, inode); 278 278 dquot_free_inode(inode); 279 279 dquot_drop(inode); 280 - end_writeback(inode); 280 + clear_inode(inode); 281 281 ext3_free_inode(handle, inode); 282 282 } 283 283 ext3_journal_stop(handle); 284 284 return; 285 285 no_delete: 286 - end_writeback(inode); 286 + clear_inode(inode); 287 287 dquot_drop(inode); 288 288 } 289 289
+1 -1
fs/ext4/super.c
··· 1007 1007 void ext4_clear_inode(struct inode *inode) 1008 1008 { 1009 1009 invalidate_inode_buffers(inode); 1010 - end_writeback(inode); 1010 + clear_inode(inode); 1011 1011 dquot_drop(inode); 1012 1012 ext4_discard_preallocations(inode); 1013 1013 if (EXT4_I(inode)->jinode) {
+1 -1
fs/fat/inode.c
··· 454 454 fat_truncate_blocks(inode, 0); 455 455 } 456 456 invalidate_inode_buffers(inode); 457 - end_writeback(inode); 457 + clear_inode(inode); 458 458 fat_cache_inval_inode(inode); 459 459 fat_detach(inode); 460 460 }
+1 -1
fs/freevxfs/vxfs_inode.c
··· 355 355 vxfs_evict_inode(struct inode *ip) 356 356 { 357 357 truncate_inode_pages(&ip->i_data, 0); 358 - end_writeback(ip); 358 + clear_inode(ip); 359 359 call_rcu(&ip->i_rcu, vxfs_i_callback); 360 360 }
+213 -139
fs/fs-writeback.c
··· 231 231 232 232 static void inode_sync_complete(struct inode *inode) 233 233 { 234 - /* 235 - * Prevent speculative execution through 236 - * spin_unlock(&wb->list_lock); 237 - */ 238 - 234 + inode->i_state &= ~I_SYNC; 235 + /* Waiters must see I_SYNC cleared before being woken up */ 239 236 smp_mb(); 240 237 wake_up_bit(&inode->i_state, __I_SYNC); 241 238 } ··· 326 329 } 327 330 328 331 /* 329 - * Wait for writeback on an inode to complete. 332 + * Wait for writeback on an inode to complete. Called with i_lock held. 333 + * Caller must make sure inode cannot go away when we drop i_lock. 330 334 */ 331 - static void inode_wait_for_writeback(struct inode *inode, 332 - struct bdi_writeback *wb) 335 + static void __inode_wait_for_writeback(struct inode *inode) 336 + __releases(inode->i_lock) 337 + __acquires(inode->i_lock) 333 338 { 334 339 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 335 340 wait_queue_head_t *wqh; ··· 339 340 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 340 341 while (inode->i_state & I_SYNC) { 341 342 spin_unlock(&inode->i_lock); 342 - spin_unlock(&wb->list_lock); 343 343 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 344 - spin_lock(&wb->list_lock); 345 344 spin_lock(&inode->i_lock); 346 345 } 347 346 } 348 347 349 348 /* 350 - * Write out an inode's dirty pages. Called under wb->list_lock and 351 - * inode->i_lock. Either the caller has an active reference on the inode or 352 - * the inode has I_WILL_FREE set. 353 - * 354 - * If `wait' is set, wait on the writeout. 355 - * 356 - * The whole writeout design is quite complex and fragile. We want to avoid 357 - * starvation of particular inodes when others are being redirtied, prevent 358 - * livelocks, etc. 349 + * Wait for writeback on an inode to complete. Caller must have inode pinned. 350 + */ 351 + void inode_wait_for_writeback(struct inode *inode) 352 + { 353 + spin_lock(&inode->i_lock); 354 + __inode_wait_for_writeback(inode); 355 + spin_unlock(&inode->i_lock); 356 + } 357 + 358 + /* 359 + * Sleep until I_SYNC is cleared. This function must be called with i_lock 360 + * held and drops it. It is aimed for callers not holding any inode reference 361 + * so once i_lock is dropped, inode can go away. 362 + */ 363 + static void inode_sleep_on_writeback(struct inode *inode) 364 + __releases(inode->i_lock) 365 + { 366 + DEFINE_WAIT(wait); 367 + wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 368 + int sleep; 369 + 370 + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 371 + sleep = inode->i_state & I_SYNC; 372 + spin_unlock(&inode->i_lock); 373 + if (sleep) 374 + schedule(); 375 + finish_wait(wqh, &wait); 376 + } 377 + 378 + /* 379 + * Find proper writeback list for the inode depending on its current state and 380 + * possibly also change of its state while we were doing writeback. Here we 381 + * handle things such as livelock prevention or fairness of writeback among 382 + * inodes. This function can be called only by flusher thread - noone else 383 + * processes all inodes in writeback lists and requeueing inodes behind flusher 384 + * thread's back can have unexpected consequences. 385 + */ 386 + static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, 387 + struct writeback_control *wbc) 388 + { 389 + if (inode->i_state & I_FREEING) 390 + return; 391 + 392 + /* 393 + * Sync livelock prevention. Each inode is tagged and synced in one 394 + * shot. If still dirty, it will be redirty_tail()'ed below. Update 395 + * the dirty time to prevent enqueue and sync it again. 396 + */ 397 + if ((inode->i_state & I_DIRTY) && 398 + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) 399 + inode->dirtied_when = jiffies; 400 + 401 + if (wbc->pages_skipped) { 402 + /* 403 + * writeback is not making progress due to locked 404 + * buffers. Skip this inode for now. 405 + */ 406 + redirty_tail(inode, wb); 407 + return; 408 + } 409 + 410 + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 411 + /* 412 + * We didn't write back all the pages. nfs_writepages() 413 + * sometimes bales out without doing anything. 414 + */ 415 + if (wbc->nr_to_write <= 0) { 416 + /* Slice used up. Queue for next turn. */ 417 + requeue_io(inode, wb); 418 + } else { 419 + /* 420 + * Writeback blocked by something other than 421 + * congestion. Delay the inode for some time to 422 + * avoid spinning on the CPU (100% iowait) 423 + * retrying writeback of the dirty page/inode 424 + * that cannot be performed immediately. 425 + */ 426 + redirty_tail(inode, wb); 427 + } 428 + } else if (inode->i_state & I_DIRTY) { 429 + /* 430 + * Filesystems can dirty the inode during writeback operations, 431 + * such as delayed allocation during submission or metadata 432 + * updates after data IO completion. 433 + */ 434 + redirty_tail(inode, wb); 435 + } else { 436 + /* The inode is clean. Remove from writeback lists. */ 437 + list_del_init(&inode->i_wb_list); 438 + } 439 + } 440 + 441 + /* 442 + * Write out an inode and its dirty pages. Do not update the writeback list 443 + * linkage. That is left to the caller. The caller is also responsible for 444 + * setting I_SYNC flag and calling inode_sync_complete() to clear it. 359 445 */ 360 446 static int 361 - writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 362 - struct writeback_control *wbc) 447 + __writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 448 + struct writeback_control *wbc) 363 449 { 364 450 struct address_space *mapping = inode->i_mapping; 365 451 long nr_to_write = wbc->nr_to_write; 366 452 unsigned dirty; 367 453 int ret; 368 454 369 - assert_spin_locked(&wb->list_lock); 370 - assert_spin_locked(&inode->i_lock); 371 - 372 - if (!atomic_read(&inode->i_count)) 373 - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 374 - else 375 - WARN_ON(inode->i_state & I_WILL_FREE); 376 - 377 - if (inode->i_state & I_SYNC) { 378 - /* 379 - * If this inode is locked for writeback and we are not doing 380 - * writeback-for-data-integrity, move it to b_more_io so that 381 - * writeback can proceed with the other inodes on s_io. 382 - * 383 - * We'll have another go at writing back this inode when we 384 - * completed a full scan of b_io. 385 - */ 386 - if (wbc->sync_mode != WB_SYNC_ALL) { 387 - requeue_io(inode, wb); 388 - trace_writeback_single_inode_requeue(inode, wbc, 389 - nr_to_write); 390 - return 0; 391 - } 392 - 393 - /* 394 - * It's a data-integrity sync. We must wait. 395 - */ 396 - inode_wait_for_writeback(inode, wb); 397 - } 398 - 399 - BUG_ON(inode->i_state & I_SYNC); 400 - 401 - /* Set I_SYNC, reset I_DIRTY_PAGES */ 402 - inode->i_state |= I_SYNC; 403 - inode->i_state &= ~I_DIRTY_PAGES; 404 - spin_unlock(&inode->i_lock); 405 - spin_unlock(&wb->list_lock); 455 + WARN_ON(!(inode->i_state & I_SYNC)); 406 456 407 457 ret = do_writepages(mapping, wbc); 408 458 ··· 472 424 * write_inode() 473 425 */ 474 426 spin_lock(&inode->i_lock); 427 + /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ 428 + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 429 + inode->i_state &= ~I_DIRTY_PAGES; 475 430 dirty = inode->i_state & I_DIRTY; 476 431 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 477 432 spin_unlock(&inode->i_lock); ··· 484 433 if (ret == 0) 485 434 ret = err; 486 435 } 436 + trace_writeback_single_inode(inode, wbc, nr_to_write); 437 + return ret; 438 + } 439 + 440 + /* 441 + * Write out an inode's dirty pages. Either the caller has an active reference 442 + * on the inode or the inode has I_WILL_FREE set. 443 + * 444 + * This function is designed to be called for writing back one inode which 445 + * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() 446 + * and does more profound writeback list handling in writeback_sb_inodes(). 447 + */ 448 + static int 449 + writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 450 + struct writeback_control *wbc) 451 + { 452 + int ret = 0; 453 + 454 + spin_lock(&inode->i_lock); 455 + if (!atomic_read(&inode->i_count)) 456 + WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 457 + else 458 + WARN_ON(inode->i_state & I_WILL_FREE); 459 + 460 + if (inode->i_state & I_SYNC) { 461 + if (wbc->sync_mode != WB_SYNC_ALL) 462 + goto out; 463 + /* 464 + * It's a data-integrity sync. We must wait. Since callers hold 465 + * inode reference or inode has I_WILL_FREE set, it cannot go 466 + * away under us. 467 + */ 468 + __inode_wait_for_writeback(inode); 469 + } 470 + WARN_ON(inode->i_state & I_SYNC); 471 + /* 472 + * Skip inode if it is clean. We don't want to mess with writeback 473 + * lists in this function since flusher thread may be doing for example 474 + * sync in parallel and if we move the inode, it could get skipped. So 475 + * here we make sure inode is on some writeback list and leave it there 476 + * unless we have completely cleaned the inode. 477 + */ 478 + if (!(inode->i_state & I_DIRTY)) 479 + goto out; 480 + inode->i_state |= I_SYNC; 481 + spin_unlock(&inode->i_lock); 482 + 483 + ret = __writeback_single_inode(inode, wb, wbc); 487 484 488 485 spin_lock(&wb->list_lock); 489 486 spin_lock(&inode->i_lock); 490 - inode->i_state &= ~I_SYNC; 491 - if (!(inode->i_state & I_FREEING)) { 492 - /* 493 - * Sync livelock prevention. Each inode is tagged and synced in 494 - * one shot. If still dirty, it will be redirty_tail()'ed below. 495 - * Update the dirty time to prevent enqueue and sync it again. 496 - */ 497 - if ((inode->i_state & I_DIRTY) && 498 - (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) 499 - inode->dirtied_when = jiffies; 500 - 501 - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 502 - /* 503 - * We didn't write back all the pages. nfs_writepages() 504 - * sometimes bales out without doing anything. 505 - */ 506 - inode->i_state |= I_DIRTY_PAGES; 507 - if (wbc->nr_to_write <= 0) { 508 - /* 509 - * slice used up: queue for next turn 510 - */ 511 - requeue_io(inode, wb); 512 - } else { 513 - /* 514 - * Writeback blocked by something other than 515 - * congestion. Delay the inode for some time to 516 - * avoid spinning on the CPU (100% iowait) 517 - * retrying writeback of the dirty page/inode 518 - * that cannot be performed immediately. 519 - */ 520 - redirty_tail(inode, wb); 521 - } 522 - } else if (inode->i_state & I_DIRTY) { 523 - /* 524 - * Filesystems can dirty the inode during writeback 525 - * operations, such as delayed allocation during 526 - * submission or metadata updates after data IO 527 - * completion. 528 - */ 529 - redirty_tail(inode, wb); 530 - } else { 531 - /* 532 - * The inode is clean. At this point we either have 533 - * a reference to the inode or it's on it's way out. 534 - * No need to add it back to the LRU. 535 - */ 536 - list_del_init(&inode->i_wb_list); 537 - } 538 - } 487 + /* 488 + * If inode is clean, remove it from writeback lists. Otherwise don't 489 + * touch it. See comment above for explanation. 490 + */ 491 + if (!(inode->i_state & I_DIRTY)) 492 + list_del_init(&inode->i_wb_list); 493 + spin_unlock(&wb->list_lock); 539 494 inode_sync_complete(inode); 540 - trace_writeback_single_inode(inode, wbc, nr_to_write); 495 + out: 496 + spin_unlock(&inode->i_lock); 541 497 return ret; 542 498 } 543 499 ··· 638 580 redirty_tail(inode, wb); 639 581 continue; 640 582 } 641 - __iget(inode); 583 + if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { 584 + /* 585 + * If this inode is locked for writeback and we are not 586 + * doing writeback-for-data-integrity, move it to 587 + * b_more_io so that writeback can proceed with the 588 + * other inodes on s_io. 589 + * 590 + * We'll have another go at writing back this inode 591 + * when we completed a full scan of b_io. 592 + */ 593 + spin_unlock(&inode->i_lock); 594 + requeue_io(inode, wb); 595 + trace_writeback_sb_inodes_requeue(inode); 596 + continue; 597 + } 598 + spin_unlock(&wb->list_lock); 599 + 600 + /* 601 + * We already requeued the inode if it had I_SYNC set and we 602 + * are doing WB_SYNC_NONE writeback. So this catches only the 603 + * WB_SYNC_ALL case. 604 + */ 605 + if (inode->i_state & I_SYNC) { 606 + /* Wait for I_SYNC. This function drops i_lock... */ 607 + inode_sleep_on_writeback(inode); 608 + /* Inode may be gone, start again */ 609 + continue; 610 + } 611 + inode->i_state |= I_SYNC; 612 + spin_unlock(&inode->i_lock); 613 + 642 614 write_chunk = writeback_chunk_size(wb->bdi, work); 643 615 wbc.nr_to_write = write_chunk; 644 616 wbc.pages_skipped = 0; 645 617 646 - writeback_single_inode(inode, wb, &wbc); 618 + /* 619 + * We use I_SYNC to pin the inode in memory. While it is set 620 + * evict_inode() will wait so the inode cannot be freed. 621 + */ 622 + __writeback_single_inode(inode, wb, &wbc); 647 623 648 624 work->nr_pages -= write_chunk - wbc.nr_to_write; 649 625 wrote += write_chunk - wbc.nr_to_write; 626 + spin_lock(&wb->list_lock); 627 + spin_lock(&inode->i_lock); 650 628 if (!(inode->i_state & I_DIRTY)) 651 629 wrote++; 652 - if (wbc.pages_skipped) { 653 - /* 654 - * writeback is not making progress due to locked 655 - * buffers. Skip this inode for now. 656 - */ 657 - redirty_tail(inode, wb); 658 - } 630 + requeue_inode(inode, wb, &wbc); 631 + inode_sync_complete(inode); 659 632 spin_unlock(&inode->i_lock); 660 - spin_unlock(&wb->list_lock); 661 - iput(inode); 662 - cond_resched(); 663 - spin_lock(&wb->list_lock); 633 + cond_resched_lock(&wb->list_lock); 664 634 /* 665 635 * bail out to wb_writeback() often enough to check 666 636 * background threshold and other termination conditions. ··· 882 796 trace_writeback_wait(wb->bdi, work); 883 797 inode = wb_inode(wb->b_more_io.prev); 884 798 spin_lock(&inode->i_lock); 885 - inode_wait_for_writeback(inode, wb); 886 - spin_unlock(&inode->i_lock); 799 + spin_unlock(&wb->list_lock); 800 + /* This function drops i_lock... */ 801 + inode_sleep_on_writeback(inode); 802 + spin_lock(&wb->list_lock); 887 803 } 888 804 } 889 805 spin_unlock(&wb->list_lock); ··· 1419 1331 int write_inode_now(struct inode *inode, int sync) 1420 1332 { 1421 1333 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 1422 - int ret; 1423 1334 struct writeback_control wbc = { 1424 1335 .nr_to_write = LONG_MAX, 1425 1336 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, ··· 1430 1343 wbc.nr_to_write = 0; 1431 1344 1432 1345 might_sleep(); 1433 - spin_lock(&wb->list_lock); 1434 - spin_lock(&inode->i_lock); 1435 - ret = writeback_single_inode(inode, wb, &wbc); 1436 - spin_unlock(&inode->i_lock); 1437 - spin_unlock(&wb->list_lock); 1438 - return ret; 1346 + return writeback_single_inode(inode, wb, &wbc); 1439 1347 } 1440 1348 EXPORT_SYMBOL(write_inode_now); 1441 1349 ··· 1447 1365 */ 1448 1366 int sync_inode(struct inode *inode, struct writeback_control *wbc) 1449 1367 { 1450 - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 1451 - int ret; 1452 - 1453 - spin_lock(&wb->list_lock); 1454 - spin_lock(&inode->i_lock); 1455 - ret = writeback_single_inode(inode, wb, wbc); 1456 - spin_unlock(&inode->i_lock); 1457 - spin_unlock(&wb->list_lock); 1458 - return ret; 1368 + return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); 1459 1369 } 1460 1370 EXPORT_SYMBOL(sync_inode); 1461 1371
+1 -1
fs/fuse/inode.c
··· 122 122 static void fuse_evict_inode(struct inode *inode) 123 123 { 124 124 truncate_inode_pages(&inode->i_data, 0); 125 - end_writeback(inode); 125 + clear_inode(inode); 126 126 if (inode->i_sb->s_flags & MS_ACTIVE) { 127 127 struct fuse_conn *fc = get_fuse_conn(inode); 128 128 struct fuse_inode *fi = get_fuse_inode(inode);
+1 -1
fs/gfs2/super.c
··· 1554 1554 out: 1555 1555 /* Case 3 starts here */ 1556 1556 truncate_inode_pages(&inode->i_data, 0); 1557 - end_writeback(inode); 1557 + clear_inode(inode); 1558 1558 gfs2_dir_hash_inval(ip); 1559 1559 ip->i_gl->gl_object = NULL; 1560 1560 flush_delayed_work_sync(&ip->i_gl->gl_work);
+1 -1
fs/hfs/inode.c
··· 532 532 void hfs_evict_inode(struct inode *inode) 533 533 { 534 534 truncate_inode_pages(&inode->i_data, 0); 535 - end_writeback(inode); 535 + clear_inode(inode); 536 536 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 537 537 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 538 538 iput(HFS_I(inode)->rsrc_inode);
+1 -1
fs/hfsplus/super.c
··· 154 154 { 155 155 dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); 156 156 truncate_inode_pages(&inode->i_data, 0); 157 - end_writeback(inode); 157 + clear_inode(inode); 158 158 if (HFSPLUS_IS_RSRC(inode)) { 159 159 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 160 160 iput(HFSPLUS_I(inode)->rsrc_inode);
+1 -1
fs/hostfs/hostfs_kern.c
··· 240 240 static void hostfs_evict_inode(struct inode *inode) 241 241 { 242 242 truncate_inode_pages(&inode->i_data, 0); 243 - end_writeback(inode); 243 + clear_inode(inode); 244 244 if (HOSTFS_I(inode)->fd != -1) { 245 245 close_file(&HOSTFS_I(inode)->fd); 246 246 HOSTFS_I(inode)->fd = -1;
+1 -1
fs/hpfs/inode.c
··· 299 299 void hpfs_evict_inode(struct inode *inode) 300 300 { 301 301 truncate_inode_pages(&inode->i_data, 0); 302 - end_writeback(inode); 302 + clear_inode(inode); 303 303 if (!inode->i_nlink) { 304 304 hpfs_lock(inode->i_sb); 305 305 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
+1 -1
fs/hppfs/hppfs.c
··· 614 614 615 615 void hppfs_evict_inode(struct inode *ino) 616 616 { 617 - end_writeback(ino); 617 + clear_inode(ino); 618 618 dput(HPPFS_I(ino)->proc_dentry); 619 619 mntput(ino->i_sb->s_fs_info); 620 620 }
+1 -1
fs/hugetlbfs/inode.c
··· 393 393 static void hugetlbfs_evict_inode(struct inode *inode) 394 394 { 395 395 truncate_hugepages(inode, 0); 396 - end_writeback(inode); 396 + clear_inode(inode); 397 397 } 398 398 399 399 static inline void
+11 -4
fs/inode.c
··· 486 486 } 487 487 EXPORT_SYMBOL(__remove_inode_hash); 488 488 489 - void end_writeback(struct inode *inode) 489 + void clear_inode(struct inode *inode) 490 490 { 491 491 might_sleep(); 492 492 /* ··· 500 500 BUG_ON(!list_empty(&inode->i_data.private_list)); 501 501 BUG_ON(!(inode->i_state & I_FREEING)); 502 502 BUG_ON(inode->i_state & I_CLEAR); 503 - inode_sync_wait(inode); 504 503 /* don't need i_lock here, no concurrent mods to i_state */ 505 504 inode->i_state = I_FREEING | I_CLEAR; 506 505 } 507 - EXPORT_SYMBOL(end_writeback); 506 + EXPORT_SYMBOL(clear_inode); 508 507 509 508 /* 510 509 * Free the inode passed in, removing it from the lists it is still connected ··· 530 531 531 532 inode_sb_list_del(inode); 532 533 534 + /* 535 + * Wait for flusher thread to be done with the inode so that filesystem 536 + * does not start destroying it while writeback is still running. Since 537 + * the inode has I_FREEING set, flusher thread won't start new work on 538 + * the inode. We just have to wait for running writeback to finish. 539 + */ 540 + inode_wait_for_writeback(inode); 541 + 533 542 if (op->evict_inode) { 534 543 op->evict_inode(inode); 535 544 } else { 536 545 if (inode->i_data.nrpages) 537 546 truncate_inode_pages(&inode->i_data, 0); 538 - end_writeback(inode); 547 + clear_inode(inode); 539 548 } 540 549 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 541 550 bd_forget(inode);
+1 -1
fs/jffs2/fs.c
··· 240 240 jffs2_dbg(1, "%s(): ino #%lu mode %o\n", 241 241 __func__, inode->i_ino, inode->i_mode); 242 242 truncate_inode_pages(&inode->i_data, 0); 243 - end_writeback(inode); 243 + clear_inode(inode); 244 244 jffs2_do_clear_inode(c, f); 245 245 } 246 246
+1 -1
fs/jfs/inode.c
··· 169 169 } else { 170 170 truncate_inode_pages(&inode->i_data, 0); 171 171 } 172 - end_writeback(inode); 172 + clear_inode(inode); 173 173 dquot_drop(inode); 174 174 } 175 175
+1 -1
fs/logfs/readwrite.c
··· 2175 2175 } 2176 2176 } 2177 2177 truncate_inode_pages(&inode->i_data, 0); 2178 - end_writeback(inode); 2178 + clear_inode(inode); 2179 2179 2180 2180 /* Cheaper version of write_inode. All changes are concealed in 2181 2181 * aliases, which are moved back. No write to the medium happens.
+1 -1
fs/minix/inode.c
··· 32 32 minix_truncate(inode); 33 33 } 34 34 invalidate_inode_buffers(inode); 35 - end_writeback(inode); 35 + clear_inode(inode); 36 36 if (!inode->i_nlink) 37 37 minix_free_inode(inode); 38 38 }
+1 -1
fs/ncpfs/inode.c
··· 292 292 ncp_evict_inode(struct inode *inode) 293 293 { 294 294 truncate_inode_pages(&inode->i_data, 0); 295 - end_writeback(inode); 295 + clear_inode(inode); 296 296 297 297 if (S_ISDIR(inode->i_mode)) { 298 298 DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
+2 -2
fs/nfs/inode.c
··· 121 121 void nfs_evict_inode(struct inode *inode) 122 122 { 123 123 truncate_inode_pages(&inode->i_data, 0); 124 - end_writeback(inode); 124 + clear_inode(inode); 125 125 nfs_clear_inode(inode); 126 126 } 127 127 ··· 1500 1500 void nfs4_evict_inode(struct inode *inode) 1501 1501 { 1502 1502 truncate_inode_pages(&inode->i_data, 0); 1503 - end_writeback(inode); 1503 + clear_inode(inode); 1504 1504 pnfs_return_layout(inode); 1505 1505 pnfs_destroy_layout(NFS_I(inode)); 1506 1506 /* If we are holding a delegation, return it! */
+2 -2
fs/nilfs2/inode.c
··· 734 734 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 735 735 if (inode->i_data.nrpages) 736 736 truncate_inode_pages(&inode->i_data, 0); 737 - end_writeback(inode); 737 + clear_inode(inode); 738 738 nilfs_clear_inode(inode); 739 739 return; 740 740 } ··· 746 746 /* TODO: some of the following operations may fail. */ 747 747 nilfs_truncate_bmap(ii, 0); 748 748 nilfs_mark_inode_dirty(inode); 749 - end_writeback(inode); 749 + clear_inode(inode); 750 750 751 751 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 752 752 if (!ret)
+1 -1
fs/ntfs/inode.c
··· 2258 2258 ntfs_inode *ni = NTFS_I(vi); 2259 2259 2260 2260 truncate_inode_pages(&vi->i_data, 0); 2261 - end_writeback(vi); 2261 + clear_inode(vi); 2262 2262 2263 2263 #ifdef NTFS_RW 2264 2264 if (NInoDirty(ni)) {
+1 -1
fs/ocfs2/dlmfs/dlmfs.c
··· 367 367 int status; 368 368 struct dlmfs_inode_private *ip; 369 369 370 - end_writeback(inode); 370 + clear_inode(inode); 371 371 372 372 mlog(0, "inode %lu\n", inode->i_ino); 373 373
+1 -1
fs/ocfs2/inode.c
··· 1069 1069 int status; 1070 1070 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1071 1071 1072 - end_writeback(inode); 1072 + clear_inode(inode); 1073 1073 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, 1074 1074 inode->i_nlink); 1075 1075
+1 -1
fs/omfs/inode.c
··· 184 184 static void omfs_evict_inode(struct inode *inode) 185 185 { 186 186 truncate_inode_pages(&inode->i_data, 0); 187 - end_writeback(inode); 187 + clear_inode(inode); 188 188 189 189 if (inode->i_nlink) 190 190 return;
+1 -1
fs/proc/inode.c
··· 33 33 const struct proc_ns_operations *ns_ops; 34 34 35 35 truncate_inode_pages(&inode->i_data, 0); 36 - end_writeback(inode); 36 + clear_inode(inode); 37 37 38 38 /* Stop tracking associated processes */ 39 39 put_pid(PROC_I(inode)->pid);
+1 -1
fs/pstore/inode.c
··· 85 85 struct pstore_private *p = inode->i_private; 86 86 unsigned long flags; 87 87 88 - end_writeback(inode); 88 + clear_inode(inode); 89 89 if (p) { 90 90 spin_lock_irqsave(&allpstore_lock, flags); 91 91 list_del(&p->list);
+2 -2
fs/reiserfs/inode.c
··· 76 76 ; 77 77 } 78 78 out: 79 - end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ 79 + clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 80 80 dquot_drop(inode); 81 81 inode->i_blocks = 0; 82 82 reiserfs_write_unlock_once(inode->i_sb, depth); 83 83 return; 84 84 85 85 no_delete: 86 - end_writeback(inode); 86 + clear_inode(inode); 87 87 dquot_drop(inode); 88 88 } 89 89
+1 -1
fs/sysfs/inode.c
··· 310 310 struct sysfs_dirent *sd = inode->i_private; 311 311 312 312 truncate_inode_pages(&inode->i_data, 0); 313 - end_writeback(inode); 313 + clear_inode(inode); 314 314 sysfs_put(sd); 315 315 } 316 316
+1 -1
fs/sysv/inode.c
··· 316 316 sysv_truncate(inode); 317 317 } 318 318 invalidate_inode_buffers(inode); 319 - end_writeback(inode); 319 + clear_inode(inode); 320 320 if (!inode->i_nlink) 321 321 sysv_free_inode(inode); 322 322 }
+1 -1
fs/ubifs/super.c
··· 378 378 smp_wmb(); 379 379 } 380 380 done: 381 - end_writeback(inode); 381 + clear_inode(inode); 382 382 } 383 383 384 384 static void ubifs_dirty_inode(struct inode *inode, int flags)
+1 -1
fs/udf/inode.c
··· 80 80 } else 81 81 truncate_inode_pages(&inode->i_data, 0); 82 82 invalidate_inode_buffers(inode); 83 - end_writeback(inode); 83 + clear_inode(inode); 84 84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 85 85 inode->i_size != iinfo->i_lenExtents) { 86 86 udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
+1 -1
fs/ufs/inode.c
··· 895 895 } 896 896 897 897 invalidate_inode_buffers(inode); 898 - end_writeback(inode); 898 + clear_inode(inode); 899 899 900 900 if (want_delete) { 901 901 lock_ufs(inode->i_sb);
+1 -1
fs/xfs/xfs_super.c
··· 932 932 trace_xfs_evict_inode(ip); 933 933 934 934 truncate_inode_pages(&inode->i_data, 0); 935 - end_writeback(inode); 935 + clear_inode(inode); 936 936 XFS_STATS_INC(vn_rele); 937 937 XFS_STATS_INC(vn_remove); 938 938 XFS_STATS_DEC(vn_active);
+7 -6
include/linux/fs.h
··· 1764 1764 * I_FREEING Set when inode is about to be freed but still has dirty 1765 1765 * pages or buffers attached or the inode itself is still 1766 1766 * dirty. 1767 - * I_CLEAR Added by end_writeback(). In this state the inode is clean 1768 - * and can be destroyed. Inode keeps I_FREEING. 1767 + * I_CLEAR Added by clear_inode(). In this state the inode is 1768 + * clean and can be destroyed. Inode keeps I_FREEING. 1769 1769 * 1770 1770 * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are 1771 1771 * prohibited for many purposes. iget() must wait for ··· 1773 1773 * anew. Other functions will just ignore such inodes, 1774 1774 * if appropriate. I_NEW is used for waiting. 1775 1775 * 1776 - * I_SYNC Synchonized write of dirty inode data. The bits is 1777 - * set during data writeback, and cleared with a wakeup 1778 - * on the bit address once it is done. 1776 + * I_SYNC Writeback of inode is running. The bit is set during 1777 + * data writeback, and cleared with a wakeup on the bit 1778 + * address once it is done. The bit is also used to pin 1779 + * the inode in memory for flusher thread. 1779 1780 * 1780 1781 * I_REFERENCED Marks the inode as recently references on the LRU list. 1781 1782 * ··· 2350 2349 2351 2350 extern void __iget(struct inode * inode); 2352 2351 extern void iget_failed(struct inode *); 2353 - extern void end_writeback(struct inode *); 2352 + extern void clear_inode(struct inode *); 2354 2353 extern void __destroy_inode(struct inode *); 2355 2354 extern struct inode *new_inode_pseudo(struct super_block *sb); 2356 2355 extern struct inode *new_inode(struct super_block *sb);
+3 -7
include/linux/writeback.h
··· 58 58 * in a manner such that unspecified fields are set to zero. 59 59 */ 60 60 struct writeback_control { 61 - enum writeback_sync_modes sync_mode; 62 61 long nr_to_write; /* Write this many pages, and decrement 63 62 this for each page written */ 64 63 long pages_skipped; /* Pages which were not written */ ··· 69 70 */ 70 71 loff_t range_start; 71 72 loff_t range_end; 73 + 74 + enum writeback_sync_modes sync_mode; 72 75 73 76 unsigned for_kupdate:1; /* A kupdate writeback */ 74 77 unsigned for_background:1; /* A background writeback */ ··· 95 94 enum wb_reason reason); 96 95 long wb_do_writeback(struct bdi_writeback *wb, int force_wait); 97 96 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); 97 + void inode_wait_for_writeback(struct inode *inode); 98 98 99 99 /* writeback.h requires fs.h; it, too, is not included from here. */ 100 100 static inline void wait_on_inode(struct inode *inode) 101 101 { 102 102 might_sleep(); 103 103 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); 104 - } 105 - static inline void inode_sync_wait(struct inode *inode) 106 - { 107 - might_sleep(); 108 - wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, 109 - TASK_UNINTERRUPTIBLE); 110 104 } 111 105 112 106
+29 -7
include/trace/events/writeback.h
··· 372 372 ) 373 373 ); 374 374 375 + TRACE_EVENT(writeback_sb_inodes_requeue, 376 + 377 + TP_PROTO(struct inode *inode), 378 + TP_ARGS(inode), 379 + 380 + TP_STRUCT__entry( 381 + __array(char, name, 32) 382 + __field(unsigned long, ino) 383 + __field(unsigned long, state) 384 + __field(unsigned long, dirtied_when) 385 + ), 386 + 387 + TP_fast_assign( 388 + strncpy(__entry->name, 389 + dev_name(inode_to_bdi(inode)->dev), 32); 390 + __entry->ino = inode->i_ino; 391 + __entry->state = inode->i_state; 392 + __entry->dirtied_when = inode->dirtied_when; 393 + ), 394 + 395 + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", 396 + __entry->name, 397 + __entry->ino, 398 + show_inode_state(__entry->state), 399 + __entry->dirtied_when, 400 + (jiffies - __entry->dirtied_when) / HZ 401 + ) 402 + ); 403 + 375 404 DECLARE_EVENT_CLASS(writeback_congest_waited_template, 376 405 377 406 TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), ··· 477 448 __entry->nr_to_write, 478 449 __entry->wrote 479 450 ) 480 - ); 481 - 482 - DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue, 483 - TP_PROTO(struct inode *inode, 484 - struct writeback_control *wbc, 485 - unsigned long nr_to_write), 486 - TP_ARGS(inode, wbc, nr_to_write) 487 451 ); 488 452 489 453 DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
+1 -1
ipc/mqueue.c
··· 251 251 int i; 252 252 struct ipc_namespace *ipc_ns; 253 253 254 - end_writeback(inode); 254 + clear_inode(inode); 255 255 256 256 if (S_ISDIR(inode->i_mode)) 257 257 return;
+2 -1
mm/page-writeback.c
··· 204 204 * Returns the global number of pages potentially available for dirty 205 205 * page cache. This is the base value for the global dirty limits. 206 206 */ 207 - unsigned long global_dirtyable_memory(void) 207 + static unsigned long global_dirtyable_memory(void) 208 208 { 209 209 unsigned long x; 210 210 ··· 1568 1568 unsigned long background_thresh; 1569 1569 unsigned long dirty_thresh; 1570 1570 global_dirty_limits(&background_thresh, &dirty_thresh); 1571 + global_dirty_limit = dirty_thresh; 1571 1572 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); 1572 1573 if (ratelimit_pages < 16) 1573 1574 ratelimit_pages = 16;
+1 -1
mm/shmem.c
··· 597 597 } 598 598 BUG_ON(inode->i_blocks); 599 599 shmem_free_inode(inode->i_sb); 600 - end_writeback(inode); 600 + clear_inode(inode); 601 601 } 602 602 603 603 /*