Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block

* 'writeback' of git://git.kernel.dk/linux-2.6-block:
writeback: writeback_inodes_sb() should use bdi_start_writeback()
writeback: don't delay inodes redirtied by a fast dirtier
writeback: make the super_block pinning more efficient
writeback: don't resort for a single super_block in move_expired_inodes()
writeback: move inodes from one super_block together
writeback: get rid of incorrect references to pdflush in comments
writeback: improve readability of the wb_writeback() continue/break logic
writeback: cleanup writeback_single_inode()
writeback: kupdate writeback shall not stop when more io is possible
writeback: stop background writeback when below background threshold
writeback: balance_dirty_pages() shall write more than dirtied pages
fs: Fix busyloop in wb_writeback()

+142 -72
+5 -5
fs/buffer.c
··· 280 280 EXPORT_SYMBOL(invalidate_bdev); 281 281 282 282 /* 283 - * Kick pdflush then try to free up some ZONE_NORMAL memory. 283 + * Kick the writeback threads then try to free up some ZONE_NORMAL memory. 284 284 */ 285 285 static void free_more_memory(void) 286 286 { ··· 1709 1709 /* 1710 1710 * If it's a fully non-blocking write attempt and we cannot 1711 1711 * lock the buffer then redirty the page. Note that this can 1712 - * potentially cause a busy-wait loop from pdflush and kswapd 1713 - * activity, but those code paths have their own higher-level 1714 - * throttling. 1712 + * potentially cause a busy-wait loop from writeback threads 1713 + * and kswapd activity, but those code paths have their own 1714 + * higher-level throttling. 1715 1715 */ 1716 1716 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1717 1717 lock_buffer(bh); ··· 3208 3208 * still running obsolete flush daemons, so we terminate them here. 3209 3209 * 3210 3210 * Use of bdflush() is deprecated and will be removed in a future kernel. 3211 - * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3211 + * The `flush-X' kernel threads fully replace bdflush daemons and this call. 3212 3212 */ 3213 3213 SYSCALL_DEFINE2(bdflush, int, func, long, data) 3214 3214 {
+113 -48
fs/fs-writeback.c
··· 41 41 long nr_pages; 42 42 struct super_block *sb; 43 43 enum writeback_sync_modes sync_mode; 44 - int for_kupdate; 45 - int range_cyclic; 44 + int for_kupdate:1; 45 + int range_cyclic:1; 46 + int for_background:1; 46 47 }; 47 48 48 49 /* ··· 258 257 .range_cyclic = 1, 259 258 }; 260 259 260 + /* 261 + * We treat @nr_pages=0 as the special case to do background writeback, 262 + * ie. to sync pages until the background dirty threshold is reached. 263 + */ 264 + if (!nr_pages) { 265 + args.nr_pages = LONG_MAX; 266 + args.for_background = 1; 267 + } 268 + 261 269 bdi_alloc_queue_work(bdi, &args); 262 270 } 263 271 ··· 320 310 * For inodes being constantly redirtied, dirtied_when can get stuck. 321 311 * It _appears_ to be in the future, but is actually in distant past. 322 312 * This test is necessary to prevent such wrapped-around relative times 323 - * from permanently stopping the whole pdflush writeback. 313 + * from permanently stopping the whole bdi writeback. 324 314 */ 325 315 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 326 316 #endif ··· 334 324 struct list_head *dispatch_queue, 335 325 unsigned long *older_than_this) 336 326 { 327 + LIST_HEAD(tmp); 328 + struct list_head *pos, *node; 329 + struct super_block *sb = NULL; 330 + struct inode *inode; 331 + int do_sb_sort = 0; 332 + 337 333 while (!list_empty(delaying_queue)) { 338 - struct inode *inode = list_entry(delaying_queue->prev, 339 - struct inode, i_list); 334 + inode = list_entry(delaying_queue->prev, struct inode, i_list); 340 335 if (older_than_this && 341 336 inode_dirtied_after(inode, *older_than_this)) 342 337 break; 343 - list_move(&inode->i_list, dispatch_queue); 338 + if (sb && sb != inode->i_sb) 339 + do_sb_sort = 1; 340 + sb = inode->i_sb; 341 + list_move(&inode->i_list, &tmp); 342 + } 343 + 344 + /* just one sb in list, splice to dispatch_queue and we're done */ 345 + if (!do_sb_sort) { 346 + list_splice(&tmp, dispatch_queue); 347 + return; 348 + } 349 + 350 + /* Move 
inodes from one superblock together */ 351 + while (!list_empty(&tmp)) { 352 + inode = list_entry(tmp.prev, struct inode, i_list); 353 + sb = inode->i_sb; 354 + list_for_each_prev_safe(pos, node, &tmp) { 355 + inode = list_entry(pos, struct inode, i_list); 356 + if (inode->i_sb == sb) 357 + list_move(&inode->i_list, dispatch_queue); 358 + } 344 359 } 345 360 } 346 361 ··· 474 439 spin_lock(&inode_lock); 475 440 inode->i_state &= ~I_SYNC; 476 441 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 477 - if (!(inode->i_state & I_DIRTY) && 478 - mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 442 + if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { 443 + /* 444 + * More pages get dirtied by a fast dirtier. 445 + */ 446 + goto select_queue; 447 + } else if (inode->i_state & I_DIRTY) { 448 + /* 449 + * At least XFS will redirty the inode during the 450 + * writeback (delalloc) and on io completion (isize). 451 + */ 452 + redirty_tail(inode); 453 + } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 479 454 /* 480 455 * We didn't write back all the pages. nfs_writepages() 481 456 * sometimes bales out without doing anything. Redirty ··· 507 462 * soon as the queue becomes uncongested. 508 463 */ 509 464 inode->i_state |= I_DIRTY_PAGES; 465 + select_queue: 510 466 if (wbc->nr_to_write <= 0) { 511 467 /* 512 468 * slice used up: queue for next turn ··· 530 484 inode->i_state |= I_DIRTY_PAGES; 531 485 redirty_tail(inode); 532 486 } 533 - } else if (inode->i_state & I_DIRTY) { 534 - /* 535 - * Someone redirtied the inode while were writing back 536 - * the pages. 
537 - */ 538 - redirty_tail(inode); 539 487 } else if (atomic_read(&inode->i_count)) { 540 488 /* 541 489 * The inode is clean, inuse ··· 546 506 return ret; 547 507 } 548 508 509 + static void unpin_sb_for_writeback(struct super_block **psb) 510 + { 511 + struct super_block *sb = *psb; 512 + 513 + if (sb) { 514 + up_read(&sb->s_umount); 515 + put_super(sb); 516 + *psb = NULL; 517 + } 518 + } 519 + 549 520 /* 550 521 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 551 522 * before calling writeback. So make sure that we do pin it, so it doesn't ··· 566 515 * 1 if we failed. 567 516 */ 568 517 static int pin_sb_for_writeback(struct writeback_control *wbc, 569 - struct inode *inode) 518 + struct inode *inode, struct super_block **psb) 570 519 { 571 520 struct super_block *sb = inode->i_sb; 521 + 522 + /* 523 + * If this sb is already pinned, nothing more to do. If not and 524 + * *psb is non-NULL, unpin the old one first 525 + */ 526 + if (sb == *psb) 527 + return 0; 528 + else if (*psb) 529 + unpin_sb_for_writeback(psb); 572 530 573 531 /* 574 532 * Caller must already hold the ref for this ··· 592 532 if (down_read_trylock(&sb->s_umount)) { 593 533 if (sb->s_root) { 594 534 spin_unlock(&sb_lock); 595 - return 0; 535 + goto pinned; 596 536 } 597 537 /* 598 538 * umounted, drop rwsem again and fall through to failure ··· 603 543 sb->s_count--; 604 544 spin_unlock(&sb_lock); 605 545 return 1; 606 - } 607 - 608 - static void unpin_sb_for_writeback(struct writeback_control *wbc, 609 - struct inode *inode) 610 - { 611 - struct super_block *sb = inode->i_sb; 612 - 613 - if (wbc->sync_mode == WB_SYNC_ALL) 614 - return; 615 - 616 - up_read(&sb->s_umount); 617 - put_super(sb); 546 + pinned: 547 + *psb = sb; 548 + return 0; 618 549 } 619 550 620 551 static void writeback_inodes_wb(struct bdi_writeback *wb, 621 552 struct writeback_control *wbc) 622 553 { 623 - struct super_block *sb = wbc->sb; 554 + struct super_block *sb = wbc->sb, *pin_sb = NULL; 624 
555 const int is_blkdev_sb = sb_is_blkdev_sb(sb); 625 556 const unsigned long start = jiffies; /* livelock avoidance */ 626 557 ··· 670 619 if (inode_dirtied_after(inode, start)) 671 620 break; 672 621 673 - if (pin_sb_for_writeback(wbc, inode)) { 622 + if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { 674 623 requeue_io(inode); 675 624 continue; 676 625 } ··· 679 628 __iget(inode); 680 629 pages_skipped = wbc->pages_skipped; 681 630 writeback_single_inode(inode, wbc); 682 - unpin_sb_for_writeback(wbc, inode); 683 631 if (wbc->pages_skipped != pages_skipped) { 684 632 /* 685 633 * writeback is not making progress due to locked ··· 697 647 if (!list_empty(&wb->b_more_io)) 698 648 wbc->more_io = 1; 699 649 } 650 + 651 + unpin_sb_for_writeback(&pin_sb); 700 652 701 653 spin_unlock(&inode_lock); 702 654 /* Leave any unwritten inodes on b_io */ ··· 758 706 }; 759 707 unsigned long oldest_jif; 760 708 long wrote = 0; 709 + struct inode *inode; 761 710 762 711 if (wbc.for_kupdate) { 763 712 wbc.older_than_this = &oldest_jif; ··· 772 719 773 720 for (;;) { 774 721 /* 775 - * Don't flush anything for non-integrity writeback where 776 - * no nr_pages was given 722 + * Stop writeback when nr_pages has been consumed 777 723 */ 778 - if (!args->for_kupdate && args->nr_pages <= 0 && 779 - args->sync_mode == WB_SYNC_NONE) 724 + if (args->nr_pages <= 0) 780 725 break; 781 726 782 727 /* 783 - * If no specific pages were given and this is just a 784 - * periodic background writeout and we are below the 785 - * background dirty threshold, don't do anything 728 + * For background writeout, stop when we are below the 729 + * background dirty threshold 786 730 */ 787 - if (args->for_kupdate && args->nr_pages <= 0 && 788 - !over_bground_thresh()) 731 + if (args->for_background && !over_bground_thresh()) 789 732 break; 790 733 791 734 wbc.more_io = 0; ··· 793 744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 794 745 795 746 /* 796 - * If we ran out of stuff to write, bail unless more_io 
got set 747 + * If we consumed everything, see if we have more 797 748 */ 798 - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 799 - if (wbc.more_io && !wbc.for_kupdate) 800 - continue; 749 + if (wbc.nr_to_write <= 0) 750 + continue; 751 + /* 752 + * Didn't write everything and we don't have more IO, bail 753 + */ 754 + if (!wbc.more_io) 801 755 break; 756 + /* 757 + * Did we write something? Try for more 758 + */ 759 + if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 760 + continue; 761 + /* 762 + * Nothing written. Wait for some inode to 763 + * become available for writeback. Otherwise 764 + * we'll just busyloop. 765 + */ 766 + spin_lock(&inode_lock); 767 + if (!list_empty(&wb->b_more_io)) { 768 + inode = list_entry(wb->b_more_io.prev, 769 + struct inode, i_list); 770 + inode_wait_for_writeback(inode); 802 771 } 772 + spin_unlock(&inode_lock); 803 773 } 804 774 805 775 return wrote; ··· 1128 1060 * If older_than_this is non-NULL, then only write out inodes which 1129 1061 * had their first dirtying at a time earlier than *older_than_this. 1130 1062 * 1131 - * If we're a pdlfush thread, then implement pdflush collision avoidance 1132 - * against the entire list. 1133 - * 1134 1063 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 1135 1064 * This function assumes that the blockdev superblock's inodes are backed by 1136 1065 * a variety of queues, so all inodes are searched. For other superblocks, ··· 1206 1141 nr_to_write = nr_dirty + nr_unstable + 1207 1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1208 1143 1209 - bdi_writeback_all(sb, nr_to_write); 1144 + bdi_start_writeback(sb->s_bdi, nr_to_write); 1210 1145 } 1211 1146 EXPORT_SYMBOL(writeback_inodes_sb); 1212 1147
+17 -13
mm/page-writeback.c
··· 44 44 /* 45 45 * When balance_dirty_pages decides that the caller needs to perform some 46 46 * non-background writeback, this is how many pages it will attempt to write. 47 - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably 47 + * It should be somewhat larger than dirtied pages to ensure that reasonably 48 48 * large amounts of I/O are submitted. 49 49 */ 50 - static inline long sync_writeback_pages(void) 50 + static inline long sync_writeback_pages(unsigned long dirtied) 51 51 { 52 - return ratelimit_pages + ratelimit_pages / 2; 52 + if (dirtied < ratelimit_pages) 53 + dirtied = ratelimit_pages; 54 + 55 + return dirtied + dirtied / 2; 53 56 } 54 57 55 58 /* The following parameters are exported via /proc/sys/vm */ 56 59 57 60 /* 58 - * Start background writeback (via pdflush) at this percentage 61 + * Start background writeback (via writeback threads) at this percentage 59 62 */ 60 63 int dirty_background_ratio = 10; 61 64 ··· 477 474 * balance_dirty_pages() must be called by processes which are generating dirty 478 475 * data. It looks at the number of dirty pages in the machine and will force 479 476 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 480 - * If we're over `background_thresh' then pdflush is woken to perform some 481 - * writeout. 477 + * If we're over `background_thresh' then the writeback threads are woken to 478 + * perform some writeout. 
482 479 */ 483 - static void balance_dirty_pages(struct address_space *mapping) 480 + static void balance_dirty_pages(struct address_space *mapping, 481 + unsigned long write_chunk) 484 482 { 485 483 long nr_reclaimable, bdi_nr_reclaimable; 486 484 long nr_writeback, bdi_nr_writeback; ··· 489 485 unsigned long dirty_thresh; 490 486 unsigned long bdi_thresh; 491 487 unsigned long pages_written = 0; 492 - unsigned long write_chunk = sync_writeback_pages(); 493 488 unsigned long pause = 1; 494 489 495 490 struct backing_dev_info *bdi = mapping->backing_dev_info; ··· 582 579 bdi->dirty_exceeded = 0; 583 580 584 581 if (writeback_in_progress(bdi)) 585 - return; /* pdflush is already working this queue */ 582 + return; 586 583 587 584 /* 588 585 * In laptop mode, we wait until hitting the higher threshold before ··· 593 590 * background_thresh, to keep the amount of dirty memory low. 594 591 */ 595 592 if ((laptop_mode && pages_written) || 596 - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) 597 - + global_page_state(NR_UNSTABLE_NFS)) 593 + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 594 + + global_page_state(NR_UNSTABLE_NFS)) 598 595 > background_thresh))) 599 - bdi_start_writeback(bdi, nr_writeback); 596 + bdi_start_writeback(bdi, 0); 600 597 } 601 598 602 599 void set_page_dirty_balance(struct page *page, int page_mkwrite) ··· 643 640 p = &__get_cpu_var(bdp_ratelimits); 644 641 *p += nr_pages_dirtied; 645 642 if (unlikely(*p >= ratelimit)) { 643 + ratelimit = sync_writeback_pages(*p); 646 644 *p = 0; 647 645 preempt_enable(); 648 - balance_dirty_pages(mapping); 646 + balance_dirty_pages(mapping, ratelimit); 649 647 return; 650 648 } 651 649 preempt_enable();
+3 -2
mm/shmem.c
··· 1046 1046 * sync from ever calling shmem_writepage; but a stacking filesystem 1047 1047 * may use the ->writepage of its underlying filesystem, in which case 1048 1048 * tmpfs should write out to swap only in response to memory pressure, 1049 - * and not for pdflush or sync. However, in those cases, we do still 1050 - * want to check if there's a redundant swappage to be discarded. 1049 + * and not for the writeback threads or sync. However, in those cases, 1050 + * we do still want to check if there's a redundant swappage to be 1051 + * discarded. 1051 1052 */ 1052 1053 if (wbc->for_reclaim) 1053 1054 swap = get_swap_page();
+4 -4
mm/vmscan.c
··· 1709 1709 * 1710 1710 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1711 1711 * high - the zone may be full of dirty or under-writeback pages, which this 1712 - * caller can't do much about. We kick pdflush and take explicit naps in the 1713 - * hope that some of these pages can be written. But if the allocating task 1714 - * holds filesystem locks which prevent writeout this might not work, and the 1715 - * allocation attempt will fail. 1712 + * caller can't do much about. We kick the writeback threads and take explicit 1713 + * naps in the hope that some of these pages can be written. But if the 1714 + * allocating task holds filesystem locks which prevent writeout this might not 1715 + * work, and the allocation attempt will fail. 1716 1716 * 1717 1717 * returns: 0, if no pages reclaimed 1718 1718 * else, the number of pages reclaimed