Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone

If congestion_wait() is called with no BDI congested, the caller will
sleep for the full timeout, which may be an unnecessary sleep. This
patch adds a wait_iff_congested() that checks congestion and only sleeps
if a BDI is congested; otherwise, it calls cond_resched() to ensure the
caller is not hogging the CPU longer than its quota, but it will not sleep.

This is aimed at reducing some of the major desktop stalls reported during
IO. For example, while kswapd is operating, it calls congestion_wait()
but it could just have been reclaiming clean page cache pages with no
congestion. Without this patch, it would sleep for a full timeout but
after this patch, it'll just call schedule() if it has been on the CPU too
long. Similar logic applies to direct reclaimers that are not making
enough progress.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mel Gorman and committed by
Linus Torvalds
0e093d99 08fc468f

+112 -12
+1 -1
include/linux/backing-dev.h
··· 285 285 void clear_bdi_congested(struct backing_dev_info *bdi, int sync); 286 286 void set_bdi_congested(struct backing_dev_info *bdi, int sync); 287 287 long congestion_wait(int sync, long timeout); 288 - 288 + long wait_iff_congested(struct zone *zone, int sync, long timeout); 289 289 290 290 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) 291 291 {
+8
include/linux/mmzone.h
··· 423 423 typedef enum { 424 424 ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 425 425 ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ 426 + ZONE_CONGESTED, /* zone has many dirty pages backed by 427 + * a congested BDI 428 + */ 426 429 } zone_flags_t; 427 430 428 431 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) ··· 441 438 static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) 442 439 { 443 440 clear_bit(flag, &zone->flags); 441 + } 442 + 443 + static inline int zone_is_reclaim_congested(const struct zone *zone) 444 + { 445 + return test_bit(ZONE_CONGESTED, &zone->flags); 444 446 } 445 447 446 448 static inline int zone_is_reclaim_locked(const struct zone *zone)
+7
include/trace/events/writeback.h
··· 179 179 TP_ARGS(usec_timeout, usec_delayed) 180 180 ); 181 181 182 + DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, 183 + 184 + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), 185 + 186 + TP_ARGS(usec_timeout, usec_delayed) 187 + ); 188 + 182 189 #endif /* _TRACE_WRITEBACK_H */ 183 190 184 191 /* This part must be outside protection */
+59 -2
mm/backing-dev.c
··· 729 729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 730 730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 731 731 }; 732 + static atomic_t nr_bdi_congested[2]; 732 733 733 734 void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 734 735 { ··· 737 736 wait_queue_head_t *wqh = &congestion_wqh[sync]; 738 737 739 738 bit = sync ? BDI_sync_congested : BDI_async_congested; 740 - clear_bit(bit, &bdi->state); 739 + if (test_and_clear_bit(bit, &bdi->state)) 740 + atomic_dec(&nr_bdi_congested[sync]); 741 741 smp_mb__after_clear_bit(); 742 742 if (waitqueue_active(wqh)) 743 743 wake_up(wqh); ··· 750 748 enum bdi_state bit; 751 749 752 750 bit = sync ? BDI_sync_congested : BDI_async_congested; 753 - set_bit(bit, &bdi->state); 751 + if (!test_and_set_bit(bit, &bdi->state)) 752 + atomic_inc(&nr_bdi_congested[sync]); 754 753 } 755 754 EXPORT_SYMBOL(set_bdi_congested); 756 755 ··· 782 779 } 783 780 EXPORT_SYMBOL(congestion_wait); 784 781 782 + /** 783 + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes 784 + * @zone: A zone to check if it is heavily congested 785 + * @sync: SYNC or ASYNC IO 786 + * @timeout: timeout in jiffies 787 + * 788 + * In the event of a congested backing_dev (any backing_dev) and the given 789 + * @zone has experienced recent congestion, this waits for up to @timeout 790 + * jiffies for either a BDI to exit congestion of the given @sync queue 791 + * or a write to complete. 792 + * 793 + * In the absense of zone congestion, cond_resched() is called to yield 794 + * the processor if necessary but otherwise does not sleep. 795 + * 796 + * The return value is 0 if the sleep is for the full timeout. Otherwise, 797 + * it is the number of jiffies that were still remaining when the function 798 + * returned. return_value == timeout implies the function did not sleep. 
799 + */ 800 + long wait_iff_congested(struct zone *zone, int sync, long timeout) 801 + { 802 + long ret; 803 + unsigned long start = jiffies; 804 + DEFINE_WAIT(wait); 805 + wait_queue_head_t *wqh = &congestion_wqh[sync]; 806 + 807 + /* 808 + * If there is no congestion, or heavy congestion is not being 809 + * encountered in the current zone, yield if necessary instead 810 + * of sleeping on the congestion queue 811 + */ 812 + if (atomic_read(&nr_bdi_congested[sync]) == 0 || 813 + !zone_is_reclaim_congested(zone)) { 814 + cond_resched(); 815 + 816 + /* In case we scheduled, work out time remaining */ 817 + ret = timeout - (jiffies - start); 818 + if (ret < 0) 819 + ret = 0; 820 + 821 + goto out; 822 + } 823 + 824 + /* Sleep until uncongested or a write happens */ 825 + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 826 + ret = io_schedule_timeout(timeout); 827 + finish_wait(wqh, &wait); 828 + 829 + out: 830 + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), 831 + jiffies_to_usecs(jiffies - start)); 832 + 833 + return ret; 834 + } 835 + EXPORT_SYMBOL(wait_iff_congested);
+2 -2
mm/page_alloc.c
··· 1907 1907 preferred_zone, migratetype); 1908 1908 1909 1909 if (!page && gfp_mask & __GFP_NOFAIL) 1910 - congestion_wait(BLK_RW_ASYNC, HZ/50); 1910 + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 1911 1911 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1912 1912 1913 1913 return page; ··· 2095 2095 pages_reclaimed += did_some_progress; 2096 2096 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2097 2097 /* Wait for some write requests to complete then retry */ 2098 - congestion_wait(BLK_RW_ASYNC, HZ/50); 2098 + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2099 2099 goto rebalance; 2100 2100 } 2101 2101
+35 -7
mm/vmscan.c
··· 401 401 } 402 402 if (mapping->a_ops->writepage == NULL) 403 403 return PAGE_ACTIVATE; 404 - if (!may_write_to_queue(mapping->backing_dev_info, sc)) { 405 - disable_lumpy_reclaim_mode(sc); 404 + if (!may_write_to_queue(mapping->backing_dev_info, sc)) 406 405 return PAGE_KEEP; 407 - } 408 406 409 407 if (clear_page_dirty_for_io(page)) { 410 408 int res; ··· 679 681 * shrink_page_list() returns the number of reclaimed pages 680 682 */ 681 683 static unsigned long shrink_page_list(struct list_head *page_list, 684 + struct zone *zone, 682 685 struct scan_control *sc) 683 686 { 684 687 LIST_HEAD(ret_pages); 685 688 LIST_HEAD(free_pages); 686 689 int pgactivate = 0; 690 + unsigned long nr_dirty = 0; 691 + unsigned long nr_congested = 0; 687 692 unsigned long nr_reclaimed = 0; 688 693 689 694 cond_resched(); ··· 706 705 goto keep; 707 706 708 707 VM_BUG_ON(PageActive(page)); 708 + VM_BUG_ON(page_zone(page) != zone); 709 709 710 710 sc->nr_scanned++; 711 711 ··· 784 782 } 785 783 786 784 if (PageDirty(page)) { 785 + nr_dirty++; 786 + 787 787 if (references == PAGEREF_RECLAIM_CLEAN) 788 788 goto keep_locked; 789 789 if (!may_enter_fs) ··· 796 792 /* Page is dirty, try to write it out here */ 797 793 switch (pageout(page, mapping, sc)) { 798 794 case PAGE_KEEP: 795 + nr_congested++; 799 796 goto keep_locked; 800 797 case PAGE_ACTIVATE: 801 798 goto activate_locked; ··· 906 901 list_add(&page->lru, &ret_pages); 907 902 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 908 903 } 904 + 905 + /* 906 + * Tag a zone as congested if all the dirty pages encountered were 907 + * backed by a congested BDI. 
In this case, reclaimers should just 908 + * back off and wait for congestion to clear because further reclaim 909 + * will encounter the same problem 910 + */ 911 + if (nr_dirty == nr_congested) 912 + zone_set_flag(zone, ZONE_CONGESTED); 909 913 910 914 free_page_list(&free_pages); 911 915 ··· 1400 1386 1401 1387 spin_unlock_irq(&zone->lru_lock); 1402 1388 1403 - nr_reclaimed = shrink_page_list(&page_list, sc); 1389 + nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1404 1390 1405 1391 /* Check if we should syncronously wait for writeback */ 1406 1392 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1407 1393 set_lumpy_reclaim_mode(priority, sc, true); 1408 - nr_reclaimed += shrink_page_list(&page_list, sc); 1394 + nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1409 1395 } 1410 1396 1411 1397 local_irq_disable(); ··· 1996 1982 1997 1983 /* Take a nap, wait for some writeback to complete */ 1998 1984 if (!sc->hibernation_mode && sc->nr_scanned && 1999 - priority < DEF_PRIORITY - 2) 2000 - congestion_wait(BLK_RW_ASYNC, HZ/10); 1985 + priority < DEF_PRIORITY - 2) { 1986 + struct zone *preferred_zone; 1987 + 1988 + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 1989 + NULL, &preferred_zone); 1990 + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 1991 + } 2001 1992 } 2002 1993 2003 1994 out: ··· 2301 2282 if (!zone_watermark_ok(zone, order, 2302 2283 min_wmark_pages(zone), end_zone, 0)) 2303 2284 has_under_min_watermark_zone = 1; 2285 + } else { 2286 + /* 2287 + * If a zone reaches its high watermark, 2288 + * consider it to be no longer congested. It's 2289 + * possible there are dirty pages backed by 2290 + * congested BDIs but as pressure is relieved, 2291 + * spectulatively avoid congestion waits 2292 + */ 2293 + zone_clear_flag(zone, ZONE_CONGESTED); 2304 2294 } 2305 2295 2306 2296 }