Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/balancing: Periodically decay max cost of idle balance

This patch builds on patch 2 and periodically decays each sched domain's
max newidle (idle-balance) cost by approximately 1% per second. It also
decays the rq's max_idle_balance_cost value accordingly.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1379096813-3032-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Jason Low and committed by Ingo Molnar.
f48627e6 9bd721c5

+38 -7
+1
arch/metag/include/asm/topology.h
··· 27 27 .balance_interval = 1, \ 28 28 .nr_balance_failed = 0, \ 29 29 .max_newidle_lb_cost = 0, \ 30 + .next_decay_max_lb_cost = jiffies, \ 30 31 } 31 32 32 33 #define cpu_to_node(cpu) ((void)(cpu), 0)
+3
include/linux/sched.h
··· 810 810 unsigned int nr_balance_failed; /* initialise to 0 */ 811 811 812 812 u64 last_update; 813 + 814 + /* idle_balance() stats */ 813 815 u64 max_newidle_lb_cost; 816 + unsigned long next_decay_max_lb_cost; 814 817 815 818 #ifdef CONFIG_SCHEDSTATS 816 819 /* load_balance() stats */
+3
include/linux/topology.h
··· 107 107 .balance_interval = 1, \ 108 108 .smt_gain = 1178, /* 15% */ \ 109 109 .max_newidle_lb_cost = 0, \ 110 + .next_decay_max_lb_cost = jiffies, \ 110 111 } 111 112 #endif 112 113 #endif /* CONFIG_SCHED_SMT */ ··· 138 137 .last_balance = jiffies, \ 139 138 .balance_interval = 1, \ 140 139 .max_newidle_lb_cost = 0, \ 140 + .next_decay_max_lb_cost = jiffies, \ 141 141 } 142 142 #endif 143 143 #endif /* CONFIG_SCHED_MC */ ··· 171 169 .last_balance = jiffies, \ 172 170 .balance_interval = 1, \ 173 171 .max_newidle_lb_cost = 0, \ 172 + .next_decay_max_lb_cost = jiffies, \ 174 173 } 175 174 #endif 176 175
+31 -7
kernel/sched/fair.c
··· 5681 5681 /* Earliest time when we have to do rebalance again */ 5682 5682 unsigned long next_balance = jiffies + 60*HZ; 5683 5683 int update_next_balance = 0; 5684 - int need_serialize; 5684 + int need_serialize, need_decay = 0; 5685 + u64 max_cost = 0; 5685 5686 5686 5687 update_blocked_averages(cpu); 5687 5688 5688 5689 rcu_read_lock(); 5689 5690 for_each_domain(cpu, sd) { 5691 + /* 5692 + * Decay the newidle max times here because this is a regular 5693 + * visit to all the domains. Decay ~1% per second. 5694 + */ 5695 + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { 5696 + sd->max_newidle_lb_cost = 5697 + (sd->max_newidle_lb_cost * 253) / 256; 5698 + sd->next_decay_max_lb_cost = jiffies + HZ; 5699 + need_decay = 1; 5700 + } 5701 + max_cost += sd->max_newidle_lb_cost; 5702 + 5690 5703 if (!(sd->flags & SD_LOAD_BALANCE)) 5691 5704 continue; 5705 + 5706 + /* 5707 + * Stop the load balance at this level. There is another 5708 + * CPU in our sched group which is doing load balancing more 5709 + * actively. 5710 + */ 5711 + if (!continue_balancing) { 5712 + if (need_decay) 5713 + continue; 5714 + break; 5715 + } 5692 5716 5693 5717 interval = sd->balance_interval; 5694 5718 if (idle != CPU_IDLE) ··· 5747 5723 next_balance = sd->last_balance + interval; 5748 5724 update_next_balance = 1; 5749 5725 } 5750 - 5726 + } 5727 + if (need_decay) { 5751 5728 /* 5752 - * Stop the load balance at this level. There is another 5753 - * CPU in our sched group which is doing load balancing more 5754 - * actively. 5729 + * Ensure the rq-wide value also decays but keep it at a 5730 + * reasonable floor to avoid funnies with rq->avg_idle. 5755 5731 */ 5756 - if (!continue_balancing) 5757 - break; 5732 + rq->max_idle_balance_cost = 5733 + max((u64)sysctl_sched_migration_cost, max_cost); 5758 5734 } 5759 5735 rcu_read_unlock(); 5760 5736