sched: Use group weight, idle cpu metrics to fix imbalances during idle

Currently we consider a sched domain to be well balanced when the imbalance
is less than the domain's imbalance_pct. As the number of cores and threads
is increasing, current values of imbalance_pct (for example 25% for a
NUMA domain) are not enough to detect imbalances like:

a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads),
24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on another
socket, leading to an idle HT cpu.

b) On a hypothetical 2 socket NHM-EX system (each socket having 8 cores and
16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one
socket and 7 on another socket. This leaves one core in a socket idle
whereas in another socket we have a core having both its HT siblings busy.

While this issue can be fixed by decreasing the domain's imbalance_pct
(by making it a function of number of logical cpus in the domain), it
can potentially cause more task migrations across sched groups in an
overloaded case.

Fix this by using imbalance_pct only during newly_idle and busy
load balancing. During idle load balancing, instead check whether there
is an imbalance in the number of idle cpus between the busiest group and
this sched_group, or whether the busiest group has more tasks than its
weight (extra tasks that the idle cpu in this_group can pull).

Reported-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by Suresh Siddha and committed by Ingo Molnar aae6d3dd f6614b7b

+34 -3
+1
include/linux/sched.h
··· 862 862 * single CPU. 863 863 */ 864 864 unsigned int cpu_power, cpu_power_orig; 865 + unsigned int group_weight; 865 866 866 867 /* 867 868 * The CPUs this group covers.
+2
kernel/sched.c
··· 6960 6960 if (cpu != group_first_cpu(sd->groups)) 6961 6961 return; 6962 6962 6963 + sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 6964 + 6963 6965 child = sd->child; 6964 6966 6965 6967 sd->groups->cpu_power = 0;
+31 -3
kernel/sched_fair.c
··· 2035 2035 unsigned long this_load_per_task; 2036 2036 unsigned long this_nr_running; 2037 2037 unsigned long this_has_capacity; 2038 + unsigned int this_idle_cpus; 2038 2039 2039 2040 /* Statistics of the busiest group */ 2041 + unsigned int busiest_idle_cpus; 2040 2042 unsigned long max_load; 2041 2043 unsigned long busiest_load_per_task; 2042 2044 unsigned long busiest_nr_running; 2043 2045 unsigned long busiest_group_capacity; 2044 2046 unsigned long busiest_has_capacity; 2047 + unsigned int busiest_group_weight; 2045 2048 2046 2049 int group_imb; /* Is there imbalance in this sd */ 2047 2050 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ··· 2066 2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2067 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2068 2065 unsigned long group_capacity; 2066 + unsigned long idle_cpus; 2067 + unsigned long group_weight; 2069 2068 int group_imb; /* Is there an imbalance in the group ? */ 2070 2069 int group_has_capacity; /* Is there extra capacity in the group? 
*/ 2071 2070 }; ··· 2436 2431 sgs->group_load += load; 2437 2432 sgs->sum_nr_running += rq->nr_running; 2438 2433 sgs->sum_weighted_load += weighted_cpuload(i); 2439 - 2434 + if (idle_cpu(i)) 2435 + sgs->idle_cpus++; 2440 2436 } 2441 2437 2442 2438 /* ··· 2475 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2476 2470 if (!sgs->group_capacity) 2477 2471 sgs->group_capacity = fix_small_capacity(sd, group); 2472 + sgs->group_weight = group->group_weight; 2478 2473 2479 2474 if (sgs->group_capacity > sgs->sum_nr_running) 2480 2475 sgs->group_has_capacity = 1; ··· 2583 2576 sds->this_nr_running = sgs.sum_nr_running; 2584 2577 sds->this_load_per_task = sgs.sum_weighted_load; 2585 2578 sds->this_has_capacity = sgs.group_has_capacity; 2579 + sds->this_idle_cpus = sgs.idle_cpus; 2586 2580 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2587 2581 sds->max_load = sgs.avg_load; 2588 2582 sds->busiest = sg; 2589 2583 sds->busiest_nr_running = sgs.sum_nr_running; 2584 + sds->busiest_idle_cpus = sgs.idle_cpus; 2590 2585 sds->busiest_group_capacity = sgs.group_capacity; 2591 2586 sds->busiest_load_per_task = sgs.sum_weighted_load; 2592 2587 sds->busiest_has_capacity = sgs.group_has_capacity; 2588 + sds->busiest_group_weight = sgs.group_weight; 2593 2589 sds->group_imb = sgs.group_imb; 2594 2590 } 2595 2591 ··· 2870 2860 if (sds.this_load >= sds.avg_load) 2871 2861 goto out_balanced; 2872 2862 2873 - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2874 - goto out_balanced; 2863 + /* 2864 + * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. 2865 + * And to check for busy balance use !idle_cpu instead of 2866 + * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE 2867 + * even when they are idle. 
2868 + */ 2869 + if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { 2870 + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2871 + goto out_balanced; 2872 + } else { 2873 + /* 2874 + * This cpu is idle. If the busiest group load doesn't 2875 + * have more tasks than the number of available cpu's and 2876 + * there is no imbalance between this and busiest group 2877 + * wrt to idle cpu's, it is balanced. 2878 + */ 2879 + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 2880 + sds.busiest_nr_running <= sds.busiest_group_weight) 2881 + goto out_balanced; 2882 + } 2875 2883 2876 2884 force_balance: 2877 2885 /* Looks like there is an imbalance. Compute it */