Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
sched: Fix cross-sched-class wakeup preemption
sched: Fix runnable condition for stoptask
sched: Use group weight, idle cpu metrics to fix imbalances during idle

+62 -22
+1
include/linux/sched.h
··· 862 862 * single CPU. 863 863 */ 864 864 unsigned int cpu_power, cpu_power_orig; 865 + unsigned int group_weight; 865 866 866 867 /* 867 868 * The CPUs this group covers.
+28 -11
kernel/sched.c
··· 560 560 561 561 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 562 562 563 - static inline 564 - void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 565 - { 566 - rq->curr->sched_class->check_preempt_curr(rq, p, flags); 567 563 568 - /* 569 - * A queue event has occurred, and we're going to schedule. In 570 - * this case, we can save a useless back to back clock update. 571 - */ 572 - if (test_tsk_need_resched(p)) 573 - rq->skip_clock_update = 1; 574 - } 564 + static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 575 565 576 566 static inline int cpu_of(struct rq *rq) 577 567 { ··· 2106 2116 p->sched_class->switched_to(rq, p, running); 2107 2117 } else 2108 2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2119 + } 2120 + 2121 + static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2122 + { 2123 + const struct sched_class *class; 2124 + 2125 + if (p->sched_class == rq->curr->sched_class) { 2126 + rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2127 + } else { 2128 + for_each_class(class) { 2129 + if (class == rq->curr->sched_class) 2130 + break; 2131 + if (class == p->sched_class) { 2132 + resched_task(rq->curr); 2133 + break; 2134 + } 2135 + } 2136 + } 2137 + 2138 + /* 2139 + * A queue event has occurred, and we're going to schedule. In 2140 + * this case, we can save a useless back to back clock update. 2141 + */ 2142 + if (test_tsk_need_resched(rq->curr)) 2143 + rq->skip_clock_update = 1; 2109 2144 } 2110 2145 2111 2146 #ifdef CONFIG_SMP ··· 6974 6959 6975 6960 if (cpu != group_first_cpu(sd->groups)) 6976 6961 return; 6962 + 6963 + sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 6977 6964 6978 6965 child = sd->child; 6979 6966
+31 -9
kernel/sched_fair.c
··· 1654 1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1655 1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1656 1656 1657 - if (unlikely(rt_prio(p->prio))) 1658 - goto preempt; 1659 - 1660 - if (unlikely(p->sched_class != &fair_sched_class)) 1661 - return; 1662 - 1663 1657 if (unlikely(se == pse)) 1664 1658 return; 1665 1659 ··· 2029 2035 unsigned long this_load_per_task; 2030 2036 unsigned long this_nr_running; 2031 2037 unsigned long this_has_capacity; 2038 + unsigned int this_idle_cpus; 2032 2039 2033 2040 /* Statistics of the busiest group */ 2041 + unsigned int busiest_idle_cpus; 2034 2042 unsigned long max_load; 2035 2043 unsigned long busiest_load_per_task; 2036 2044 unsigned long busiest_nr_running; 2037 2045 unsigned long busiest_group_capacity; 2038 2046 unsigned long busiest_has_capacity; 2047 + unsigned int busiest_group_weight; 2039 2048 2040 2049 int group_imb; /* Is there imbalance in this sd */ 2041 2050 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ··· 2060 2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2061 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2062 2065 unsigned long group_capacity; 2066 + unsigned long idle_cpus; 2067 + unsigned long group_weight; 2063 2068 int group_imb; /* Is there an imbalance in the group ? */ 2064 2069 int group_has_capacity; /* Is there extra capacity in the group? */ 2065 2070 }; ··· 2430 2431 sgs->group_load += load; 2431 2432 sgs->sum_nr_running += rq->nr_running; 2432 2433 sgs->sum_weighted_load += weighted_cpuload(i); 2433 - 2434 + if (idle_cpu(i)) 2435 + sgs->idle_cpus++; 2434 2436 } 2435 2437 2436 2438 /* ··· 2469 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2470 2470 if (!sgs->group_capacity) 2471 2471 sgs->group_capacity = fix_small_capacity(sd, group); 2472 + sgs->group_weight = group->group_weight; 2472 2473 2473 2474 if (sgs->group_capacity > sgs->sum_nr_running) 2474 2475 sgs->group_has_capacity = 1; ··· 2577 2576 sds->this_nr_running = sgs.sum_nr_running; 2578 2577 sds->this_load_per_task = sgs.sum_weighted_load; 2579 2578 sds->this_has_capacity = sgs.group_has_capacity; 2579 + sds->this_idle_cpus = sgs.idle_cpus; 2580 2580 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2581 2581 sds->max_load = sgs.avg_load; 2582 2582 sds->busiest = sg; 2583 2583 sds->busiest_nr_running = sgs.sum_nr_running; 2584 + sds->busiest_idle_cpus = sgs.idle_cpus; 2584 2585 sds->busiest_group_capacity = sgs.group_capacity; 2585 2586 sds->busiest_load_per_task = sgs.sum_weighted_load; 2586 2587 sds->busiest_has_capacity = sgs.group_has_capacity; 2588 + sds->busiest_group_weight = sgs.group_weight; 2587 2589 sds->group_imb = sgs.group_imb; 2588 2590 } 2589 2591 ··· 2864 2860 if (sds.this_load >= sds.avg_load) 2865 2861 goto out_balanced; 2866 2862 2867 - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2868 - goto out_balanced; 2863 + /* 2864 + * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. 2865 + * And to check for busy balance use !idle_cpu instead of 2866 + * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE 2867 + * even when they are idle. 2868 + */ 2869 + if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { 2870 + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2871 + goto out_balanced; 2872 + } else { 2873 + /* 2874 + * This cpu is idle. If the busiest group load doesn't 2875 + * have more tasks than the number of available cpu's and 2876 + * there is no imbalance between this and busiest group 2877 + * wrt to idle cpu's, it is balanced. 2878 + */ 2879 + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 2880 + sds.busiest_nr_running <= sds.busiest_group_weight) 2881 + goto out_balanced; 2882 + } 2869 2883 2870 2884 force_balance: 2871 2885 /* Looks like there is an imbalance. Compute it */
+2 -2
kernel/sched_stoptask.c
··· 19 19 static void 20 20 check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 21 21 { 22 - resched_task(rq->curr); /* we preempt everything */ 22 + /* we're never preempted */ 23 23 } 24 24 25 25 static struct task_struct *pick_next_task_stop(struct rq *rq) 26 26 { 27 27 struct task_struct *stop = rq->stop; 28 28 29 - if (stop && stop->state == TASK_RUNNING) 29 + if (stop && stop->se.on_rq) 30 30 return stop; 31 31 32 32 return NULL;