Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched, cgroup: Use exit hook to avoid use-after-free crash
  sched: Fix signed unsigned comparison in check_preempt_tick()
  sched: Replace rq->bkl_count with rq->rq_sched_info.bkl_count
  sched, autogroup: Fix CONFIG_RT_GROUP_SCHED sched_setscheduler() failure
  sched: Display autogroup names in /proc/sched_debug
  sched: Reinstate group names in /proc/sched_debug
  sched: Update effective_load() to use global share weights

 5 files changed, 117 insertions(+), 22 deletions(-)
kernel/sched.c (+21 -5)
@@ -553,9 +553,6 @@
         /* try_to_wake_up() stats */
         unsigned int ttwu_count;
         unsigned int ttwu_local;
-
-        /* BKL stats */
-        unsigned int bkl_count;
 #endif
 };
 
@@ -605,6 +608,9 @@
 {
         struct task_group *tg;
         struct cgroup_subsys_state *css;
+
+        if (p->flags & PF_EXITING)
+                return &root_task_group;
 
         css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                         lockdep_is_held(&task_rq(p)->lock));
@@ -3887,7 +3887,7 @@
         schedstat_inc(this_rq(), sched_count);
 #ifdef CONFIG_SCHEDSTATS
         if (unlikely(prev->lock_depth >= 0)) {
-                schedstat_inc(this_rq(), bkl_count);
+                schedstat_inc(this_rq(), rq_sched_info.bkl_count);
                 schedstat_inc(prev, sched_info.bkl_count);
         }
 #endif
@@ -4871,7 +4871,8 @@
          * assigned.
          */
         if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                        task_group(p)->rt_bandwidth.rt_runtime == 0) {
+                        task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+                        !task_group_is_autogroup(task_group(p))) {
                 __task_rq_unlock(rq);
                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                 return -EPERM;
@@ -8883,6 +8882,20 @@
         }
 }
 
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+        /*
+         * cgroup_exit() is called in the copy_process() failure path.
+         * Ignore this case since the task hasn't ran yet, this avoids
+         * trying to poke a half freed task state from generic code.
+         */
+        if (!(task->flags & PF_EXITING))
+                return;
+
+        sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
                                 u64 shareval)
@@ -8969,6 +8954,7 @@
         .destroy = cpu_cgroup_destroy,
         .can_attach = cpu_cgroup_can_attach,
         .attach = cpu_cgroup_attach,
+        .exit = cpu_cgroup_exit,
         .populate = cpu_cgroup_populate,
         .subsys_id = cpu_cgroup_subsys_id,
         .early_init = 1,
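The two cooperating pieces above are the PF_EXITING guard in task_group() and the new cpu_cgroup_exit() callback: exiting tasks are parked on the root task group before their cgroup's scheduler state can be torn down under them. The following is only a userspace sketch of that pattern with simplified structs and hypothetical names (toy_task, lookup_group, group_exit_hook); it is not kernel code, just an illustration of the control flow the patch adds.

/*
 * Userspace sketch of the "park exiting tasks on the root group" pattern.
 * All names are invented; only the control flow mirrors the patch above.
 */
#include <stdio.h>

#define PF_EXITING 0x4  /* stand-in for the kernel flag; value irrelevant here */

struct toy_group { const char *name; };
struct toy_task  { unsigned int flags; struct toy_group *group; };

static struct toy_group root_group = { "root" };

/* Mirrors task_group(): never hand out a possibly-dying group for an exiting task. */
static struct toy_group *lookup_group(struct toy_task *t)
{
        if (t->flags & PF_EXITING)
                return &root_group;
        return t->group;
}

/* Mirrors cpu_cgroup_exit()/sched_move_task(): reparent before the group dies. */
static void group_exit_hook(struct toy_task *t)
{
        if (!(t->flags & PF_EXITING))
                return;         /* copy_process() failure path: task never ran */
        t->group = &root_group;
}

int main(void)
{
        struct toy_group doomed = { "doomed" };
        struct toy_task t = { 0, &doomed };

        t.flags |= PF_EXITING;          /* task starts exiting     */
        group_exit_hook(&t);            /* exit hook reparents it  */
        printf("charged to: %s\n", lookup_group(&t)->name);    /* -> root */
        return 0;
}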
kernel/sched_autogroup.c (+32)
@@ -27,6 +27,11 @@
 {
         struct autogroup *ag = container_of(kref, struct autogroup, kref);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+        /* We've redirected RT tasks to the root task group... */
+        ag->tg->rt_se = NULL;
+        ag->tg->rt_rq = NULL;
+#endif
         sched_destroy_group(ag->tg);
 }
 
@@ -60,6 +55,10 @@
         return ag;
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg);
+#endif
+
 static inline struct autogroup *autogroup_create(void)
 {
         struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -81,6 +72,19 @@
         init_rwsem(&ag->lock);
         ag->id = atomic_inc_return(&autogroup_seq_nr);
         ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+        /*
+         * Autogroup RT tasks are redirected to the root task group
+         * so we don't have to move tasks around upon policy change,
+         * or flail around trying to allocate bandwidth on the fly.
+         * A bandwidth exception in __sched_setscheduler() allows
+         * the policy change to proceed. Thereafter, task_group()
+         * returns &root_task_group, so zero bandwidth is required.
+         */
+        free_rt_sched_group(tg);
+        tg->rt_se = root_task_group.rt_se;
+        tg->rt_rq = root_task_group.rt_rq;
+#endif
         tg->autogroup = ag;
 
         return ag;
@@ -126,6 +104,11 @@
                 return false;
 
         return true;
+}
+
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+        return tg != &root_task_group && tg->autogroup;
 }
 
 static inline struct task_group *
@@ -258,6 +231,11 @@
 #ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
+        int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+        if (!enabled || !tg->autogroup)
+                return 0;
+
         return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
 #endif /* CONFIG_SCHED_DEBUG */
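The RT portion of the autogroup fix works by aliasing: autogroup_create() frees the group's own RT state and points tg->rt_se/tg->rt_rq at the root task group's arrays, so autogroup_destroy() must clear those pointers again before sched_destroy_group() or the shared structures would be freed twice. A minimal userspace model of that aliasing hazard follows; all names are hypothetical and plain malloc/free stands in for the scheduler allocators.

/*
 * Userspace model of the rt_rq aliasing done by autogroup_create() and
 * undone by autogroup_destroy().  The point: borrowed pointers must be
 * NULLed before the generic free path runs.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_tg { int *rt_rq; };

static struct toy_tg root_tg;

static void free_rt_group(struct toy_tg *tg)
{
        free(tg->rt_rq);        /* would double-free root's array if still aliased */
        tg->rt_rq = NULL;
}

int main(void)
{
        struct toy_tg auto_tg = { 0 };

        root_tg.rt_rq = calloc(4, sizeof(int));

        /* autogroup_create(): drop our own RT state, borrow root's */
        auto_tg.rt_rq = root_tg.rt_rq;

        /* autogroup_destroy(): un-alias *before* the generic teardown */
        auto_tg.rt_rq = NULL;
        free_rt_group(&auto_tg);        /* now a harmless free(NULL) */

        free_rt_group(&root_tg);        /* root frees its own array exactly once */
        printf("no double free\n");
        return 0;
}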
kernel/sched_autogroup.h (+4)
@@ -15,6 +15,10 @@
 
 static inline void autogroup_init(struct task_struct *init_task) { }
 static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+        return 0;
+}
 
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
kernel/sched_debug.c (+41 -1)
@@ -16,6 +16,8 @@
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
 
+static DEFINE_SPINLOCK(sched_debug_lock);
+
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -88,6 +86,26 @@
 }
 #endif
 
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+        if (autogroup_path(tg, group_path, PATH_MAX))
+                return group_path;
+
+        /*
+         * May be NULL if the underlying cgroup isn't fully-created yet
+         */
+        if (!tg->css.cgroup) {
+                group_path[0] = '\0';
+                return group_path;
+        }
+        cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+        return group_path;
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -129,6 +107,9 @@
 #else
         SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
                 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+        SEQ_printf(m, " %s", task_group_path(task_group(p)));
 #endif
 
         SEQ_printf(m, "\n");
@@ -169,6 +144,10 @@
         struct sched_entity *last;
         unsigned long flags;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+        SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
         SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                         SPLIT_NS(cfs_rq->exec_clock));
@@ -220,7 +191,11 @@
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
+        SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
         SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
 
 #define P(x) \
         SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -245,6 +212,7 @@
 static void print_cpu(struct seq_file *m, int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
+        unsigned long flags;
 
 #ifdef CONFIG_X86
         {
@@ -296,14 +262,20 @@
         P(ttwu_count);
         P(ttwu_local);
 
-        P(bkl_count);
+        SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
+                        rq->rq_sched_info.bkl_count);
 
 #undef P
+#undef P64
 #endif
+        spin_lock_irqsave(&sched_debug_lock, flags);
         print_cfs_stats(m, cpu);
         print_rt_stats(m, cpu);
 
+        rcu_read_lock();
         print_rq(m, rq, cpu);
+        rcu_read_unlock();
+        spin_unlock_irqrestore(&sched_debug_lock, flags);
 }
 
 static const char *sched_tunable_scaling_names[] = {
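task_group_path() and autogroup_path() both format into a single static PATH_MAX buffer, which is why print_cpu() now serializes the whole dump under the new sched_debug_lock. The small program below only demonstrates the resulting strings (the "/autogroup-<id>" names and "cfs_rq[<cpu>]:<path>" headers that /proc/sched_debug will show); the autogroup id and cpu number are made up for the example.

/*
 * Demonstration of the strings the patch adds to /proc/sched_debug output.
 * Formats copied from autogroup_path() and print_cfs_rq(); the id (42) and
 * cpu number (0) are invented for illustration.
 */
#include <stdio.h>
#include <limits.h>

#ifndef PATH_MAX
#define PATH_MAX 4096
#endif

static char group_path[PATH_MAX];       /* shared buffer, hence sched_debug_lock */

int main(void)
{
        long autogroup_id = 42;
        int cpu = 0;

        snprintf(group_path, PATH_MAX, "%s-%ld", "/autogroup", autogroup_id);
        printf("\ncfs_rq[%d]:%s\n", cpu, group_path);   /* -> cfs_rq[0]:/autogroup-42 */
        return 0;
}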
kernel/sched_fair.c (+19 -16)
@@ -1062,6 +1062,9 @@
         struct sched_entity *se = __pick_next_entity(cfs_rq);
         s64 delta = curr->vruntime - se->vruntime;
 
+        if (delta < 0)
+                return;
+
         if (delta > ideal_runtime)
                 resched_task(rq_of(cfs_rq)->curr);
 }
@@ -1365,27 +1362,27 @@
                 return wl;
 
         for_each_sched_entity(se) {
-                long S, rw, s, a, b;
+                long lw, w;
 
-                S = se->my_q->tg->shares;
-                s = se->load.weight;
-                rw = se->my_q->load.weight;
+                tg = se->my_q->tg;
+                w = se->my_q->load.weight;
 
-                a = S*(rw + wl);
-                b = S*rw + s*wg;
+                /* use this cpu's instantaneous contribution */
+                lw = atomic_read(&tg->load_weight);
+                lw -= se->my_q->load_contribution;
+                lw += w + wg;
 
-                wl = s*(a-b);
+                wl += w;
 
-                if (likely(b))
-                        wl /= b;
+                if (lw > 0 && wl < lw)
+                        wl = (wl * tg->shares) / lw;
+                else
+                        wl = tg->shares;
 
-                /*
-                 * Assume the group is already running and will
-                 * thus already be accounted for in the weight.
-                 *
-                 * That is, moving shares between CPUs, does not
-                 * alter the group weight.
-                 */
+                /* zero point is MIN_SHARES */
+                if (wl < MIN_SHARES)
+                        wl = MIN_SHARES;
+                wl -= se->load.weight;
                 wg = 0;
         }
 
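The first hunk bails out of check_preempt_tick() when delta is negative, before the signed s64 is compared against the unsigned ideal_runtime (where a negative value would otherwise look enormous and trigger a spurious resched). The second hunk reworks effective_load() to use global share weights: each iteration rebuilds the cpu-local load (global load_weight minus this cpu's load_contribution, plus the proposed weight delta), scales the group's shares by the cpu's fraction of it, clamps at MIN_SHARES, and reports the change relative to the entity's current weight. Below is a userspace rendering of one iteration of that arithmetic; MIN_SHARES and every input number are invented for illustration, only the arithmetic mirrors the patch.

/*
 * One step of the new effective_load() weight propagation, lifted out of
 * the kernel and fed example numbers.
 */
#include <stdio.h>

#define MIN_SHARES 2    /* simplified stand-in for the kernel constant */

/* Project the cpu-local entity weight after adding 'wl' (task weight delta). */
static long project_weight(long tg_shares,         /* tg->shares               */
                           long tg_load_weight,    /* global load_weight sum   */
                           long load_contribution, /* this cpu's contribution  */
                           long cfs_rq_weight,     /* se->my_q->load.weight    */
                           long se_weight,         /* se->load.weight          */
                           long wl, long wg)
{
        long lw, w = cfs_rq_weight;

        /* use this cpu's instantaneous contribution */
        lw = tg_load_weight - load_contribution + w + wg;
        wl += w;

        if (lw > 0 && wl < lw)
                wl = (wl * tg_shares) / lw;
        else
                wl = tg_shares;

        /* zero point is MIN_SHARES */
        if (wl < MIN_SHARES)
                wl = MIN_SHARES;
        return wl - se_weight;          /* change in the entity's weight */
}

int main(void)
{
        /*
         * A group with 1024 shares spread over 4096 of global load, of which
         * this cpu contributes 1024; waking a task of weight 1024 here while
         * the entity currently carries 256.
         */
        long delta = project_weight(1024, 4096, 1024, 1024, 256, 1024, 0);

        printf("effective weight change: %ld\n", delta);       /* prints 256 */
        return 0;
}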