Merge tag 'sched_urgent_for_v5.17_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fix from Borislav Petkov:
"Fix task exposure order when forking tasks"

* tag 'sched_urgent_for_v5.17_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Fix yet more sched_fork() races

Changed files: +35 -16

include/linux/sched/task.h (+2 -2)
···
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
-                            struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);

kernel/fork.c (+12 -1)
···
                 goto bad_fork_put_pidfd;
 
         /*
+         * Now that the cgroups are pinned, re-clone the parent cgroup and put
+         * the new task on the correct runqueue. All this *before* the task
+         * becomes visible.
+         *
+         * This isn't part of ->can_fork() because while the re-cloning is
+         * cgroup specific, it unconditionally needs to place the task on a
+         * runqueue.
+         */
+        sched_cgroup_fork(p, args);
+
+        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
···
        fd_install(pidfd, pidfile);
 
        proc_fork_connector(p);
-       sched_post_fork(p, args);
+       sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);
 
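
The comment introduced in the hunk above states the invariant being restored: the child's scheduler state (its task group and runqueue placement) must be fully set up before the task becomes visible to the rest of the system, so nothing can race against a half-initialized task. As a loose userspace analogy only, not kernel code, and with every name below invented for illustration, the same "initialize fully, then publish" pattern looks like this:

        /*
         * Userspace analogy of the ordering fixed here (build with: cc -std=c11 -pthread).
         * The observer thread stands in for a scheduler syscall that looks the new
         * task up once it is reachable; the atomic pointer stands in for the pid hash.
         */
        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdio.h>

        struct task {
                int prio;       /* stands in for the task's scheduling state */
                int group;      /* stands in for its task-group placement */
        };

        /* Once non-NULL, other threads may find and act on the task. */
        static _Atomic(struct task *) visible_task;

        static void *observer(void *arg)
        {
                struct task *t;

                (void)arg;
                /* Spin until the "task" shows up in the shared lookup pointer. */
                while (!(t = atomic_load_explicit(&visible_task, memory_order_acquire)))
                        ;
                printf("observed prio=%d group=%d\n", t->prio, t->group);
                return NULL;
        }

        int main(void)
        {
                static struct task child;       /* zero-initialized, i.e. "half set up" */
                pthread_t thr;

                pthread_create(&thr, NULL, observer, NULL);

                /* Fully initialize the scheduling state first ... */
                child.prio = 120;
                child.group = 1;

                /* ... and only then publish it; observers never see it half set up. */
                atomic_store_explicit(&visible_task, &child, memory_order_release);

                pthread_join(thr, NULL);
                return 0;
        }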

kernel/sched/core.c (+21 -13)
···
 }
 #endif
 
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
 {
-        bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
         int prio = p->static_prio - MAX_RT_PRIO;
         struct load_weight *load = &p->se.load;
 
···
                         p->static_prio = NICE_TO_PRIO(0);
 
                 p->prio = p->normal_prio = p->static_prio;
-                set_load_weight(p);
+                set_load_weight(p, false);
 
                 /*
                  * We don't need the reset flag anymore after the fork. It has
···
 
         init_entity_runnable_average(&p->se);
 
+
 #ifdef CONFIG_SCHED_INFO
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
···
         return 0;
 }
 
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 {
         unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
-        struct task_group *tg;
-#endif
 
+        /*
+         * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+         * required yet, but lockdep gets upset if rules are violated.
+         */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_CGROUP_SCHED
-        tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
-                          struct task_group, css);
-        p->sched_task_group = autogroup_task_group(p, tg);
+        if (1) {
+                struct task_group *tg;
+                tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+                                  struct task_group, css);
+                tg = autogroup_task_group(p, tg);
+                p->sched_task_group = tg;
+        }
 #endif
         rseq_migrate(p);
         /*
···
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
 
+void sched_post_fork(struct task_struct *p)
+{
         uclamp_post_fork(p);
 }
···
                 put_prev_task(rq, p);
 
         p->static_prio = NICE_TO_PRIO(nice);
-        set_load_weight(p);
+        set_load_weight(p, true);
         old_prio = p->prio;
         p->prio = effective_prio(p);
 
···
          */
         p->rt_priority = attr->sched_priority;
         p->normal_prio = normal_prio(p);
-        set_load_weight(p);
+        set_load_weight(p, true);
 }
 
 /*
···
 #endif
         }
 
-        set_load_weight(&init_task);
+        set_load_weight(&init_task, false);
 
         /*
          * The boot idle thread does lazy MMU switching as well:
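
Taken together, these hunks split the old sched_post_fork() in two: sched_cgroup_fork(), called from copy_process() while the task is still private, and a slimmed-down sched_post_fork() that only runs the uclamp step afterwards. The stub program below is a compile-and-run sketch of that ordering only; every kernel type and body is stubbed out, and the placement of sched_fork() earlier in copy_process() is an assumption taken from mainline rather than from these hunks.

        #include <stdio.h>

        /* Stubbed stand-ins for the kernel types; nothing here is real kernel code. */
        struct task_struct { int stub; };
        struct kernel_clone_args { int stub; };

        /* Early, cgroup-independent scheduler setup (assumed to run earlier in copy_process()). */
        static int sched_fork(unsigned long clone_flags, struct task_struct *p)
        {
                (void)clone_flags; (void)p;
                return 0;
        }

        /* New step: pick the task group and runqueue while the task is still private. */
        static void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
        {
                (void)p; (void)kargs;
        }

        /* What remains of sched_post_fork(): only the uclamp step, after publication. */
        static void sched_post_fork(struct task_struct *p)
        {
                (void)p;
        }

        int main(void)
        {
                struct task_struct p = { 0 };
                struct kernel_clone_args args = { 0 };

                sched_fork(0, &p);              /* no cgroup knowledge yet */
                sched_cgroup_fork(&p, &args);   /* after the cgroups are pinned, before visibility */
                puts("-- task becomes visible (pid hash, tasklist) here --");
                sched_post_fork(&p);            /* runs once the task is already reachable */
                return 0;
        }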