Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched_ext-for-6.19-rc8-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fix from Tejun Heo:

- Fix race where sched_class operations (sched_setscheduler() and
friends) could be invoked on dead tasks after sched_ext_dead()
already ran, causing invalid SCX task state transitions and NULL
pointer dereferences.

This was a regression from the cgroup exit ordering fix which
moved sched_ext_free() to finish_task_switch().

* tag 'sched_ext-for-6.19-rc8-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
sched_ext: Short-circuit sched_class operations on dead tasks

 kernel/sched/ext.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -194,6 +194,7 @@
 #include <trace/events/sched_ext.h>
 
 static void process_ddsp_deferred_locals(struct rq *rq);
+static bool task_dead_and_done(struct task_struct *p);
 static u32 reenq_local(struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -2620,6 +2621,9 @@
 
 	set_cpus_allowed_common(p, ac);
 
+	if (task_dead_and_done(p))
+		return;
+
 	/*
 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3034,10 +3038,41 @@
 	percpu_up_read(&scx_fork_rwsem);
 }
 
+/**
+ * task_dead_and_done - Is a task dead and done running?
+ * @p: target task
+ *
+ * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
+ * task no longer exists from SCX's POV. However, certain sched_class ops may be
+ * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
+ * may try to switch a task which finished sched_ext_dead() back into SCX
+ * triggering invalid SCX task state transitions and worse.
+ *
+ * Once a task has finished the final switch, sched_ext_dead() is the only thing
+ * that needs to happen on the task. Use this test to short-circuit sched_class
+ * operations which may be called on dead tasks.
+ */
+static bool task_dead_and_done(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
+	 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
+	 * won't ever run again.
+	 */
+	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
+		!task_on_cpu(rq, p);
+}
+
 void sched_ext_dead(struct task_struct *p)
 {
 	unsigned long flags;
 
+	/*
+	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
+	 * for the last time and then dropped the rq lock - task_dead_and_done()
+	 * should be returning %true nullifying the straggling sched_class ops.
+	 * Remove from scx_tasks and exit @p.
+	 */
 	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
 	list_del_init(&p->scx.tasks_node);
 	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3063,6 +3098,9 @@
 
 	lockdep_assert_rq_held(task_rq(p));
 
+	if (task_dead_and_done(p))
+		return;
+
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(sch, set_weight))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3077,6 +3115,9 @@
 {
 	struct scx_sched *sch = scx_root;
 
+	if (task_dead_and_done(p))
+		return;
+
 	scx_enable_task(p);
 
 	/*
@@ -3090,6 +3131,9 @@
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
 {
+	if (task_dead_and_done(p))
+		return;
+
 	scx_disable_task(p);
 }