Merge tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

- Plug a race between pick_next_task_fair() and try_to_wake_up() where
both end up writing to the same task. Each path holds a runqueue
lock, but the locks belong to different runqueues, so they do not
serialize the two paths.

The problem is that the store to task::on_rq in __block_task() becomes
visible to try_to_wake_up() before the blocking side is finished with
the task. try_to_wake_up() concludes that the task is no longer
queued and wakes and migrates it, so both sides end up operating on
the same task concurrently.

Cure it by rearranging __block_task() so that the store to
task::on_rq is the last operation on the task, and by making it a
release store so that all earlier updates are visible to the waker.
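
The ordering requirement can be sketched in standalone C11. This is a
simplified illustration with invented names (struct task,
block_task_sketch(), try_wake_sketch()), not the kernel code; the real
fix uses smp_store_release(), with try_to_wake_up() providing the
ACQUIRE via a control dependency (see the sched.h hunk below):

  #include <stdatomic.h>
  #include <stdbool.h>

  struct task {
          atomic_int on_rq;       /* 1: queued, 0: blocked */
          int stats;              /* stand-in for other per-task state */
  };

  /* Blocking side: finish every write to the task, then publish. */
  void block_task_sketch(struct task *p)
  {
          p->stats++;             /* last "private" update */
          /* Release: no store above may be reordered past this one. */
          atomic_store_explicit(&p->on_rq, 0, memory_order_release);
          /* Another CPU may own the task now; do not touch *p again. */
  }

  /* Wakeup side: only touch the task once the blocking side is done. */
  bool try_wake_sketch(struct task *p)
  {
          /* Acquire pairs with the release store above. */
          if (atomic_load_explicit(&p->on_rq, memory_order_acquire))
                  return false;   /* still queued, nothing to do here */
          p->stats++;             /* safe: all earlier updates are visible */
          return true;
  }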

- Prevent a potential NULL pointer dereference in task_numa_work()

task_numa_work() iterates over the VMAs of a process. A concurrent
unmap of the address space can make vma_next() return a NULL pointer,
which was not checked before being dereferenced.

Add the missing NULL pointer check to prevent this.
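
The hazard is generic: a do/while loop touches the element before
checking it. A simplified standalone sketch (invented list types, not
the mm/VMA API) of the broken and fixed shapes:

  struct node { struct node *next; int pages; };

  /* May return NULL, e.g. when the list is emptied concurrently. */
  struct node *next_node(struct node **cursor)
  {
          struct node *n = *cursor;

          if (n)
                  *cursor = n->next;
          return n;
  }

  int count_pages(struct node *cursor)
  {
          int sum = 0;

          /*
           * The broken shape, do { sum += n->pages; ... } while (n),
           * dereferences n before any check and crashes when next_node()
           * returns NULL on the first call.  Check before every use:
           */
          for (struct node *n = next_node(&cursor); n; n = next_node(&cursor))
                  sum += n->pages;

          return sum;
  }

The fair.c hunk below makes the same change, turning the do/while over
VMAs into for (; vma; vma = vma_next(&vmi)).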

- Operate on the correct scheduler policy in task_should_scx()

task_should_scx() returns true when a task should be handled by sched
EXT. It checks the task's scheduling policy.

This fails when the check is done before the new policy has been
stored in the task, because it then evaluates the old policy.

Cure it by handing the policy into task_should_scx() so it operates
on the requested value.
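
A condensed standalone sketch of the pitfall (invented names; only
SCHED_EXT mirrors the real policy constant, the helpers are not the
kernel ones):

  #include <stdbool.h>

  #define SCHED_EXT 7     /* stand-in for the uapi policy value */

  struct task { int policy; };

  /* Mirrors the new task_should_scx(int policy) shape. */
  bool wants_ext_class(int policy)
  {
          return policy == SCHED_EXT;
  }

  void change_policy(struct task *p, int requested)
  {
          /*
           * Buggy form: wants_ext_class(p->policy) would evaluate the
           * old, still-unchanged policy and pick the wrong class.
           * Fixed form: decide from the requested value.
           */
          bool use_ext = wants_ext_class(requested);

          p->policy = requested;  /* the task is only updated afterwards */
          (void)use_ext;          /* class selection would use this */
  }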

- Add the missing handling of sched EXT in the delayed dequeue
mechanism. This was simply forgotten.
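
The pattern, condensed from the kernel/sched/ext.c hunks below (an
excerpt of kernel code rather than a standalone example): a task with
a pending delayed dequeue must be dequeued before it is moved to
another scheduling class.

  const struct sched_class *new_class =
          __setscheduler_class(p->policy, p->prio);

  /* Complete a pending delayed dequeue before switching classes. */
  if (old_class != new_class && p->se.sched_delayed)
          dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);

  p->sched_class = new_class;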

* tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/ext: Fix scx vs sched_delayed
sched: Pass correct scheduling policy to __setscheduler_class
sched/numa: Fix the potential null pointer dereference in task_numa_work()
sched: Fix pick_next_task_fair() vs try_to_wake_up() race

6 files changed, 70 insertions(+), 23 deletions(-)

kernel/sched/core.c (+4 -4)

···
         if (rt_prio(p->prio)) {
                 p->sched_class = &rt_sched_class;
 #ifdef CONFIG_SCHED_CLASS_EXT
-        } else if (task_should_scx(p)) {
+        } else if (task_should_scx(p->policy)) {
                 p->sched_class = &ext_sched_class;
 #endif
         } else {
···
 }
 EXPORT_SYMBOL(default_wake_function);
 
-const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int policy, int prio)
 {
         if (dl_prio(prio))
                 return &dl_sched_class;
···
                 return &rt_sched_class;
 
 #ifdef CONFIG_SCHED_CLASS_EXT
-        if (task_should_scx(p))
+        if (task_should_scx(policy))
                 return &ext_sched_class;
 #endif
 
···
                 queue_flag &= ~DEQUEUE_MOVE;
 
         prev_class = p->sched_class;
-        next_class = __setscheduler_class(p, prio);
+        next_class = __setscheduler_class(p->policy, prio);
 
         if (prev_class != next_class && p->se.sched_delayed)
                 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

kernel/sched/ext.c (+14 -4)

···
  * Used by sched_fork() and __setscheduler_prio() to pick the matching
  * sched_class. dl/rt are already handled.
  */
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
 {
         if (!scx_enabled() ||
             unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
                 return false;
         if (READ_ONCE(scx_switching_all))
                 return true;
-        return p->policy == SCHED_EXT;
+        return policy == SCHED_EXT;
 }
 
 /**
···
         scx_task_iter_start(&sti);
         while ((p = scx_task_iter_next_locked(&sti))) {
                 const struct sched_class *old_class = p->sched_class;
+                const struct sched_class *new_class =
+                        __setscheduler_class(p->policy, p->prio);
                 struct sched_enq_and_set_ctx ctx;
+
+                if (old_class != new_class && p->se.sched_delayed)
+                        dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
 
                 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-                p->sched_class = __setscheduler_class(p, p->prio);
+                p->sched_class = new_class;
                 check_class_changing(task_rq(p), p, old_class);
 
                 sched_enq_and_set_task(&ctx);
···
         scx_task_iter_start(&sti);
         while ((p = scx_task_iter_next_locked(&sti))) {
                 const struct sched_class *old_class = p->sched_class;
+                const struct sched_class *new_class =
+                        __setscheduler_class(p->policy, p->prio);
                 struct sched_enq_and_set_ctx ctx;
+
+                if (old_class != new_class && p->se.sched_delayed)
+                        dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
 
                 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
                 p->scx.slice = SCX_SLICE_DFL;
-                p->sched_class = __setscheduler_class(p, p->prio);
+                p->sched_class = new_class;
                 check_class_changing(task_rq(p), p, old_class);
 
                 sched_enq_and_set_task(&ctx);

kernel/sched/ext.h (+1 -1)

···
 void scx_rq_activate(struct rq *rq);
 void scx_rq_deactivate(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
-bool task_should_scx(struct task_struct *p);
+bool task_should_scx(int policy);
 void init_sched_ext_class(void);
 
 static inline u32 scx_cpuperf_target(s32 cpu)

kernel/sched/fair.c (+17 -10)

···
                 vma = vma_next(&vmi);
         }
 
-        do {
+        for (; vma; vma = vma_next(&vmi)) {
                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
···
                  */
                 if (vma_pids_forced)
                         break;
-        } for_each_vma(vmi, vma);
+        }
 
         /*
          * If no VMAs are remaining and VMAs were skipped due to the PID
···
         struct sched_entity *se = pick_eevdf(cfs_rq);
         if (se->sched_delayed) {
                 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-                SCHED_WARN_ON(se->sched_delayed);
-                SCHED_WARN_ON(se->on_rq);
+                /*
+                 * Must not reference @se again, see __block_task().
+                 */
                 return NULL;
         }
         return se;
···
         /* Fix-up what dequeue_task_fair() skipped */
         hrtick_update(rq);
 
-        /* Fix-up what block_task() skipped. */
+        /*
+         * Fix-up what block_task() skipped.
+         *
+         * Must be last, @p might not be valid after this.
+         */
         __block_task(rq, p);
 }
···
         if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
                 util_est_dequeue(&rq->cfs, p);
 
-        if (dequeue_entities(rq, &p->se, flags) < 0) {
-                util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
-                return false;
-        }
-
         util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+        if (dequeue_entities(rq, &p->se, flags) < 0)
+                return false;
+
+        /*
+         * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+         */
+
         hrtick_update(rq);
         return true;
 }

kernel/sched/sched.h (+33 -3)

···
 
 static inline void __block_task(struct rq *rq, struct task_struct *p)
 {
-        WRITE_ONCE(p->on_rq, 0);
-        ASSERT_EXCLUSIVE_WRITER(p->on_rq);
         if (p->sched_contributes_to_load)
                 rq->nr_uninterruptible++;
 
···
                 atomic_inc(&rq->nr_iowait);
                 delayacct_blkio_start();
         }
+
+        ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+        /*
+         * The moment this write goes through, ttwu() can swoop in and migrate
+         * this task, rendering our rq->__lock ineffective.
+         *
+         * __schedule()                          try_to_wake_up()
+         *   LOCK rq->__lock                       LOCK p->pi_lock
+         *   pick_next_task()
+         *     pick_next_task_fair()
+         *       pick_next_entity()
+         *         dequeue_entities()
+         *           __block_task()
+         *             RELEASE p->on_rq = 0        if (p->on_rq && ...)
+         *                                           break;
+         *
+         *                                         ACQUIRE (after ctrl-dep)
+         *
+         *                                         cpu = select_task_rq();
+         *                                         set_task_cpu(p, cpu);
+         *                                         ttwu_queue()
+         *                                           ttwu_do_activate()
+         *                                             LOCK rq->__lock
+         *                                             activate_task()
+         *                                               STORE p->on_rq = 1
+         *                                             UNLOCK rq->__lock
+         *
+         * Callers must ensure to not reference @p after this -- we no longer
+         * own it.
+         */
+        smp_store_release(&p->on_rq, 0);
 }
 
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
···
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);

kernel/sched/syscalls.c (+1 -1)

···
         }
 
         prev_class = p->sched_class;
-        next_class = __setscheduler_class(p, newprio);
+        next_class = __setscheduler_class(policy, newprio);
 
         if (prev_class != next_class && p->se.sched_delayed)
                 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);