Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Fix delayed_dequeue vs switched_from_fair()

Commit 2e0199df252a ("sched/fair: Prepare exit/cleanup paths for delayed_dequeue")
and its follow up fixes try to deal with a rather unfortunate
situation where a task is enqueued in a new class, even though it
shouldn't have been. Mostly because the existing ->switched_to/from()
hooks are in the wrong place for this case.

This all led to Paul being able to trigger failures at something like
once per 10k CPU hours of RCU torture.

For now, do the ugly thing and move the code to the right place by
ignoring the switch hooks.

Note: Clean up the whole sched_class::switch*_{to,from}() thing.

Fixes: 2e0199df252a ("sched/fair: Prepare exit/cleanup paths for delayed_dequeue")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241003185037.GA5594@noisy.programming.kicks-ass.net

+32 -36
+20 -13
kernel/sched/core.c
··· 7010 7010 } 7011 7011 EXPORT_SYMBOL(default_wake_function); 7012 7012 7013 - void __setscheduler_prio(struct task_struct *p, int prio) 7013 + const struct sched_class *__setscheduler_class(struct task_struct *p, int prio) 7014 7014 { 7015 7015 if (dl_prio(prio)) 7016 - p->sched_class = &dl_sched_class; 7017 - else if (rt_prio(prio)) 7018 - p->sched_class = &rt_sched_class; 7019 - #ifdef CONFIG_SCHED_CLASS_EXT 7020 - else if (task_should_scx(p)) 7021 - p->sched_class = &ext_sched_class; 7022 - #endif 7023 - else 7024 - p->sched_class = &fair_sched_class; 7016 + return &dl_sched_class; 7025 7017 7026 - p->prio = prio; 7018 + if (rt_prio(prio)) 7019 + return &rt_sched_class; 7020 + 7021 + #ifdef CONFIG_SCHED_CLASS_EXT 7022 + if (task_should_scx(p)) 7023 + return &ext_sched_class; 7024 + #endif 7025 + 7026 + return &fair_sched_class; 7027 7027 } 7028 7028 7029 7029 #ifdef CONFIG_RT_MUTEXES ··· 7069 7069 { 7070 7070 int prio, oldprio, queued, running, queue_flag = 7071 7071 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7072 - const struct sched_class *prev_class; 7072 + const struct sched_class *prev_class, *next_class; 7073 7073 struct rq_flags rf; 7074 7074 struct rq *rq; 7075 7075 ··· 7127 7127 queue_flag &= ~DEQUEUE_MOVE; 7128 7128 7129 7129 prev_class = p->sched_class; 7130 + next_class = __setscheduler_class(p, prio); 7131 + 7132 + if (prev_class != next_class && p->se.sched_delayed) 7133 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 7134 + 7130 7135 queued = task_on_rq_queued(p); 7131 7136 running = task_current(rq, p); 7132 7137 if (queued) ··· 7169 7164 p->rt.timeout = 0; 7170 7165 } 7171 7166 7172 - __setscheduler_prio(p, prio); 7167 + p->sched_class = next_class; 7168 + p->prio = prio; 7169 + 7173 7170 check_class_changing(rq, p, prev_class); 7174 7171 7175 7172 if (queued)
+2 -2
kernel/sched/ext.c
··· 4471 4471 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4472 4472 4473 4473 p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); 4474 - __setscheduler_prio(p, p->prio); 4474 + p->sched_class = __setscheduler_class(p, p->prio); 4475 4475 check_class_changing(task_rq(p), p, old_class); 4476 4476 4477 4477 sched_enq_and_set_task(&ctx); ··· 5186 5186 5187 5187 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 5188 5188 5189 - __setscheduler_prio(p, p->prio); 5189 + p->sched_class = __setscheduler_class(p, p->prio); 5190 5190 check_class_changing(task_rq(p), p, old_class); 5191 5191 5192 5192 sched_enq_and_set_task(&ctx);
-16
kernel/sched/fair.c
··· 13177 13177 static void switched_from_fair(struct rq *rq, struct task_struct *p) 13178 13178 { 13179 13179 detach_task_cfs_rq(p); 13180 - /* 13181 - * Since this is called after changing class, this is a little weird 13182 - * and we cannot use DEQUEUE_DELAYED. 13183 - */ 13184 - if (p->se.sched_delayed) { 13185 - /* First, dequeue it from its new class' structures */ 13186 - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); 13187 - /* 13188 - * Now, clean up the fair_sched_class side of things 13189 - * related to sched_delayed being true and that wasn't done 13190 - * due to the generic dequeue not using DEQUEUE_DELAYED. 13191 - */ 13192 - finish_delayed_dequeue_entity(&p->se); 13193 - p->se.rel_deadline = 0; 13194 - __block_task(rq, p); 13195 - } 13196 13180 } 13197 13181 13198 13182 static void switched_to_fair(struct rq *rq, struct task_struct *p)
+1 -1
kernel/sched/sched.h
··· 3797 3797 3798 3798 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); 3799 3799 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); 3800 - extern void __setscheduler_prio(struct task_struct *p, int prio); 3800 + extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio); 3801 3801 extern void set_load_weight(struct task_struct *p, bool update_load); 3802 3802 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); 3803 3803 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+9 -4
kernel/sched/syscalls.c
··· 529 529 { 530 530 int oldpolicy = -1, policy = attr->sched_policy; 531 531 int retval, oldprio, newprio, queued, running; 532 - const struct sched_class *prev_class; 532 + const struct sched_class *prev_class, *next_class; 533 533 struct balance_callback *head; 534 534 struct rq_flags rf; 535 535 int reset_on_fork; ··· 706 706 queue_flags &= ~DEQUEUE_MOVE; 707 707 } 708 708 709 + prev_class = p->sched_class; 710 + next_class = __setscheduler_class(p, newprio); 711 + 712 + if (prev_class != next_class && p->se.sched_delayed) 713 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 714 + 709 715 queued = task_on_rq_queued(p); 710 716 running = task_current(rq, p); 711 717 if (queued) ··· 719 713 if (running) 720 714 put_prev_task(rq, p); 721 715 722 - prev_class = p->sched_class; 723 - 724 716 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { 725 717 __setscheduler_params(p, attr); 726 - __setscheduler_prio(p, newprio); 718 + p->sched_class = next_class; 719 + p->prio = newprio; 727 720 } 728 721 __setscheduler_uclamp(p, attr); 729 722 check_class_changing(rq, p, prev_class);