Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'sched_urgent_for_v6.13_rc3-p2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

- Prevent incorrect dequeueing of the deadline dlserver helper task and
fix its time accounting

- Properly track the CFS runqueue runnable stats

- Check the total number of queued tasks in a CPU's sched-fair runqueue
hierarchy, not just the root level, before deciding to stop the tick

- Fix the scheduling of the task that got woken last (NEXT_BUDDY) by
never selecting a delay-dequeued task as the next buddy (a toy model of
the delayed-dequeue bookkeeping these fixes share follows this list)
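
Several of these fixes hinge on the same EEVDF detail: a task that goes to
sleep while ineligible is not dequeued immediately but parked on the runqueue
until its lag is paid off ("delayed dequeue"), so the queue counters must tell
queued-but-delayed entities apart from genuinely runnable ones. Below is a
minimal userspace sketch of that bookkeeping; the struct and helper names are
hypothetical stand-ins, not kernel code:

/* Hypothetical model: queued vs. truly runnable task counts. */
#include <assert.h>
#include <stdio.h>

struct rq_model {
	int h_nr_running;	/* every queued task, delayed ones included */
	int h_nr_delayed;	/* tasks kept enqueued only to pay off lag */
};

/* A task that sleeps while ineligible stays queued, flagged as delayed. */
static void delay_dequeue(struct rq_model *rq)
{
	rq->h_nr_delayed++;
}

/* Once its lag is paid off, the task really leaves the runqueue. */
static void finish_delayed_dequeue(struct rq_model *rq)
{
	rq->h_nr_delayed--;
	rq->h_nr_running--;
}

static int nr_truly_runnable(const struct rq_model *rq)
{
	return rq->h_nr_running - rq->h_nr_delayed;
}

int main(void)
{
	struct rq_model rq = { .h_nr_running = 2, .h_nr_delayed = 0 };

	delay_dequeue(&rq);			/* one task sleeps, ineligible */
	assert(rq.h_nr_running == 2);		/* still queued... */
	assert(nr_truly_runnable(&rq) == 1);	/* ...but only one is runnable */

	finish_delayed_dequeue(&rq);		/* lag paid off, really gone */
	assert(rq.h_nr_running == 1 && nr_truly_runnable(&rq) == 1);

	printf("queued=%d runnable=%d\n", rq.h_nr_running,
	       nr_truly_runnable(&rq));
	return 0;
}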

* tag 'sched_urgent_for_v6.13_rc3-p2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/dlserver: Fix dlserver time accounting
sched/dlserver: Fix dlserver double enqueue
sched/eevdf: More PELT vs DELAYED_DEQUEUE
sched/fair: Fix sched_can_stop_tick() for fair tasks
sched/fair: Fix NEXT_BUDDY

+84 -22
+7
include/linux/sched.h
···
  * @dl_defer_armed tells if the deferrable server is waiting
  * for the replenishment timer to activate it.
  *
+ * @dl_server_active tells if the dlserver is active(started).
+ * dlserver is started on first cfs enqueue on an idle runqueue
+ * and is stopped when a dequeue results in 0 cfs tasks on the
+ * runqueue. In other words, dlserver is active only when cpu's
+ * runqueue has atleast one cfs task.
+ *
  * @dl_defer_running tells if the deferrable server is actually
  * running, skipping the defer phase.
  */
···
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
 	unsigned int			dl_server	  : 1;
+	unsigned int			dl_server_active  : 1;
 	unsigned int			dl_defer	  : 1;
 	unsigned int			dl_defer_armed	  : 1;
 	unsigned int			dl_defer_running  : 1;
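
Taking the new comment at its word, the intended lifecycle is easy to model in
standalone C. Everything below (dl_server_model and its helpers) is an
illustrative sketch, not kernel code; the assert in cfs_enqueue() marks the
double-enqueue case the dlserver fixes close off:

/* Hypothetical model of the dl_server_active lifecycle. */
#include <assert.h>
#include <stdbool.h>

struct dl_server_model {
	bool active;	/* mirrors the new dl_server_active bit */
	int  cfs_tasks;	/* cfs tasks currently on this runqueue */
};

static void cfs_enqueue(struct dl_server_model *s)
{
	if (s->cfs_tasks++ == 0) {
		/* first cfs task on an idle runqueue: start the server */
		assert(!s->active);	/* a second start would double-enqueue */
		s->active = true;
	}
}

static void cfs_dequeue(struct dl_server_model *s)
{
	if (--s->cfs_tasks == 0)
		s->active = false;	/* no cfs work left: stop the server */
}

int main(void)
{
	struct dl_server_model s = { 0 };

	cfs_enqueue(&s);	/* starts the server */
	cfs_enqueue(&s);	/* already active: must not start again */
	cfs_dequeue(&s);
	assert(s.active);	/* one cfs task still queued */
	cfs_dequeue(&s);
	assert(!s.active);	/* runqueue idle again */
	return 0;
}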
+1 -1
kernel/sched/core.c
···
 	if (scx_enabled() && !scx_can_stop_tick(rq))
 		return false;

-	if (rq->cfs.nr_running > 1)
+	if (rq->cfs.h_nr_running > 1)
 		return false;

 	/*
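
The distinction matters under group scheduling: rq->cfs.nr_running counts only
entities directly on the root cfs_rq, so a group entity holding several tasks
looks like a single task and the old check could stop the tick with work still
queued. A toy model of the corrected check, with a hypothetical struct standing
in for the kernel's cfs_rq:

/* Toy model: root-level vs. hierarchical task counts. */
#include <assert.h>

struct cfs_rq_model {
	int nr_running;		/* entities directly on this runqueue */
	int h_nr_running;	/* tasks in the whole hierarchy below it */
};

int main(void)
{
	/* two tasks inside one child group, nothing else on the root */
	struct cfs_rq_model root = {
		.nr_running   = 1,	/* just the group's sched_entity */
		.h_nr_running = 2,	/* the two tasks it contains */
	};

	/* old check: sees one entity, would wrongly allow stopping the tick */
	assert(!(root.nr_running > 1));

	/* fixed check: sees both tasks, keeps the tick running */
	assert(root.h_nr_running > 1);
	return 0;
}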
+6 -2
kernel/sched/deadline.c
···
 	if (!dl_se->dl_runtime)
 		return;

+	dl_se->dl_server_active = 1;
 	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
 	if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
 		resched_curr(dl_se->rq);
···
 	hrtimer_try_to_cancel(&dl_se->dl_timer);
 	dl_se->dl_defer_armed = 0;
 	dl_se->dl_throttled = 0;
+	dl_se->dl_server_active = 0;
 }

 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
···
 	if (dl_server(dl_se)) {
 		p = dl_se->server_pick_task(dl_se);
 		if (!p) {
-			dl_se->dl_yielded = 1;
-			update_curr_dl_se(rq, dl_se, 0);
+			if (dl_server_active(dl_se)) {
+				dl_se->dl_yielded = 1;
+				update_curr_dl_se(rq, dl_se, 0);
+			}
 			goto again;
 		}
 		rq->dl_server = dl_se;
+1
kernel/sched/debug.c
···
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
 			cfs_rq->idle_nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
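
Given that format string, the per-cfs_rq section of the sched debug dump
(/sys/kernel/debug/sched/debug) should gain a line of this shape; the value
shown here is illustrative:

  .h_nr_delayed                   : 0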
+57 -16
kernel/sched/fair.c
···
 	trace_sched_stat_runtime(p, delta_exec);
 	account_group_exec_runtime(p, delta_exec);
 	cgroup_account_cputime(p, delta_exec);
-	if (p->dl_server)
-		dl_server_update(p->dl_server, delta_exec);
 }

 static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
···
 	update_curr_task(p, delta_exec);

 	/*
-	 * Any fair task that runs outside of fair_server should
-	 * account against fair_server such that it can account for
-	 * this time and possibly avoid running this period.
+	 * If the fair_server is active, we need to account for the
+	 * fair_server time whether or not the task is running on
+	 * behalf of fair_server or not:
+	 * - If the task is running on behalf of fair_server, we need
+	 *   to limit its time based on the assigned runtime.
+	 * - Fair task that runs outside of fair_server should account
+	 *   against fair_server such that it can account for this time
+	 *   and possibly avoid running this period.
 	 */
-	if (p->dl_server != &rq->fair_server)
+	if (dl_server_active(&rq->fair_server))
 		dl_server_update(&rq->fair_server, delta_exec);
 }

···

 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+	se->sched_delayed = 1;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_delayed++;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+}
+
+static void clear_delayed(struct sched_entity *se)
 {
 	se->sched_delayed = 0;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_delayed--;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+	clear_delayed(se);
 	if (sched_feat(DELAY_ZERO) && se->vlag > 0)
 		se->vlag = 0;
 }
···
 	bool sleep = flags & DEQUEUE_SLEEP;

 	update_curr(cfs_rq);
+	clear_buddies(cfs_rq, se);

 	if (flags & DEQUEUE_DELAYED) {
 		SCHED_WARN_ON(!se->sched_delayed);
···

 	if (sched_feat(DELAY_DEQUEUE) && delay &&
 	    !entity_eligible(cfs_rq, se)) {
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
 		update_load_avg(cfs_rq, se, 0);
-		se->sched_delayed = 1;
+		set_delayed(se);
 		return false;
 	}
 }
···
 	se_update_runnable(se);

 	update_stats_dequeue_fair(cfs_rq, se, flags);
-
-	clear_buddies(cfs_rq, se);

 	update_entity_lag(cfs_rq, se);
 	if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
···
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
-	long task_delta, idle_task_delta, dequeue = 1;
+	long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
 	long rq_h_nr_running = rq->cfs.h_nr_running;

 	raw_spin_lock(&cfs_b->lock);
···

 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+	delayed_delta = cfs_rq->h_nr_delayed;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		int flags;
···

 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+		qcfs_rq->h_nr_delayed -= delayed_delta;

 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
···

 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+		qcfs_rq->h_nr_delayed -= delayed_delta;
 	}

 	/* At this point se is NULL and we are at root level*/
···
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
-	long task_delta, idle_task_delta;
+	long task_delta, idle_task_delta, delayed_delta;
 	long rq_h_nr_running = rq->cfs.h_nr_running;

 	se = cfs_rq->tg->se[cpu_of(rq)];
···

 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+	delayed_delta = cfs_rq->h_nr_delayed;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);

···

 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+		qcfs_rq->h_nr_delayed += delayed_delta;

 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
···

 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+		qcfs_rq->h_nr_delayed += delayed_delta;

 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
···
 	}

 	update_load_avg(cfs_rq, se, 0);
-	se->sched_delayed = 0;
+	clear_delayed(se);
 }

 /*
···
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int idle_h_nr_running = task_has_idle_policy(p);
+	int h_nr_delayed = 0;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	u64 slice = 0;
···
 	if (p->in_iowait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

+	if (task_new)
+		h_nr_delayed = !!se->sched_delayed;
+
 	for_each_sched_entity(se) {
 		if (se->on_rq) {
 			if (se->sched_delayed)
···

 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+		cfs_rq->h_nr_delayed += h_nr_delayed;

 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
···

 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+		cfs_rq->h_nr_delayed += h_nr_delayed;

 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
···
 	struct task_struct *p = NULL;
 	int idle_h_nr_running = 0;
 	int h_nr_running = 0;
+	int h_nr_delayed = 0;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;

···
 		p = task_of(se);
 		h_nr_running = 1;
 		idle_h_nr_running = task_has_idle_policy(p);
+		if (!task_sleep && !task_delayed)
+			h_nr_delayed = !!se->sched_delayed;
 	} else {
 		cfs_rq = group_cfs_rq(se);
 		slice = cfs_rq_min_slice(cfs_rq);
···

 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+		cfs_rq->h_nr_delayed -= h_nr_delayed;

 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = h_nr_running;
···

 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+		cfs_rq->h_nr_delayed -= h_nr_delayed;

 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = h_nr_running;
···
 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
 		return;

-	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
+	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
 		set_next_buddy(pse);
 	}

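
The new set_delayed()/clear_delayed() helpers adjust h_nr_delayed on every
cfs_rq up the group hierarchy but stop once they reach a throttled cfs_rq,
whose counters are frozen and patched up in bulk by the throttle/unthrottle
paths. A standalone sketch of that walk, using hypothetical types:

/* Hypothetical model of the upward h_nr_delayed propagation. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct cfs_rq_model {
	struct cfs_rq_model *parent;
	bool throttled;
	int h_nr_delayed;
};

static void set_delayed_model(struct cfs_rq_model *cfs_rq)
{
	for (; cfs_rq; cfs_rq = cfs_rq->parent) {
		cfs_rq->h_nr_delayed++;
		if (cfs_rq->throttled)
			break;	/* throttle/unthrottle fix up the levels above */
	}
}

int main(void)
{
	struct cfs_rq_model root  = { .parent = NULL };
	struct cfs_rq_model group = { .parent = &root, .throttled = true };
	struct cfs_rq_model leaf  = { .parent = &group };

	set_delayed_model(&leaf);
	assert(leaf.h_nr_delayed == 1);
	assert(group.h_nr_delayed == 1);	/* counted up to the throttled level */
	assert(root.h_nr_delayed == 0);		/* never walked past it */
	return 0;
}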
+1 -1
kernel/sched/pelt.c
···
 {
 	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->h_nr_running,
+				cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
 				cfs_rq->curr != NULL)) {

 		___update_load_avg(&cfs_rq->avg, 1);
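
The PELT effect is that delayed tasks no longer inflate the runnable signal:
the accumulator's runnable input drops from h_nr_running to h_nr_running -
h_nr_delayed. A toy floating-point accrual (the kernel's PELT uses fixed-point
math over 1024us periods with y^32 = 1/2) shows the gap after 16 periods:

/* Toy PELT-style accumulation, nothing like the kernel's fixed-point code. */
#include <stdio.h>

#define PELT_Y 0.97857206	/* per-period decay, chosen so y^32 = 0.5 */

static double accrue(double sum, int runnable, int periods)
{
	for (int i = 0; i < periods; i++)
		sum = sum * PELT_Y + runnable * 1024;
	return sum;
}

int main(void)
{
	int h_nr_running = 3, h_nr_delayed = 1;

	/* 16 periods of accrual, with and without the delayed task counted */
	printf("counting delayed:  %.0f\n", accrue(0, h_nr_running, 16));
	printf("excluding delayed: %.0f\n",
	       accrue(0, h_nr_running - h_nr_delayed, 16));
	return 0;
}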
+11 -2
kernel/sched/sched.h
···
 extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
 		    u64 runtime, u64 period, bool init);

+static inline bool dl_server_active(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_server_active;
+}
+
 #ifdef CONFIG_CGROUP_SCHED

 extern struct list_head task_groups;
···
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
+	unsigned int		h_nr_delayed;

 	s64			avg_vruntime;
 	u64			avg_load;
···

 static inline void se_update_runnable(struct sched_entity *se)
 {
-	if (!entity_is_task(se))
-		se->runnable_weight = se->my_q->h_nr_running;
+	if (!entity_is_task(se)) {
+		struct cfs_rq *cfs_rq = se->my_q;
+
+		se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+	}
 }

 static inline long se_runnable(struct sched_entity *se)