sched/eevdf: More PELT vs DELAYED_DEQUEUE

Vincent and Dietmar noted that while commit fc1892becd56 ("sched/eevdf:
Fixup PELT vs DELAYED_DEQUEUE") fixes the entity runnable stats, it does
not adjust the cfs_rq runnable stats, which are based on h_nr_running.

Track h_nr_delayed so that the delayed-dequeue entities can be
discounted from h_nr_running and the runnable signal adjusted
accordingly.
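
As an illustration, the runnable count that then feeds the cfs_rq PELT
update and se_update_runnable() becomes h_nr_running minus h_nr_delayed.
A minimal sketch of that adjustment (the helper below is illustrative
only, not part of this patch):

  /*
   * Sketch: delayed-dequeue entities are still enqueued (and therefore
   * counted in h_nr_running) but are no longer runnable, so the
   * runnable signal has to discount them.
   */
  static inline unsigned int cfs_rq_h_nr_runnable(struct cfs_rq *cfs_rq)
  {
          return cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
  }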

Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Closes: https://lore.kernel.org/lkml/a9a45193-d0c6-4ba2-a822-464ad30b550e@arm.com/
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Closes: https://lore.kernel.org/lkml/CAKfTPtCNUvWE_GX5LyvTF-WdxUT=ZgvZZv-4t=eWntg5uOFqiQ@mail.gmail.com/
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
[ Fixes checkpatch warnings and rebased ]
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20241202174606.4074512-3-vincent.guittot@linaro.org

4 files changed, 54 insertions(+), 8 deletions(-)

kernel/sched/debug.c | +1 -0
···
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
 			cfs_rq->idle_nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",

kernel/sched/fair.c | +46 -5
···
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+	se->sched_delayed = 1;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_delayed++;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+}
+
+static void clear_delayed(struct sched_entity *se)
 {
 	se->sched_delayed = 0;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_delayed--;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+	clear_delayed(se);
 	if (sched_feat(DELAY_ZERO) && se->vlag > 0)
 		se->vlag = 0;
 }
···
 		if (sched_feat(DELAY_DEQUEUE) && delay &&
 		    !entity_eligible(cfs_rq, se)) {
 			update_load_avg(cfs_rq, se, 0);
-			se->sched_delayed = 1;
+			set_delayed(se);
 			return false;
 		}
 	}
···
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
-	long task_delta, idle_task_delta, dequeue = 1;
+	long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
 	long rq_h_nr_running = rq->cfs.h_nr_running;
 
 	raw_spin_lock(&cfs_b->lock);
···
 
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+	delayed_delta = cfs_rq->h_nr_delayed;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		int flags;
···
 
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+		qcfs_rq->h_nr_delayed -= delayed_delta;
 
 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
···
 
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+		qcfs_rq->h_nr_delayed -= delayed_delta;
 	}
 
 	/* At this point se is NULL and we are at root level*/
···
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
-	long task_delta, idle_task_delta;
+	long task_delta, idle_task_delta, delayed_delta;
 	long rq_h_nr_running = rq->cfs.h_nr_running;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
···
 
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+	delayed_delta = cfs_rq->h_nr_delayed;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
···
 
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+		qcfs_rq->h_nr_delayed += delayed_delta;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
···
 
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+		qcfs_rq->h_nr_delayed += delayed_delta;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
···
 	}
 
 	update_load_avg(cfs_rq, se, 0);
-	se->sched_delayed = 0;
+	clear_delayed(se);
 }
 
 /*
···
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int idle_h_nr_running = task_has_idle_policy(p);
+	int h_nr_delayed = 0;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	u64 slice = 0;
···
 	if (p->in_iowait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 
+	if (task_new)
+		h_nr_delayed = !!se->sched_delayed;
+
 	for_each_sched_entity(se) {
 		if (se->on_rq) {
 			if (se->sched_delayed)
···
 
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+		cfs_rq->h_nr_delayed += h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
···
 
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+		cfs_rq->h_nr_delayed += h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
···
 	struct task_struct *p = NULL;
 	int idle_h_nr_running = 0;
 	int h_nr_running = 0;
+	int h_nr_delayed = 0;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
···
 		p = task_of(se);
 		h_nr_running = 1;
 		idle_h_nr_running = task_has_idle_policy(p);
+		if (!task_sleep && !task_delayed)
+			h_nr_delayed = !!se->sched_delayed;
 	} else {
 		cfs_rq = group_cfs_rq(se);
 		slice = cfs_rq_min_slice(cfs_rq);
···
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+		cfs_rq->h_nr_delayed -= h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = h_nr_running;
···
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+		cfs_rq->h_nr_delayed -= h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = h_nr_running;

kernel/sched/pelt.c | +1 -1
···
 {
 	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->h_nr_running,
+				cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
 				cfs_rq->curr != NULL)) {
 
 		___update_load_avg(&cfs_rq->avg, 1);

kernel/sched/sched.h | +6 -2
···
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
+	unsigned int		h_nr_delayed;
 
 	s64			avg_vruntime;
 	u64			avg_load;
···
 
 static inline void se_update_runnable(struct sched_entity *se)
 {
-	if (!entity_is_task(se))
-		se->runnable_weight = se->my_q->h_nr_running;
+	if (!entity_is_task(se)) {
+		struct cfs_rq *cfs_rq = se->my_q;
+
+		se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+	}
 }
 
 static inline long se_runnable(struct sched_entity *se)