sched/eevdf: More PELT vs DELAYED_DEQUEUE

Vincent and Dietmar noted that while
commit fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE") fixes
the entity runnable stats, it does not adjust the cfs_rq runnable stats,
which are based on h_nr_running.

Track h_nr_delayed so that delayed-dequeue entities can be discounted
and the runnable signal adjusted accordingly.
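
In effect, the runnable count that feeds PELT becomes h_nr_running minus
h_nr_delayed. As a minimal sketch of that discount (illustrative only: the
helper name below is made up; the patch open-codes the subtraction in
__update_load_avg_cfs_rq() and se_update_runnable()):

static inline unsigned int cfs_rq_runnable_count(struct cfs_rq *cfs_rq)
{
	/* Delayed-dequeue entities are still queued but no longer runnable. */
	return cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
}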

Fixes: fc1892becd56 ("sched/eevdf: Fixup PELT vs DELAYED_DEQUEUE")
Closes: https://lore.kernel.org/lkml/a9a45193-d0c6-4ba2-a822-464ad30b550e@arm.com/
Closes: https://lore.kernel.org/lkml/CAKfTPtCNUvWE_GX5LyvTF-WdxUT=ZgvZZv-4t=eWntg5uOFqiQ@mail.gmail.com/
[ Fixed checkpatch warnings and rebased ]
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20241202174606.4074512-3-vincent.guittot@linaro.org

+54 -8
+1
kernel/sched/debug.c
···
 	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
 	SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
 	SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
 			cfs_rq->idle_nr_running);
 	SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
+46 -5
kernel/sched/fair.c
···
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+	se->sched_delayed = 1;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_delayed++;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+}
+
+static void clear_delayed(struct sched_entity *se)
 {
 	se->sched_delayed = 0;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_delayed--;
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+	clear_delayed(se);
 	if (sched_feat(DELAY_ZERO) && se->vlag > 0)
 		se->vlag = 0;
 }
···
 	if (sched_feat(DELAY_DEQUEUE) && delay &&
 	    !entity_eligible(cfs_rq, se)) {
 		update_load_avg(cfs_rq, se, 0);
-		se->sched_delayed = 1;
+		set_delayed(se);
 		return false;
 	}
 }
···
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
-	long task_delta, idle_task_delta, dequeue = 1;
+	long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
 	long rq_h_nr_running = rq->cfs.h_nr_running;
 
 	raw_spin_lock(&cfs_b->lock);
···
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+	delayed_delta = cfs_rq->h_nr_delayed;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		int flags;
···
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+		qcfs_rq->h_nr_delayed -= delayed_delta;
 
 		if (qcfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
···
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+		qcfs_rq->h_nr_delayed -= delayed_delta;
 	}
 
 	/* At this point se is NULL and we are at root level*/
···
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
-	long task_delta, idle_task_delta;
+	long task_delta, idle_task_delta, delayed_delta;
 	long rq_h_nr_running = rq->cfs.h_nr_running;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
···
 	task_delta = cfs_rq->h_nr_running;
 	idle_task_delta = cfs_rq->idle_h_nr_running;
+	delayed_delta = cfs_rq->h_nr_delayed;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
···
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+		qcfs_rq->h_nr_delayed += delayed_delta;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
···
 		qcfs_rq->h_nr_running += task_delta;
 		qcfs_rq->idle_h_nr_running += idle_task_delta;
+		qcfs_rq->h_nr_delayed += delayed_delta;
 
 		/* end evaluation on encountering a throttled cfs_rq */
 		if (cfs_rq_throttled(qcfs_rq))
···
 	}
 
 	update_load_avg(cfs_rq, se, 0);
-	se->sched_delayed = 0;
+	clear_delayed(se);
 }
 
 /*
···
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int idle_h_nr_running = task_has_idle_policy(p);
+	int h_nr_delayed = 0;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	u64 slice = 0;
···
 	if (p->in_iowait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 
+	if (task_new)
+		h_nr_delayed = !!se->sched_delayed;
+
 	for_each_sched_entity(se) {
 		if (se->on_rq) {
 			if (se->sched_delayed)
···
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+		cfs_rq->h_nr_delayed += h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
···
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
+		cfs_rq->h_nr_delayed += h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = 1;
···
 	struct task_struct *p = NULL;
 	int idle_h_nr_running = 0;
 	int h_nr_running = 0;
+	int h_nr_delayed = 0;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
···
 		p = task_of(se);
 		h_nr_running = 1;
 		idle_h_nr_running = task_has_idle_policy(p);
+		if (!task_sleep && !task_delayed)
+			h_nr_delayed = !!se->sched_delayed;
 	} else {
 		cfs_rq = group_cfs_rq(se);
 		slice = cfs_rq_min_slice(cfs_rq);
···
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+		cfs_rq->h_nr_delayed -= h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = h_nr_running;
···
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+		cfs_rq->h_nr_delayed -= h_nr_delayed;
 
 		if (cfs_rq_is_idle(cfs_rq))
 			idle_h_nr_running = h_nr_running;
+1 -1
kernel/sched/pelt.c
···
 {
 	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->h_nr_running,
+				cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
 				cfs_rq->curr != NULL)) {
 
 		___update_load_avg(&cfs_rq->avg, 1);
+6 -2
kernel/sched/sched.h
···
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
+	unsigned int		h_nr_delayed;
 
 	s64			avg_vruntime;
 	u64			avg_load;
···
 
 static inline void se_update_runnable(struct sched_entity *se)
 {
-	if (!entity_is_task(se))
-		se->runnable_weight = se->my_q->h_nr_running;
+	if (!entity_is_task(se)) {
+		struct cfs_rq *cfs_rq = se->my_q;
+
+		se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
+	}
 }
 
 static inline long se_runnable(struct sched_entity *se)