Merge tag 'sched-urgent-2024-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

- When stime is larger than rtime due to accounting imprecision, then
utime = rtime - stime becomes negative. As this is unsigned math, the
result becomes a huge positive number.

Cure it by resetting stime to rtime in that case, so utime becomes 0
(a minimal standalone sketch of the wraparound follows this list).

- Restore consistent state when sched_cpu_deactivate() fails.

When offlining a CPU fails in sched_cpu_deactivate() after the SMT
present counter has been decremented, the function aborts but fails to
increment the SMT present counter again and leaves it imbalanced.
Consecutive failed operations cause it to underflow. Add the missing fixup
for the error path.

As with the SMT accounting, the runqueue needs to be marked online again
in the error exit path to restore consistent state (a simplified sketch
of the error-path rollback also follows this list).
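
A minimal standalone C sketch of the first problem (illustrative values
and names only, not kernel code); with u64 arithmetic the subtraction
wraps instead of going negative, which is why the fix clamps stime
before computing utime:

  #include <stdio.h>
  #include <inttypes.h>

  typedef uint64_t u64;

  int main(void)
  {
          u64 rtime = 1000;   /* illustrative: total runtime */
          u64 stime = 1003;   /* illustrative: scaled stime, slightly too large */

          /* Unsigned subtraction wraps around to a huge value. */
          printf("broken utime = %" PRIu64 "\n", rtime - stime);

          /* The fix clamps stime first, so utime degrades to 0 instead. */
          if (stime > rtime)
                  stime = rtime;
          printf("fixed utime  = %" PRIu64 "\n", rtime - stime);

          return 0;
  }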
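
The second problem reduces to an error path that does not undo earlier
teardown. A simplified, self-contained sketch (hypothetical stand-ins,
not the actual hotplug code) of the rollback pattern the fixup restores:

  #include <stdio.h>

  static int smt_present = 1;     /* stand-in for the SMT present count */

  /* Stand-in for the step that can fail (cpuset_cpu_inactive() in the
   * real code); pretend it always fails here. */
  static int fallible_step(void)
  {
          return -1;
  }

  static int deactivate(void)
  {
          smt_present--;                  /* torn down early in the offline path */

          if (fallible_step()) {
                  smt_present++;          /* the missing fixup: undo the decrement */
                  return -1;
          }
          return 0;
  }

  int main(void)
  {
          deactivate();
          deactivate();
          /* Stays at 1 with the fixup; without it, consecutive failures
           * would drive it down to -1. */
          printf("smt_present = %d\n", smt_present);
          return 0;
  }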

* tag 'sched-urgent-2024-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Fix unbalance set_rq_online/offline() in sched_cpu_deactivate()
sched/core: Introduce sched_set_rq_on/offline() helper
sched/smt: Fix unbalance sched_smt_present dec/inc
sched/smt: Introduce sched_smt_present_inc/dec() helper
sched/cputime: Fix mul_u64_u64_div_u64() precision for cputime

 kernel/sched/core.c    | 68 +++++++++++++++++++++++++++++---------------
 kernel/sched/cputime.c |  6 ++++++
 2 files changed, 53 insertions(+), 21 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
···
         }
 }

+static inline void sched_set_rq_online(struct rq *rq, int cpu)
+{
+        struct rq_flags rf;
+
+        rq_lock_irqsave(rq, &rf);
+        if (rq->rd) {
+                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+                set_rq_online(rq);
+        }
+        rq_unlock_irqrestore(rq, &rf);
+}
+
+static inline void sched_set_rq_offline(struct rq *rq, int cpu)
+{
+        struct rq_flags rf;
+
+        rq_lock_irqsave(rq, &rf);
+        if (rq->rd) {
+                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+                set_rq_offline(rq);
+        }
+        rq_unlock_irqrestore(rq, &rf);
+}
+
 /*
  * used to mark begin/end of suspend/resume:
  */
···
         return 0;
 }

+static inline void sched_smt_present_inc(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+                static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
+}
+
+static inline void sched_smt_present_dec(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+                static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+}
+
 int sched_cpu_activate(unsigned int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
-        struct rq_flags rf;

         /*
          * Clear the balance_push callback and prepare to schedule
···
          */
         balance_push_set(cpu, false);

-#ifdef CONFIG_SCHED_SMT
         /*
          * When going up, increment the number of cores with SMT present.
          */
-        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
-                static_branch_inc_cpuslocked(&sched_smt_present);
-#endif
+        sched_smt_present_inc(cpu);
         set_cpu_active(cpu, true);

         if (sched_smp_initialized) {
···
          * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
          *    domains.
          */
-        rq_lock_irqsave(rq, &rf);
-        if (rq->rd) {
-                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-                set_rq_online(rq);
-        }
-        rq_unlock_irqrestore(rq, &rf);
+        sched_set_rq_online(rq, cpu);

         return 0;
 }
···
 int sched_cpu_deactivate(unsigned int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
-        struct rq_flags rf;
         int ret;

         /*
···
          */
         synchronize_rcu();

-        rq_lock_irqsave(rq, &rf);
-        if (rq->rd) {
-                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-                set_rq_offline(rq);
-        }
-        rq_unlock_irqrestore(rq, &rf);
+        sched_set_rq_offline(rq, cpu);

-#ifdef CONFIG_SCHED_SMT
         /*
          * When going down, decrement the number of cores with SMT present.
          */
-        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
-                static_branch_dec_cpuslocked(&sched_smt_present);
+        sched_smt_present_dec(cpu);

+#ifdef CONFIG_SCHED_SMT
         sched_core_cpu_deactivate(cpu);
 #endif

···
         sched_update_numa(cpu, false);
         ret = cpuset_cpu_inactive(cpu);
         if (ret) {
+                sched_smt_present_inc(cpu);
+                sched_set_rq_online(rq, cpu);
                 balance_push_set(cpu, false);
                 set_cpu_active(cpu, true);
                 sched_update_numa(cpu, true);

--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
···
         }

         stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+        /*
+         * Because mul_u64_u64_div_u64() can approximate on some
+         * achitectures; enforce the constraint that: a*b/(b+c) <= a.
+         */
+        if (unlikely(stime > rtime))
+                stime = rtime;

 update:
         /*