Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes/updates from Thomas Gleixner:

- Deduplicate the average computations in the scheduler core and the
fair class code.

- Fix a race between runtime distribution and assignment which can
cause exceeding the quota by up to 70%.

- Prevent negative results in the imbalance calculation

- Remove a stale warning in the workqueue code which can be triggered
since the call site was moved out of preempt disabled code. It's a
false positive.

- Deduplicate the print macros for procfs

- Add the uclamp values to the SCHED_DEBUG procfs output for completeness

* tag 'sched-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/debug: Add task uclamp values to SCHED_DEBUG procfs
sched/debug: Factor out printing formats into common macros
sched/debug: Remove redundant macro define
sched/core: Remove unused rq::last_load_update_tick
workqueue: Remove the warning in wq_worker_sleeping()
sched/fair: Fix negative imbalance in imbalance calculation
sched/fair: Fix race between runtime distribution and assignment
sched/fair: Align rq->avg_idle and rq->avg_scan_cost

+51 -62
+2 -8
kernel/sched/core.c
··· 2119 2119 return cpu; 2120 2120 } 2121 2121 2122 - static void update_avg(u64 *avg, u64 sample) 2123 - { 2124 - s64 diff = sample - *avg; 2125 - *avg += diff >> 3; 2126 - } 2127 - 2128 2122 void sched_set_stop_task(int cpu, struct task_struct *stop) 2129 2123 { 2130 2124 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; ··· 4120 4126 * it wants to wake up a task to maintain concurrency. 4121 4127 * As this function is called inside the schedule() context, 4122 4128 * we disable preemption to avoid it calling schedule() again 4123 - * in the possible wakeup of a kworker. 4129 + * in the possible wakeup of a kworker and because wq_worker_sleeping() 4130 + * requires it. 4124 4131 */ 4125 4132 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { 4126 4133 preempt_disable(); ··· 6694 6699 6695 6700 rq_attach_root(rq, &def_root_domain); 6696 6701 #ifdef CONFIG_NO_HZ_COMMON 6697 - rq->last_load_update_tick = jiffies; 6698 6702 rq->last_blocked_load_update_tick = jiffies; 6699 6703 atomic_set(&rq->nohz_flags, 0); 6700 6704 #endif
+18 -26
kernel/sched/debug.c
··· 816 816 817 817 __initcall(init_sched_debug_procfs); 818 818 819 - #define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 820 - #define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 821 - #define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 822 - #define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 819 + #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) 820 + #define __P(F) __PS(#F, F) 821 + #define P(F) __PS(#F, p->F) 822 + #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) 823 + #define __PN(F) __PSN(#F, F) 824 + #define PN(F) __PSN(#F, p->F) 823 825 824 826 825 827 #ifdef CONFIG_NUMA_BALANCING ··· 870 868 SEQ_printf(m, 871 869 "---------------------------------------------------------" 872 870 "----------\n"); 873 - #define __P(F) \ 874 - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 875 - #define P(F) \ 876 - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 877 - #define P_SCHEDSTAT(F) \ 878 - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) 879 - #define __PN(F) \ 880 - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 881 - #define PN(F) \ 882 - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 883 - #define PN_SCHEDSTAT(F) \ 884 - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) 871 + 872 + #define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F)) 873 + #define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) 885 874 886 875 PN(se.exec_start); 887 876 PN(se.vruntime); ··· 932 939 } 933 940 934 941 __P(nr_switches); 935 - SEQ_printf(m, "%-45s:%21Ld\n", 936 - "nr_voluntary_switches", (long long)p->nvcsw); 937 - SEQ_printf(m, "%-45s:%21Ld\n", 938 - "nr_involuntary_switches", (long long)p->nivcsw); 942 + __PS("nr_voluntary_switches", p->nvcsw); 943 + __PS("nr_involuntary_switches", p->nivcsw); 939 944 940 945 P(se.load.weight); 941 946 #ifdef 
CONFIG_SMP ··· 947 956 P(se.avg.util_est.ewma); 948 957 P(se.avg.util_est.enqueued); 949 958 #endif 959 + #ifdef CONFIG_UCLAMP_TASK 960 + __PS("uclamp.min", p->uclamp[UCLAMP_MIN].value); 961 + __PS("uclamp.max", p->uclamp[UCLAMP_MAX].value); 962 + __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); 963 + __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); 964 + #endif 950 965 P(policy); 951 966 P(prio); 952 967 if (task_has_dl_policy(p)) { ··· 960 963 P(dl.deadline); 961 964 } 962 965 #undef PN_SCHEDSTAT 963 - #undef PN 964 - #undef __PN 965 966 #undef P_SCHEDSTAT 966 - #undef P 967 - #undef __P 968 967 969 968 { 970 969 unsigned int this_cpu = raw_smp_processor_id(); ··· 968 975 969 976 t0 = cpu_clock(this_cpu); 970 977 t1 = cpu_clock(this_cpu); 971 - SEQ_printf(m, "%-45s:%21Ld\n", 972 - "clock-delta", (long long)(t1-t0)); 978 + __PS("clock-delta", t1-t0); 973 979 } 974 980 975 981 sched_show_numa(p, m);
+21 -25
kernel/sched/fair.c
··· 4836 4836 resched_curr(rq); 4837 4837 } 4838 4838 4839 - static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) 4839 + static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) 4840 4840 { 4841 4841 struct cfs_rq *cfs_rq; 4842 - u64 runtime; 4843 - u64 starting_runtime = remaining; 4842 + u64 runtime, remaining = 1; 4844 4843 4845 4844 rcu_read_lock(); 4846 4845 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, ··· 4854 4855 /* By the above check, this should never be true */ 4855 4856 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); 4856 4857 4858 + raw_spin_lock(&cfs_b->lock); 4857 4859 runtime = -cfs_rq->runtime_remaining + 1; 4858 - if (runtime > remaining) 4859 - runtime = remaining; 4860 - remaining -= runtime; 4860 + if (runtime > cfs_b->runtime) 4861 + runtime = cfs_b->runtime; 4862 + cfs_b->runtime -= runtime; 4863 + remaining = cfs_b->runtime; 4864 + raw_spin_unlock(&cfs_b->lock); 4861 4865 4862 4866 cfs_rq->runtime_remaining += runtime; 4863 4867 ··· 4875 4873 break; 4876 4874 } 4877 4875 rcu_read_unlock(); 4878 - 4879 - return starting_runtime - remaining; 4880 4876 } 4881 4877 4882 4878 /* ··· 4885 4885 */ 4886 4886 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) 4887 4887 { 4888 - u64 runtime; 4889 4888 int throttled; 4890 4889 4891 4890 /* no need to continue the timer with no bandwidth constraint */ ··· 4913 4914 cfs_b->nr_throttled += overrun; 4914 4915 4915 4916 /* 4916 - * This check is repeated as we are holding onto the new bandwidth while 4917 - * we unthrottle. This can potentially race with an unthrottled group 4918 - * trying to acquire new bandwidth from the global pool. This can result 4919 - * in us over-using our runtime if it is all used during this loop, but 4920 - * only by limited amounts in that extreme case. 4917 + * This check is repeated as we release cfs_b->lock while we unthrottle. 
4921 4918 */ 4922 4919 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { 4923 - runtime = cfs_b->runtime; 4924 4920 cfs_b->distribute_running = 1; 4925 4921 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4926 4922 /* we can't nest cfs_b->lock while distributing bandwidth */ 4927 - runtime = distribute_cfs_runtime(cfs_b, runtime); 4923 + distribute_cfs_runtime(cfs_b); 4928 4924 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4929 4925 4930 4926 cfs_b->distribute_running = 0; 4931 4927 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 4932 - 4933 - lsub_positive(&cfs_b->runtime, runtime); 4934 4928 } 4935 4929 4936 4930 /* ··· 5057 5065 if (!runtime) 5058 5066 return; 5059 5067 5060 - runtime = distribute_cfs_runtime(cfs_b, runtime); 5068 + distribute_cfs_runtime(cfs_b); 5061 5069 5062 5070 raw_spin_lock_irqsave(&cfs_b->lock, flags); 5063 - lsub_positive(&cfs_b->runtime, runtime); 5064 5071 cfs_b->distribute_running = 0; 5065 5072 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 5066 5073 } ··· 6071 6080 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); 6072 6081 struct sched_domain *this_sd; 6073 6082 u64 avg_cost, avg_idle; 6074 - u64 time, cost; 6075 - s64 delta; 6083 + u64 time; 6076 6084 int this = smp_processor_id(); 6077 6085 int cpu, nr = INT_MAX; 6078 6086 ··· 6109 6119 } 6110 6120 6111 6121 time = cpu_clock(this) - time; 6112 - cost = this_sd->avg_scan_cost; 6113 - delta = (s64)(time - cost) / 8; 6114 - this_sd->avg_scan_cost += delta; 6122 + update_avg(&this_sd->avg_scan_cost, time); 6115 6123 6116 6124 return cpu; 6117 6125 } ··· 9036 9048 9037 9049 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / 9038 9050 sds->total_capacity; 9051 + /* 9052 + * If the local group is more loaded than the selected 9053 + * busiest group don't try to pull any tasks. 9054 + */ 9055 + if (local->avg_load >= busiest->avg_load) { 9056 + env->imbalance = 0; 9057 + return; 9058 + } 9039 9059 } 9040 9060 9041 9061 /*
+6 -1
kernel/sched/sched.h
··· 195 195 196 196 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) 197 197 198 + static inline void update_avg(u64 *avg, u64 sample) 199 + { 200 + s64 diff = sample - *avg; 201 + *avg += diff / 8; 202 + } 203 + 198 204 /* 199 205 * !! For sched_setattr_nocheck() (kernel) only !! 200 206 * ··· 888 882 #endif 889 883 #ifdef CONFIG_NO_HZ_COMMON 890 884 #ifdef CONFIG_SMP 891 - unsigned long last_load_update_tick; 892 885 unsigned long last_blocked_load_update_tick; 893 886 unsigned int has_blocked_load; 894 887 #endif /* CONFIG_SMP */
+4 -2
kernel/workqueue.c
··· 858 858 * @task: task going to sleep 859 859 * 860 860 * This function is called from schedule() when a busy worker is 861 - * going to sleep. 861 + * going to sleep. Preemption needs to be disabled to protect ->sleeping 862 + * assignment. 862 863 */ 863 864 void wq_worker_sleeping(struct task_struct *task) 864 865 { ··· 876 875 877 876 pool = worker->pool; 878 877 879 - if (WARN_ON_ONCE(worker->sleeping)) 878 + /* Return if preempted before wq_worker_running() was reached */ 879 + if (worker->sleeping) 880 880 return; 881 881 882 882 worker->sleeping = 1;