Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Scheduler SMP load-balancer improvements:

- Avoid unnecessary migrations within SMT domains on hybrid systems.

Problem:

On hybrid CPU systems (processors with a mixture of
higher-frequency SMT cores and lower-frequency non-SMT cores),
under the old code lower-priority CPUs pulled tasks from the
higher-priority cores if more than one SMT sibling was busy -
resulting in many unnecessary task migrations.

Solution:

The new code improves the load balancer to recognize SMT cores
with more than one busy sibling and allows lower-priority CPUs
to pull tasks, which avoids superfluous migrations and lets
lower-priority cores inspect all SMT siblings for the busiest
queue.

- Implement the 'runnable boosting' feature in the EAS balancer:
consider CPU contention in frequency, EAS max util & load-balance
busiest CPU selection.

This improves CPU utilization for certain workloads, while leaving
other key workloads unchanged.

Scheduler infrastructure improvements:

- Rewrite the scheduler topology setup code by consolidating it into
the build_sched_topology() helper function and building it
dynamically on the fly.

- Resolve the local_clock() vs. noinstr complications by rewriting
the code: provide separate sched_clock_noinstr() and
local_clock_noinstr() functions to be used in instrumentation code,
and make sure it is all instrumentation-safe.

Fixes:

- Fix a kthread_park() race with wait_woken()

- Fix misc wait_task_inactive() bugs unearthed by the -rt merge:
- Fix UP PREEMPT bug by unifying the SMP and UP implementations
- Fix task_struct::saved_state handling

- Fix various rq clock update bugs, unearthed by turning on the rq
clock debugging code.

- Fix the PSI WINDOW_MIN_US trigger limit, which was easy to trigger
by creating enough cgroups, by removing the warning and restricting
window size triggers to PSI file write-permission or
CAP_SYS_RESOURCE.

- Propagate SMT flags in the topology when removing a degenerate domain

- Fix grub_reclaim() calculation bug in the deadline scheduler code

- Avoid resetting the min update period when it is unnecessary, in
psi_trigger_destroy().

- Don't balance a task to its current running CPU in load_balance(),
which was possible on certain NUMA topologies with overlapping
groups.

- Fix the sched-debug printing of rq->nr_uninterruptible

Cleanups:

- Address various -Wmissing-prototype warnings, as a preparation to
(maybe) enable this warning in the future.

- Remove unused code

- Mark more functions __init

- Fix shadow-variable warnings"

* tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()
sched/core: Avoid double calling update_rq_clock() in __balance_push_cpu_stop()
sched/core: Fixed missing rq clock update before calling set_rq_offline()
sched/deadline: Update GRUB description in the documentation
sched/deadline: Fix bandwidth reclaim equation in GRUB
sched/wait: Fix a kthread_park race with wait_woken()
sched/topology: Mark set_sched_topology() __init
sched/fair: Rename variable cpu_util eff_util
arm64/arch_timer: Fix MMIO byteswap
sched/fair, cpufreq: Introduce 'runnable boosting'
sched/fair: Refactor CPU utilization functions
cpuidle: Use local_clock_noinstr()
sched/clock: Provide local_clock_noinstr()
x86/tsc: Provide sched_clock_noinstr()
clocksource: hyper-v: Provide noinstr sched_clock()
clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special casing U64_MAX
x86/vdso: Fix gettimeofday masking
math64: Always inline u128 version of mul_u64_u64_shr()
s390/time: Provide sched_clock_noinstr()
loongarch: Provide noinstr sched_clock_read()
...

+776 -567
+4 -1
Documentation/scheduler/sched-deadline.rst
··· 203 203 - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the 204 204 runqueue, including the tasks in Inactive state. 205 205 206 + - Maximum usable bandwidth (max_bw): This is the maximum bandwidth usable by 207 + deadline tasks and is currently set to the RT capacity. 208 + 206 209 207 210 The algorithm reclaims the bandwidth of the tasks in Inactive state. 208 211 It does so by decrementing the runtime of the executing task Ti at a pace equal 209 212 to 210 213 211 - dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt 214 + dq = -(max{ Ui, (Umax - Uinact - Uextra) } / Umax) dt 212 215 213 216 where: 214 217
+1 -7
arch/arm64/include/asm/arch_timer.h
··· 88 88 89 89 #define arch_timer_reg_read_stable(reg) \ 90 90 ({ \ 91 - u64 _val; \ 92 - \ 93 - preempt_disable_notrace(); \ 94 - _val = erratum_handler(read_ ## reg)(); \ 95 - preempt_enable_notrace(); \ 96 - \ 97 - _val; \ 91 + erratum_handler(read_ ## reg)(); \ 98 92 }) 99 93 100 94 /*
+6 -6
arch/arm64/include/asm/io.h
··· 22 22 * Generic IO read/write. These perform native-endian accesses. 23 23 */ 24 24 #define __raw_writeb __raw_writeb 25 - static inline void __raw_writeb(u8 val, volatile void __iomem *addr) 25 + static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr) 26 26 { 27 27 asm volatile("strb %w0, [%1]" : : "rZ" (val), "r" (addr)); 28 28 } 29 29 30 30 #define __raw_writew __raw_writew 31 - static inline void __raw_writew(u16 val, volatile void __iomem *addr) 31 + static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr) 32 32 { 33 33 asm volatile("strh %w0, [%1]" : : "rZ" (val), "r" (addr)); 34 34 } ··· 40 40 } 41 41 42 42 #define __raw_writeq __raw_writeq 43 - static inline void __raw_writeq(u64 val, volatile void __iomem *addr) 43 + static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) 44 44 { 45 45 asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); 46 46 } 47 47 48 48 #define __raw_readb __raw_readb 49 - static inline u8 __raw_readb(const volatile void __iomem *addr) 49 + static __always_inline u8 __raw_readb(const volatile void __iomem *addr) 50 50 { 51 51 u8 val; 52 52 asm volatile(ALTERNATIVE("ldrb %w0, [%1]", ··· 57 57 } 58 58 59 59 #define __raw_readw __raw_readw 60 - static inline u16 __raw_readw(const volatile void __iomem *addr) 60 + static __always_inline u16 __raw_readw(const volatile void __iomem *addr) 61 61 { 62 62 u16 val; 63 63 ··· 80 80 } 81 81 82 82 #define __raw_readq __raw_readq 83 - static inline u64 __raw_readq(const volatile void __iomem *addr) 83 + static __always_inline u64 __raw_readq(const volatile void __iomem *addr) 84 84 { 85 85 u64 val; 86 86 asm volatile(ALTERNATIVE("ldr %0, [%1]",
+1 -1
arch/loongarch/include/asm/loongarch.h
··· 1167 1167 1168 1168 #ifndef __ASSEMBLY__ 1169 1169 1170 - static inline u64 drdtime(void) 1170 + static __always_inline u64 drdtime(void) 1171 1171 { 1172 1172 int rID = 0; 1173 1173 u64 val = 0;
+3 -3
arch/loongarch/kernel/time.c
··· 190 190 return drdtime(); 191 191 } 192 192 193 - static u64 native_sched_clock(void) 193 + static noinstr u64 sched_clock_read(void) 194 194 { 195 - return read_const_counter(NULL); 195 + return drdtime(); 196 196 } 197 197 198 198 static struct clocksource clocksource_const = { ··· 211 211 212 212 res = clocksource_register_hz(&clocksource_const, freq); 213 213 214 - sched_clock_register(native_sched_clock, 64, freq); 214 + sched_clock_register(sched_clock_read, 64, freq); 215 215 216 216 pr_info("Constant clock source device register\n"); 217 217
+9 -4
arch/s390/include/asm/timex.h
··· 63 63 return cc; 64 64 } 65 65 66 - static inline void store_tod_clock_ext(union tod_clock *tod) 66 + static __always_inline void store_tod_clock_ext(union tod_clock *tod) 67 67 { 68 68 asm volatile("stcke %0" : "=Q" (*tod) : : "cc"); 69 69 } ··· 177 177 178 178 typedef unsigned long cycles_t; 179 179 180 - static inline unsigned long get_tod_clock(void) 180 + static __always_inline unsigned long get_tod_clock(void) 181 181 { 182 182 union tod_clock clk; 183 183 ··· 204 204 205 205 extern union tod_clock tod_clock_base; 206 206 207 + static __always_inline unsigned long __get_tod_clock_monotonic(void) 208 + { 209 + return get_tod_clock() - tod_clock_base.tod; 210 + } 211 + 207 212 /** 208 213 * get_clock_monotonic - returns current time in clock rate units 209 214 * ··· 221 216 unsigned long tod; 222 217 223 218 preempt_disable_notrace(); 224 - tod = get_tod_clock() - tod_clock_base.tod; 219 + tod = __get_tod_clock_monotonic(); 225 220 preempt_enable_notrace(); 226 221 return tod; 227 222 } ··· 245 240 * -> ns = (th * 125) + ((tl * 125) >> 9); 246 241 * 247 242 */ 248 - static inline unsigned long tod_to_ns(unsigned long todval) 243 + static __always_inline unsigned long tod_to_ns(unsigned long todval) 249 244 { 250 245 return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); 251 246 }
+5
arch/s390/kernel/time.c
··· 102 102 ((long) qui.old_leap * 4096000000L); 103 103 } 104 104 105 + unsigned long long noinstr sched_clock_noinstr(void) 106 + { 107 + return tod_to_ns(__get_tod_clock_monotonic()); 108 + } 109 + 105 110 /* 106 111 * Scheduler clock - returns current time in nanosec units. 107 112 */
+5
arch/x86/include/asm/mshyperv.h
··· 257 257 u64 hv_get_non_nested_register(unsigned int reg); 258 258 void hv_set_non_nested_register(unsigned int reg, u64 value); 259 259 260 + static __always_inline u64 hv_raw_get_register(unsigned int reg) 261 + { 262 + return __rdmsr(reg); 263 + } 264 + 260 265 #else /* CONFIG_HYPERV */ 261 266 static inline void hyperv_init(void) {} 262 267 static inline void hyperv_setup_mmu_ops(void) {}
+30 -11
arch/x86/include/asm/vdso/gettimeofday.h
··· 231 231 ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); 232 232 } while (pvclock_read_retry(pvti, version)); 233 233 234 - return ret; 234 + return ret & S64_MAX; 235 235 } 236 236 #endif 237 237 238 238 #ifdef CONFIG_HYPERV_TIMER 239 239 static u64 vread_hvclock(void) 240 240 { 241 - return hv_read_tsc_page(&hvclock_page); 241 + u64 tsc, time; 242 + 243 + if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time)) 244 + return time & S64_MAX; 245 + 246 + return U64_MAX; 242 247 } 243 248 #endif 244 249 ··· 251 246 const struct vdso_data *vd) 252 247 { 253 248 if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) 254 - return (u64)rdtsc_ordered(); 249 + return (u64)rdtsc_ordered() & S64_MAX; 255 250 /* 256 251 * For any memory-mapped vclock type, we need to make sure that gcc 257 252 * doesn't cleverly hoist a load before the mode check. Otherwise we ··· 289 284 * which can be invalidated asynchronously and indicate invalidation by 290 285 * returning U64_MAX, which can be effectively tested by checking for a 291 286 * negative value after casting it to s64. 287 + * 288 + * This effectively forces a S64_MAX mask on the calculations, unlike the 289 + * U64_MAX mask normally used by x86 clocksources. 292 290 */ 293 291 static inline bool arch_vdso_cycles_ok(u64 cycles) 294 292 { ··· 311 303 * @last. If not then use @last, which is the base time of the current 312 304 * conversion period. 313 305 * 314 - * This variant also removes the masking of the subtraction because the 315 - * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX 316 - * which would result in a pointless operation. The compiler cannot 317 - * optimize it away as the mask comes from the vdso data and is not compile 318 - * time constant. 
306 + * This variant also uses a custom mask because while the clocksource mask of 307 + * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses 308 + * U64_MASK as an exception value, additionally arch_vdso_cycles_ok() above 309 + * declares everything with the MSB/Sign-bit set as invalid. Therefore the 310 + * effective mask is S64_MAX. 319 311 */ 320 312 static __always_inline 321 313 u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) 322 314 { 323 - if (cycles > last) 324 - return (cycles - last) * mult; 325 - return 0; 315 + /* 316 + * Due to the MSB/Sign-bit being used as invald marker (see 317 + * arch_vdso_cycles_valid() above), the effective mask is S64_MAX. 318 + */ 319 + u64 delta = (cycles - last) & S64_MAX; 320 + 321 + /* 322 + * Due to the above mentioned TSC wobbles, filter out negative motion. 323 + * Per the above masking, the effective sign bit is now bit 62. 324 + */ 325 + if (unlikely(delta & (1ULL << 62))) 326 + return 0; 327 + 328 + return delta * mult; 326 329 } 327 330 #define vdso_calc_delta vdso_calc_delta 328 331
+5 -18
arch/x86/kernel/itmt.c
··· 165 165 166 166 /** 167 167 * sched_set_itmt_core_prio() - Set CPU priority based on ITMT 168 - * @prio: Priority of cpu core 169 - * @core_cpu: The cpu number associated with the core 168 + * @prio: Priority of @cpu 169 + * @cpu: The CPU number 170 170 * 171 171 * The pstate driver will find out the max boost frequency 172 172 * and call this function to set a priority proportional 173 - * to the max boost frequency. CPU with higher boost 173 + * to the max boost frequency. CPUs with higher boost 174 174 * frequency will receive higher priority. 175 175 * 176 176 * No need to rebuild sched domain after updating 177 177 * the CPU priorities. The sched domains have no 178 178 * dependency on CPU priorities. 179 179 */ 180 - void sched_set_itmt_core_prio(int prio, int core_cpu) 180 + void sched_set_itmt_core_prio(int prio, int cpu) 181 181 { 182 - int cpu, i = 1; 183 - 184 - for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { 185 - int smt_prio; 186 - 187 - /* 188 - * Ensure that the siblings are moved to the end 189 - * of the priority chain and only used when 190 - * all other high priority cpus are out of capacity. 191 - */ 192 - smt_prio = prio * smp_num_siblings / (i * i); 193 - per_cpu(sched_core_priority, cpu) = smt_prio; 194 - i++; 195 - } 182 + per_cpu(sched_core_priority, cpu) = prio; 196 183 }
+2 -2
arch/x86/kernel/kvmclock.c
··· 71 71 return -ENODEV; 72 72 } 73 73 74 - static noinstr u64 kvm_clock_read(void) 74 + static u64 kvm_clock_read(void) 75 75 { 76 76 u64 ret; 77 77 ··· 88 88 89 89 static noinstr u64 kvm_sched_clock_read(void) 90 90 { 91 - return kvm_clock_read() - kvm_sched_clock_offset; 91 + return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset; 92 92 } 93 93 94 94 static inline void kvm_sched_clock_init(bool stable)
+47 -55
arch/x86/kernel/smpboot.c
··· 602 602 #ifdef CONFIG_SCHED_SMT 603 603 static int x86_smt_flags(void) 604 604 { 605 - return cpu_smt_flags() | x86_sched_itmt_flags(); 605 + return cpu_smt_flags(); 606 606 } 607 607 #endif 608 608 #ifdef CONFIG_SCHED_CLUSTER ··· 613 613 #endif 614 614 #endif 615 615 616 - static struct sched_domain_topology_level x86_numa_in_package_topology[] = { 617 - #ifdef CONFIG_SCHED_SMT 618 - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, 619 - #endif 620 - #ifdef CONFIG_SCHED_CLUSTER 621 - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, 622 - #endif 623 - #ifdef CONFIG_SCHED_MC 624 - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, 625 - #endif 626 - { NULL, }, 627 - }; 628 - 629 - static struct sched_domain_topology_level x86_hybrid_topology[] = { 630 - #ifdef CONFIG_SCHED_SMT 631 - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, 632 - #endif 633 - #ifdef CONFIG_SCHED_MC 634 - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, 635 - #endif 636 - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 637 - { NULL, }, 638 - }; 639 - 640 - static struct sched_domain_topology_level x86_topology[] = { 641 - #ifdef CONFIG_SCHED_SMT 642 - { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, 643 - #endif 644 - #ifdef CONFIG_SCHED_CLUSTER 645 - { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, 646 - #endif 647 - #ifdef CONFIG_SCHED_MC 648 - { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, 649 - #endif 650 - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 651 - { NULL, }, 652 - }; 653 - 654 616 /* 655 617 * Set if a package/die has multiple NUMA nodes inside. 656 618 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel 657 619 * Sub-NUMA Clustering have this. 
658 620 */ 659 621 static bool x86_has_numa_in_package; 622 + 623 + static struct sched_domain_topology_level x86_topology[6]; 624 + 625 + static void __init build_sched_topology(void) 626 + { 627 + int i = 0; 628 + 629 + #ifdef CONFIG_SCHED_SMT 630 + x86_topology[i++] = (struct sched_domain_topology_level){ 631 + cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) 632 + }; 633 + #endif 634 + #ifdef CONFIG_SCHED_CLUSTER 635 + /* 636 + * For now, skip the cluster domain on Hybrid. 637 + */ 638 + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { 639 + x86_topology[i++] = (struct sched_domain_topology_level){ 640 + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) 641 + }; 642 + } 643 + #endif 644 + #ifdef CONFIG_SCHED_MC 645 + x86_topology[i++] = (struct sched_domain_topology_level){ 646 + cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) 647 + }; 648 + #endif 649 + /* 650 + * When there is NUMA topology inside the package skip the DIE domain 651 + * since the NUMA domains will auto-magically create the right spanning 652 + * domains based on the SLIT. 653 + */ 654 + if (!x86_has_numa_in_package) { 655 + x86_topology[i++] = (struct sched_domain_topology_level){ 656 + cpu_cpu_mask, SD_INIT_NAME(DIE) 657 + }; 658 + } 659 + 660 + /* 661 + * There must be one trailing NULL entry left. 662 + */ 663 + BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); 664 + 665 + set_sched_topology(x86_topology); 666 + } 660 667 661 668 void set_cpu_sibling_map(int cpu) 662 669 { ··· 1271 1264 zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); 1272 1265 } 1273 1266 1274 - /* 1275 - * Set 'default' x86 topology, this matches default_topology() in that 1276 - * it has NUMA nodes as a topology level. See also 1277 - * native_smp_cpus_done(). 1278 - * 1279 - * Must be done before set_cpus_sibling_map() is ran. 
1280 - */ 1281 - set_sched_topology(x86_topology); 1282 - 1283 1267 set_cpu_sibling_map(0); 1284 1268 } 1285 1269 ··· 1391 1393 pr_debug("Boot done\n"); 1392 1394 1393 1395 calculate_max_logical_packages(); 1394 - 1395 - /* XXX for now assume numa-in-package and hybrid don't overlap */ 1396 - if (x86_has_numa_in_package) 1397 - set_sched_topology(x86_numa_in_package_topology); 1398 - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) 1399 - set_sched_topology(x86_hybrid_topology); 1400 - 1396 + build_sched_topology(); 1401 1397 nmi_selftest(); 1402 1398 impress_friends(); 1403 1399 cache_aps_init();
+29 -9
arch/x86/kernel/tsc.c
··· 69 69 } 70 70 early_param("tsc_early_khz", tsc_early_khz_setup); 71 71 72 - __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) 72 + __always_inline void __cyc2ns_read(struct cyc2ns_data *data) 73 73 { 74 74 int seq, idx; 75 - 76 - preempt_disable_notrace(); 77 75 78 76 do { 79 77 seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); ··· 82 84 data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); 83 85 84 86 } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); 87 + } 88 + 89 + __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) 90 + { 91 + preempt_disable_notrace(); 92 + __cyc2ns_read(data); 85 93 } 86 94 87 95 __always_inline void cyc2ns_read_end(void) ··· 119 115 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 120 116 */ 121 117 122 - static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) 118 + static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) 123 119 { 124 120 struct cyc2ns_data data; 125 121 unsigned long long ns; 126 122 127 - cyc2ns_read_begin(&data); 123 + __cyc2ns_read(&data); 128 124 129 125 ns = data.cyc2ns_offset; 130 126 ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); 131 127 132 - cyc2ns_read_end(); 128 + return ns; 129 + } 133 130 131 + static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) 132 + { 133 + unsigned long long ns; 134 + preempt_disable_notrace(); 135 + ns = __cycles_2_ns(cyc); 136 + preempt_enable_notrace(); 134 137 return ns; 135 138 } 136 139 ··· 234 223 u64 tsc_now = rdtsc(); 235 224 236 225 /* return the value in ns */ 237 - return cycles_2_ns(tsc_now); 226 + return __cycles_2_ns(tsc_now); 238 227 } 239 228 240 229 /* ··· 261 250 /* We need to define a real function for sched_clock, to override the 262 251 weak default version */ 263 252 #ifdef CONFIG_PARAVIRT 264 - noinstr u64 sched_clock(void) 253 + noinstr u64 sched_clock_noinstr(void) 265 254 { 266 255 return 
paravirt_sched_clock(); 267 256 } ··· 271 260 return static_call_query(pv_sched_clock) == native_sched_clock; 272 261 } 273 262 #else 274 - u64 sched_clock(void) __attribute__((alias("native_sched_clock"))); 263 + u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); 275 264 276 265 bool using_native_sched_clock(void) { return true; } 277 266 #endif 267 + 268 + notrace u64 sched_clock(void) 269 + { 270 + u64 now; 271 + preempt_disable_notrace(); 272 + now = sched_clock_noinstr(); 273 + preempt_enable_notrace(); 274 + return now; 275 + } 278 276 279 277 int check_tsc_unstable(void) 280 278 {
+3 -4
arch/x86/kvm/x86.c
··· 2799 2799 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, 2800 2800 int *mode) 2801 2801 { 2802 - long v; 2803 2802 u64 tsc_pg_val; 2803 + long v; 2804 2804 2805 2805 switch (clock->vclock_mode) { 2806 2806 case VDSO_CLOCKMODE_HVCLOCK: 2807 - tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), 2808 - tsc_timestamp); 2809 - if (tsc_pg_val != U64_MAX) { 2807 + if (hv_read_tsc_page_tsc(hv_get_tsc_page(), 2808 + tsc_timestamp, &tsc_pg_val)) { 2810 2809 /* TSC page valid */ 2811 2810 *mode = VDSO_CLOCKMODE_HVCLOCK; 2812 2811 v = (tsc_pg_val - clock->cycle_last) &
+1 -2
arch/x86/xen/time.c
··· 66 66 struct pvclock_vcpu_time_info *src; 67 67 u64 ret; 68 68 69 - preempt_disable_notrace(); 70 69 src = &__this_cpu_read(xen_vcpu)->time; 71 70 ret = pvclock_clocksource_read_nowd(src); 72 71 ret -= xen_sched_clock_offset; 73 - preempt_enable_notrace(); 72 + 74 73 return ret; 75 74 } 76 75
+40 -14
drivers/clocksource/arm_arch_timer.c
··· 191 191 return val; 192 192 } 193 193 194 - static notrace u64 arch_counter_get_cntpct_stable(void) 194 + static noinstr u64 raw_counter_get_cntpct_stable(void) 195 195 { 196 196 return __arch_counter_get_cntpct_stable(); 197 197 } 198 198 199 - static notrace u64 arch_counter_get_cntpct(void) 199 + static notrace u64 arch_counter_get_cntpct_stable(void) 200 + { 201 + u64 val; 202 + preempt_disable_notrace(); 203 + val = __arch_counter_get_cntpct_stable(); 204 + preempt_enable_notrace(); 205 + return val; 206 + } 207 + 208 + static noinstr u64 arch_counter_get_cntpct(void) 200 209 { 201 210 return __arch_counter_get_cntpct(); 202 211 } 203 212 204 - static notrace u64 arch_counter_get_cntvct_stable(void) 213 + static noinstr u64 raw_counter_get_cntvct_stable(void) 205 214 { 206 215 return __arch_counter_get_cntvct_stable(); 207 216 } 208 217 209 - static notrace u64 arch_counter_get_cntvct(void) 218 + static notrace u64 arch_counter_get_cntvct_stable(void) 219 + { 220 + u64 val; 221 + preempt_disable_notrace(); 222 + val = __arch_counter_get_cntvct_stable(); 223 + preempt_enable_notrace(); 224 + return val; 225 + } 226 + 227 + static noinstr u64 arch_counter_get_cntvct(void) 210 228 { 211 229 return __arch_counter_get_cntvct(); 212 230 } ··· 771 753 return 0; 772 754 } 773 755 774 - static u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) 756 + static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) 775 757 { 776 758 u32 cnt_lo, cnt_hi, tmp_hi; 777 759 778 760 do { 779 - cnt_hi = readl_relaxed(t->base + offset_lo + 4); 780 - cnt_lo = readl_relaxed(t->base + offset_lo); 781 - tmp_hi = readl_relaxed(t->base + offset_lo + 4); 761 + cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); 762 + cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); 763 + tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); 782 764 } while (cnt_hi != tmp_hi); 783 765 784 766 
return ((u64) cnt_hi << 32) | cnt_lo; ··· 1078 1060 return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); 1079 1061 } 1080 1062 1081 - static u64 arch_counter_get_cntvct_mem(void) 1063 + static noinstr u64 arch_counter_get_cntvct_mem(void) 1082 1064 { 1083 1065 return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); 1084 1066 } ··· 1092 1074 1093 1075 static void __init arch_counter_register(unsigned type) 1094 1076 { 1077 + u64 (*scr)(void); 1095 1078 u64 start_count; 1096 1079 int width; 1097 1080 ··· 1102 1083 1103 1084 if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || 1104 1085 arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { 1105 - if (arch_timer_counter_has_wa()) 1086 + if (arch_timer_counter_has_wa()) { 1106 1087 rd = arch_counter_get_cntvct_stable; 1107 - else 1088 + scr = raw_counter_get_cntvct_stable; 1089 + } else { 1108 1090 rd = arch_counter_get_cntvct; 1091 + scr = arch_counter_get_cntvct; 1092 + } 1109 1093 } else { 1110 - if (arch_timer_counter_has_wa()) 1094 + if (arch_timer_counter_has_wa()) { 1111 1095 rd = arch_counter_get_cntpct_stable; 1112 - else 1096 + scr = raw_counter_get_cntpct_stable; 1097 + } else { 1113 1098 rd = arch_counter_get_cntpct; 1099 + scr = arch_counter_get_cntpct; 1100 + } 1114 1101 } 1115 1102 1116 1103 arch_timer_read_counter = rd; 1117 1104 clocksource_counter.vdso_clock_mode = vdso_default; 1118 1105 } else { 1119 1106 arch_timer_read_counter = arch_counter_get_cntvct_mem; 1107 + scr = arch_counter_get_cntvct_mem; 1120 1108 } 1121 1109 1122 1110 width = arch_counter_get_width(); ··· 1139 1113 timecounter_init(&arch_timer_kvm_info.timecounter, 1140 1114 &cyclecounter, start_count); 1141 1115 1142 - sched_clock_register(arch_timer_read_counter, width, arch_timer_rate); 1116 + sched_clock_register(scr, width, arch_timer_rate); 1143 1117 } 1144 1118 1145 1119 static void arch_timer_stop(struct clock_event_device *clk)
+26 -16
drivers/clocksource/hyperv_timer.c
··· 365 365 } 366 366 EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 367 367 368 + static __always_inline u64 read_hv_clock_msr(void) 369 + { 370 + /* 371 + * Read the partition counter to get the current tick count. This count 372 + * is set to 0 when the partition is created and is incremented in 100 373 + * nanosecond units. 374 + * 375 + * Use hv_raw_get_register() because this function is used from 376 + * noinstr. Notable; while HV_REGISTER_TIME_REF_COUNT is a synthetic 377 + * register it doesn't need the GHCB path. 378 + */ 379 + return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT); 380 + } 381 + 368 382 /* 369 383 * Code and definitions for the Hyper-V clocksources. Two 370 384 * clocksources are defined: one that reads the Hyper-V defined MSR, and ··· 407 393 } 408 394 EXPORT_SYMBOL_GPL(hv_get_tsc_page); 409 395 410 - static u64 notrace read_hv_clock_tsc(void) 396 + static __always_inline u64 read_hv_clock_tsc(void) 411 397 { 412 - u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); 398 + u64 cur_tsc, time; 413 399 414 - if (current_tick == U64_MAX) 415 - current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT); 400 + /* 401 + * The Hyper-V Top-Level Function Spec (TLFS), section Timers, 402 + * subsection Refererence Counter, guarantees that the TSC and MSR 403 + * times are in sync and monotonic. Therefore we can fall back 404 + * to the MSR in case the TSC page indicates unavailability. 
405 + */ 406 + if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time)) 407 + time = read_hv_clock_msr(); 416 408 417 - return current_tick; 409 + return time; 418 410 } 419 411 420 412 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) ··· 428 408 return read_hv_clock_tsc(); 429 409 } 430 410 431 - static u64 notrace read_hv_sched_clock_tsc(void) 411 + static u64 noinstr read_hv_sched_clock_tsc(void) 432 412 { 433 413 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 434 414 (NSEC_PER_SEC / HV_CLOCK_HZ); ··· 479 459 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 480 460 #endif 481 461 }; 482 - 483 - static u64 notrace read_hv_clock_msr(void) 484 - { 485 - /* 486 - * Read the partition counter to get the current tick count. This count 487 - * is set to 0 when the partition is created and is incremented in 488 - * 100 nanosecond units. 489 - */ 490 - return hv_get_register(HV_REGISTER_TIME_REF_COUNT); 491 - } 492 462 493 463 static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 494 464 {
+4 -4
drivers/cpuidle/cpuidle.c
··· 145 145 146 146 instrumentation_begin(); 147 147 148 - time_start = ns_to_ktime(local_clock()); 148 + time_start = ns_to_ktime(local_clock_noinstr()); 149 149 150 150 tick_freeze(); 151 151 /* ··· 169 169 tick_unfreeze(); 170 170 start_critical_timings(); 171 171 172 - time_end = ns_to_ktime(local_clock()); 172 + time_end = ns_to_ktime(local_clock_noinstr()); 173 173 174 174 dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start); 175 175 dev->states_usage[index].s2idle_usage++; ··· 243 243 sched_idle_set_state(target_state); 244 244 245 245 trace_cpu_idle(index, dev->cpu); 246 - time_start = ns_to_ktime(local_clock()); 246 + time_start = ns_to_ktime(local_clock_noinstr()); 247 247 248 248 stop_critical_timings(); 249 249 if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) { ··· 276 276 start_critical_timings(); 277 277 278 278 sched_clock_idle_wakeup_event(); 279 - time_end = ns_to_ktime(local_clock()); 279 + time_end = ns_to_ktime(local_clock_noinstr()); 280 280 trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu); 281 281 282 282 /* The cpu is no longer idle or about to enter idle. */
+2 -2
drivers/cpuidle/poll_state.c
··· 15 15 { 16 16 u64 time_start; 17 17 18 - time_start = local_clock(); 18 + time_start = local_clock_noinstr(); 19 19 20 20 dev->poll_time_limit = false; 21 21 ··· 32 32 continue; 33 33 34 34 loop_count = 0; 35 - if (local_clock() - time_start > limit) { 35 + if (local_clock_noinstr() - time_start > limit) { 36 36 dev->poll_time_limit = true; 37 37 break; 38 38 }
+9 -15
include/clocksource/hyperv_timer.h
··· 38 38 extern unsigned long hv_get_tsc_pfn(void); 39 39 extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); 40 40 41 - static inline notrace u64 42 - hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc) 41 + static __always_inline bool 42 + hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, 43 + u64 *cur_tsc, u64 *time) 43 44 { 44 45 u64 scale, offset; 45 46 u32 sequence; ··· 64 63 do { 65 64 sequence = READ_ONCE(tsc_pg->tsc_sequence); 66 65 if (!sequence) 67 - return U64_MAX; 66 + return false; 68 67 /* 69 68 * Make sure we read sequence before we read other values from 70 69 * TSC page. ··· 83 82 84 83 } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); 85 84 86 - return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; 87 - } 88 - 89 - static inline notrace u64 90 - hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) 91 - { 92 - u64 cur_tsc; 93 - 94 - return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); 85 + *time = mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; 86 + return true; 95 87 } 96 88 97 89 #else /* CONFIG_HYPERV_TIMER */ ··· 98 104 return NULL; 99 105 } 100 106 101 - static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, 102 - u64 *cur_tsc) 107 + static __always_inline bool 108 + hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time) 103 109 { 104 - return U64_MAX; 110 + return false; 105 111 } 106 112 107 113 static inline int hv_stimer_cleanup(unsigned int cpu) { return 0; }
+1
include/linux/kthread.h
··· 89 89 bool kthread_should_stop(void); 90 90 bool kthread_should_park(void); 91 91 bool __kthread_should_park(struct task_struct *k); 92 + bool kthread_should_stop_or_park(void); 92 93 bool kthread_freezable_should_stop(bool *was_frozen); 93 94 void *kthread_func(struct task_struct *k); 94 95 void *kthread_data(struct task_struct *k);
+1 -1
include/linux/math64.h
··· 168 168 #endif /* mul_u64_u32_shr */ 169 169 170 170 #ifndef mul_u64_u64_shr 171 - static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) 171 + static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) 172 172 { 173 173 return (u64)(((unsigned __int128)a * mul) >> shift); 174 174 }
+1 -1
include/linux/rbtree_latch.h
··· 206 206 do { 207 207 seq = raw_read_seqcount_latch(&root->seq); 208 208 node = __lt_find(key, root, seq & 1, ops->comp); 209 - } while (read_seqcount_latch_retry(&root->seq, seq)); 209 + } while (raw_read_seqcount_latch_retry(&root->seq, seq)); 210 210 211 211 return node; 212 212 }
+2 -5
include/linux/sched.h
··· 2006 2006 */ 2007 2007 preempt_fold_need_resched(); 2008 2008 } 2009 - extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); 2010 2009 #else 2011 2010 static inline void scheduler_ipi(void) { } 2012 - static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) 2013 - { 2014 - return 1; 2015 - } 2016 2011 #endif 2012 + 2013 + extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); 2017 2014 2018 2015 /* 2019 2016 * Set thread flags in other task's structures.
+16 -1
include/linux/sched/clock.h
··· 12 12 * 13 13 * Please use one of the three interfaces below. 14 14 */ 15 - extern unsigned long long notrace sched_clock(void); 15 + extern u64 sched_clock(void); 16 + 17 + #if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK) 18 + extern u64 sched_clock_noinstr(void); 19 + #else 20 + static __always_inline u64 sched_clock_noinstr(void) 21 + { 22 + return sched_clock(); 23 + } 24 + #endif 16 25 17 26 /* 18 27 * See the comment in kernel/sched/clock.c ··· 52 43 static inline u64 cpu_clock(int cpu) 53 44 { 54 45 return sched_clock(); 46 + } 47 + 48 + static __always_inline u64 local_clock_noinstr(void) 49 + { 50 + return sched_clock_noinstr(); 55 51 } 56 52 57 53 static __always_inline u64 local_clock(void) ··· 93 79 return sched_clock_cpu(cpu); 94 80 } 95 81 82 + extern u64 local_clock_noinstr(void); 96 83 extern u64 local_clock(void); 97 84 98 85 #endif
+1 -4
include/linux/sched/sd_flags.h
··· 132 132 /* 133 133 * Place busy tasks earlier in the domain 134 134 * 135 - * SHARED_CHILD: Usually set on the SMT level. Technically could be set further 136 - * up, but currently assumed to be set from the base domain 137 - * upwards (see update_top_cache_domain()). 138 135 * NEEDS_GROUPS: Load balancing flag. 139 136 */ 140 - SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) 137 + SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) 141 138 142 139 /* 143 140 * Prefer to place tasks in a sibling domain
+1 -1
include/linux/sched/topology.h
··· 203 203 #endif 204 204 }; 205 205 206 - extern void set_sched_topology(struct sched_domain_topology_level *tl); 206 + extern void __init set_sched_topology(struct sched_domain_topology_level *tl); 207 207 208 208 #ifdef CONFIG_SCHED_DEBUG 209 209 # define SD_INIT_NAME(type) .name = #type
+8 -7
include/linux/seqlock.h
··· 671 671 * 672 672 * Return: sequence counter raw value. Use the lowest bit as an index for 673 673 * picking which data copy to read. The full counter must then be checked 674 - * with read_seqcount_latch_retry(). 674 + * with raw_read_seqcount_latch_retry(). 675 675 */ 676 - static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) 676 + static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) 677 677 { 678 678 /* 679 679 * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). ··· 683 683 } 684 684 685 685 /** 686 - * read_seqcount_latch_retry() - end a seqcount_latch_t read section 686 + * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section 687 687 * @s: Pointer to seqcount_latch_t 688 688 * @start: count, from raw_read_seqcount_latch() 689 689 * 690 690 * Return: true if a read section retry is required, else false 691 691 */ 692 - static inline int 693 - read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) 692 + static __always_inline int 693 + raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) 694 694 { 695 - return read_seqcount_retry(&s->seqcount, start); 695 + smp_rmb(); 696 + return unlikely(READ_ONCE(s->seqcount.sequence) != start); 696 697 } 697 698 698 699 /** ··· 753 752 * entry = data_query(latch->data[idx], ...); 754 753 * 755 754 * // This includes needed smp_rmb() 756 - * } while (read_seqcount_latch_retry(&latch->seq, seq)); 755 + * } while (raw_read_seqcount_latch_retry(&latch->seq, seq)); 757 756 * 758 757 * return entry; 759 758 * }
+12
kernel/cgroup/cgroup.c
··· 3891 3891 return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); 3892 3892 } 3893 3893 3894 + static int cgroup_pressure_open(struct kernfs_open_file *of) 3895 + { 3896 + if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) 3897 + return -EPERM; 3898 + 3899 + return 0; 3900 + } 3901 + 3894 3902 static void cgroup_pressure_release(struct kernfs_open_file *of) 3895 3903 { 3896 3904 struct cgroup_file_ctx *ctx = of->priv; ··· 5298 5290 { 5299 5291 .name = "io.pressure", 5300 5292 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), 5293 + .open = cgroup_pressure_open, 5301 5294 .seq_show = cgroup_io_pressure_show, 5302 5295 .write = cgroup_io_pressure_write, 5303 5296 .poll = cgroup_pressure_poll, ··· 5307 5298 { 5308 5299 .name = "memory.pressure", 5309 5300 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), 5301 + .open = cgroup_pressure_open, 5310 5302 .seq_show = cgroup_memory_pressure_show, 5311 5303 .write = cgroup_memory_pressure_write, 5312 5304 .poll = cgroup_pressure_poll, ··· 5316 5306 { 5317 5307 .name = "cpu.pressure", 5318 5308 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), 5309 + .open = cgroup_pressure_open, 5319 5310 .seq_show = cgroup_cpu_pressure_show, 5320 5311 .write = cgroup_cpu_pressure_write, 5321 5312 .poll = cgroup_pressure_poll, ··· 5326 5315 { 5327 5316 .name = "irq.pressure", 5328 5317 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), 5318 + .open = cgroup_pressure_open, 5329 5319 .seq_show = cgroup_irq_pressure_show, 5330 5320 .write = cgroup_irq_pressure_write, 5331 5321 .poll = cgroup_pressure_poll,
+10
kernel/kthread.c
··· 182 182 } 183 183 EXPORT_SYMBOL_GPL(kthread_should_park); 184 184 185 + bool kthread_should_stop_or_park(void) 186 + { 187 + struct kthread *kthread = __to_kthread(current); 188 + 189 + if (!kthread) 190 + return false; 191 + 192 + return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); 193 + } 194 + 185 195 /** 186 196 * kthread_freezable_should_stop - should this freezable kthread return now? 187 197 * @was_frozen: optional out parameter, indicates whether %current was frozen
+1 -1
kernel/printk/printk.c
··· 528 528 seq = raw_read_seqcount_latch(&ls->latch); 529 529 idx = seq & 0x1; 530 530 val = ls->val[idx]; 531 - } while (read_seqcount_latch_retry(&ls->latch, seq)); 531 + } while (raw_read_seqcount_latch_retry(&ls->latch, seq)); 532 532 533 533 return val; 534 534 }
+13 -6
kernel/sched/clock.c
··· 266 266 s64 delta; 267 267 268 268 again: 269 - now = sched_clock(); 269 + now = sched_clock_noinstr(); 270 270 delta = now - scd->tick_raw; 271 271 if (unlikely(delta < 0)) 272 272 delta = 0; ··· 293 293 return clock; 294 294 } 295 295 296 - noinstr u64 local_clock(void) 296 + noinstr u64 local_clock_noinstr(void) 297 297 { 298 298 u64 clock; 299 299 300 300 if (static_branch_likely(&__sched_clock_stable)) 301 - return sched_clock() + __sched_clock_offset; 301 + return sched_clock_noinstr() + __sched_clock_offset; 302 302 303 303 if (!static_branch_likely(&sched_clock_running)) 304 - return sched_clock(); 304 + return sched_clock_noinstr(); 305 305 306 - preempt_disable_notrace(); 307 306 clock = sched_clock_local(this_scd()); 308 - preempt_enable_notrace(); 309 307 310 308 return clock; 309 + } 310 + 311 + u64 local_clock(void) 312 + { 313 + u64 now; 314 + preempt_disable_notrace(); 315 + now = local_clock_noinstr(); 316 + preempt_enable_notrace(); 317 + return now; 311 318 } 312 319 EXPORT_SYMBOL_GPL(local_clock); 313 320
+158 -120
kernel/sched/core.c
··· 2213 2213 rq_clock_skip_update(rq); 2214 2214 } 2215 2215 2216 + static __always_inline 2217 + int __task_state_match(struct task_struct *p, unsigned int state) 2218 + { 2219 + if (READ_ONCE(p->__state) & state) 2220 + return 1; 2221 + 2222 + #ifdef CONFIG_PREEMPT_RT 2223 + if (READ_ONCE(p->saved_state) & state) 2224 + return -1; 2225 + #endif 2226 + return 0; 2227 + } 2228 + 2229 + static __always_inline 2230 + int task_state_match(struct task_struct *p, unsigned int state) 2231 + { 2232 + #ifdef CONFIG_PREEMPT_RT 2233 + int match; 2234 + 2235 + /* 2236 + * Serialize against current_save_and_set_rtlock_wait_state() and 2237 + * current_restore_rtlock_saved_state(). 2238 + */ 2239 + raw_spin_lock_irq(&p->pi_lock); 2240 + match = __task_state_match(p, state); 2241 + raw_spin_unlock_irq(&p->pi_lock); 2242 + 2243 + return match; 2244 + #else 2245 + return __task_state_match(p, state); 2246 + #endif 2247 + } 2248 + 2249 + /* 2250 + * wait_task_inactive - wait for a thread to unschedule. 2251 + * 2252 + * Wait for the thread to block in any of the states set in @match_state. 2253 + * If it changes, i.e. @p might have woken up, then return zero. When we 2254 + * succeed in waiting for @p to be off its CPU, we return a positive number 2255 + * (its total switch count). If a second call a short while later returns the 2256 + * same number, the caller can be sure that @p has remained unscheduled the 2257 + * whole time. 2258 + * 2259 + * The caller must ensure that the task *will* unschedule sometime soon, 2260 + * else this function might spin for a *long* time. This function can't 2261 + * be called with interrupts off, or it may introduce deadlock with 2262 + * smp_call_function() if an IPI is sent by the same process we are 2263 + * waiting to become inactive. 
2264 + */ 2265 + unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) 2266 + { 2267 + int running, queued, match; 2268 + struct rq_flags rf; 2269 + unsigned long ncsw; 2270 + struct rq *rq; 2271 + 2272 + for (;;) { 2273 + /* 2274 + * We do the initial early heuristics without holding 2275 + * any task-queue locks at all. We'll only try to get 2276 + * the runqueue lock when things look like they will 2277 + * work out! 2278 + */ 2279 + rq = task_rq(p); 2280 + 2281 + /* 2282 + * If the task is actively running on another CPU 2283 + * still, just relax and busy-wait without holding 2284 + * any locks. 2285 + * 2286 + * NOTE! Since we don't hold any locks, it's not 2287 + * even sure that "rq" stays as the right runqueue! 2288 + * But we don't care, since "task_on_cpu()" will 2289 + * return false if the runqueue has changed and p 2290 + * is actually now running somewhere else! 2291 + */ 2292 + while (task_on_cpu(rq, p)) { 2293 + if (!task_state_match(p, match_state)) 2294 + return 0; 2295 + cpu_relax(); 2296 + } 2297 + 2298 + /* 2299 + * Ok, time to look more closely! We need the rq 2300 + * lock now, to be *sure*. If we're wrong, we'll 2301 + * just go back and repeat. 2302 + */ 2303 + rq = task_rq_lock(p, &rf); 2304 + trace_sched_wait_task(p); 2305 + running = task_on_cpu(rq, p); 2306 + queued = task_on_rq_queued(p); 2307 + ncsw = 0; 2308 + if ((match = __task_state_match(p, match_state))) { 2309 + /* 2310 + * When matching on p->saved_state, consider this task 2311 + * still queued so it will wait. 2312 + */ 2313 + if (match < 0) 2314 + queued = 1; 2315 + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2316 + } 2317 + task_rq_unlock(rq, p, &rf); 2318 + 2319 + /* 2320 + * If it changed from the expected state, bail out now. 2321 + */ 2322 + if (unlikely(!ncsw)) 2323 + break; 2324 + 2325 + /* 2326 + * Was it really running after all now that we 2327 + * checked with the proper locks actually held? 2328 + * 2329 + * Oops. 
Go back and try again.. 2330 + */ 2331 + if (unlikely(running)) { 2332 + cpu_relax(); 2333 + continue; 2334 + } 2335 + 2336 + /* 2337 + * It's not enough that it's not actively running, 2338 + * it must be off the runqueue _entirely_, and not 2339 + * preempted! 2340 + * 2341 + * So if it was still runnable (but just not actively 2342 + * running right now), it's preempted, and we should 2343 + * yield - it could be a while. 2344 + */ 2345 + if (unlikely(queued)) { 2346 + ktime_t to = NSEC_PER_SEC / HZ; 2347 + 2348 + set_current_state(TASK_UNINTERRUPTIBLE); 2349 + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); 2350 + continue; 2351 + } 2352 + 2353 + /* 2354 + * Ahh, all good. It wasn't running, and it wasn't 2355 + * runnable, which means that it will never become 2356 + * running in the future either. We're all done! 2357 + */ 2358 + break; 2359 + } 2360 + 2361 + return ncsw; 2362 + } 2363 + 2216 2364 #ifdef CONFIG_SMP 2217 2365 2218 2366 static void ··· 2546 2398 if (!is_cpu_allowed(p, dest_cpu)) 2547 2399 return rq; 2548 2400 2549 - update_rq_clock(rq); 2550 2401 rq = move_queued_task(rq, rf, p, dest_cpu); 2551 2402 2552 2403 return rq; ··· 2603 2456 goto out; 2604 2457 } 2605 2458 2606 - if (task_on_rq_queued(p)) 2459 + if (task_on_rq_queued(p)) { 2460 + update_rq_clock(rq); 2607 2461 rq = __migrate_task(rq, &rf, p, arg->dest_cpu); 2608 - else 2462 + } else { 2609 2463 p->wake_cpu = arg->dest_cpu; 2464 + } 2610 2465 2611 2466 /* 2612 2467 * XXX __migrate_task() can fail, at which point we might end ··· 3490 3341 } 3491 3342 #endif /* CONFIG_NUMA_BALANCING */ 3492 3343 3493 - /* 3494 - * wait_task_inactive - wait for a thread to unschedule. 3495 - * 3496 - * Wait for the thread to block in any of the states set in @match_state. 3497 - * If it changes, i.e. @p might have woken up, then return zero. When we 3498 - * succeed in waiting for @p to be off its CPU, we return a positive number 3499 - * (its total switch count). 
If a second call a short while later returns the 3500 - * same number, the caller can be sure that @p has remained unscheduled the 3501 - * whole time. 3502 - * 3503 - * The caller must ensure that the task *will* unschedule sometime soon, 3504 - * else this function might spin for a *long* time. This function can't 3505 - * be called with interrupts off, or it may introduce deadlock with 3506 - * smp_call_function() if an IPI is sent by the same process we are 3507 - * waiting to become inactive. 3508 - */ 3509 - unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) 3510 - { 3511 - int running, queued; 3512 - struct rq_flags rf; 3513 - unsigned long ncsw; 3514 - struct rq *rq; 3515 - 3516 - for (;;) { 3517 - /* 3518 - * We do the initial early heuristics without holding 3519 - * any task-queue locks at all. We'll only try to get 3520 - * the runqueue lock when things look like they will 3521 - * work out! 3522 - */ 3523 - rq = task_rq(p); 3524 - 3525 - /* 3526 - * If the task is actively running on another CPU 3527 - * still, just relax and busy-wait without holding 3528 - * any locks. 3529 - * 3530 - * NOTE! Since we don't hold any locks, it's not 3531 - * even sure that "rq" stays as the right runqueue! 3532 - * But we don't care, since "task_on_cpu()" will 3533 - * return false if the runqueue has changed and p 3534 - * is actually now running somewhere else! 3535 - */ 3536 - while (task_on_cpu(rq, p)) { 3537 - if (!(READ_ONCE(p->__state) & match_state)) 3538 - return 0; 3539 - cpu_relax(); 3540 - } 3541 - 3542 - /* 3543 - * Ok, time to look more closely! We need the rq 3544 - * lock now, to be *sure*. If we're wrong, we'll 3545 - * just go back and repeat. 
3546 - */ 3547 - rq = task_rq_lock(p, &rf); 3548 - trace_sched_wait_task(p); 3549 - running = task_on_cpu(rq, p); 3550 - queued = task_on_rq_queued(p); 3551 - ncsw = 0; 3552 - if (READ_ONCE(p->__state) & match_state) 3553 - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 3554 - task_rq_unlock(rq, p, &rf); 3555 - 3556 - /* 3557 - * If it changed from the expected state, bail out now. 3558 - */ 3559 - if (unlikely(!ncsw)) 3560 - break; 3561 - 3562 - /* 3563 - * Was it really running after all now that we 3564 - * checked with the proper locks actually held? 3565 - * 3566 - * Oops. Go back and try again.. 3567 - */ 3568 - if (unlikely(running)) { 3569 - cpu_relax(); 3570 - continue; 3571 - } 3572 - 3573 - /* 3574 - * It's not enough that it's not actively running, 3575 - * it must be off the runqueue _entirely_, and not 3576 - * preempted! 3577 - * 3578 - * So if it was still runnable (but just not actively 3579 - * running right now), it's preempted, and we should 3580 - * yield - it could be a while. 3581 - */ 3582 - if (unlikely(queued)) { 3583 - ktime_t to = NSEC_PER_SEC / HZ; 3584 - 3585 - set_current_state(TASK_UNINTERRUPTIBLE); 3586 - schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); 3587 - continue; 3588 - } 3589 - 3590 - /* 3591 - * Ahh, all good. It wasn't running, and it wasn't 3592 - * runnable, which means that it will never become 3593 - * running in the future either. We're all done! 
3594 - */ 3595 - break; 3596 - } 3597 - 3598 - return ncsw; 3599 - } 3600 - 3601 3344 /*** 3602 3345 * kick_process - kick a running thread to enter/exit the kernel 3603 3346 * @p: the to-be-kicked thread ··· 4044 4003 static __always_inline 4045 4004 bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) 4046 4005 { 4006 + int match; 4007 + 4047 4008 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { 4048 4009 WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && 4049 4010 state != TASK_RTLOCK_WAIT); 4050 4011 } 4051 4012 4052 - if (READ_ONCE(p->__state) & state) { 4053 - *success = 1; 4054 - return true; 4055 - } 4013 + *success = !!(match = __task_state_match(p, state)); 4056 4014 4057 4015 #ifdef CONFIG_PREEMPT_RT 4058 4016 /* ··· 4067 4027 * p::saved_state to TASK_RUNNING so any further tests will 4068 4028 * not result in false positives vs. @success 4069 4029 */ 4070 - if (p->saved_state & state) { 4030 + if (match < 0) 4071 4031 p->saved_state = TASK_RUNNING; 4072 - *success = 1; 4073 - } 4074 4032 #endif 4075 - return false; 4033 + return match > 0; 4076 4034 } 4077 4035 4078 4036 /* ··· 9586 9548 if (rq->online) { 9587 9549 const struct sched_class *class; 9588 9550 9551 + update_rq_clock(rq); 9589 9552 for_each_class(class) { 9590 9553 if (class->rq_offline) 9591 9554 class->rq_offline(rq); ··· 9728 9689 9729 9690 rq_lock_irqsave(rq, &rf); 9730 9691 if (rq->rd) { 9731 - update_rq_clock(rq); 9732 9692 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 9733 9693 set_rq_offline(rq); 9734 9694 }
+2 -1
kernel/sched/cpufreq_schedutil.c
··· 155 155 156 156 static void sugov_get_util(struct sugov_cpu *sg_cpu) 157 157 { 158 + unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); 158 159 struct rq *rq = cpu_rq(sg_cpu->cpu); 159 160 160 161 sg_cpu->bw_dl = cpu_bw_dl(rq); 161 - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), 162 + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, 162 163 FREQUENCY_UTIL, NULL); 163 164 } 164 165
+23 -34
kernel/sched/deadline.c
··· 489 489 490 490 static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 491 491 492 - void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) 493 - { 494 - raw_spin_lock_init(&dl_b->dl_runtime_lock); 495 - dl_b->dl_period = period; 496 - dl_b->dl_runtime = runtime; 497 - } 498 - 499 492 void init_dl_bw(struct dl_bw *dl_b) 500 493 { 501 494 raw_spin_lock_init(&dl_b->lock); ··· 1253 1260 } 1254 1261 1255 1262 /* 1256 - * This function implements the GRUB accounting rule: 1257 - * according to the GRUB reclaiming algorithm, the runtime is 1258 - * not decreased as "dq = -dt", but as 1259 - * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", 1263 + * This function implements the GRUB accounting rule. According to the 1264 + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt", 1265 + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt", 1260 1266 * where u is the utilization of the task, Umax is the maximum reclaimable 1261 1267 * utilization, Uinact is the (per-runqueue) inactive utilization, computed 1262 1268 * as the difference between the "total runqueue utilization" and the 1263 - * runqueue active utilization, and Uextra is the (per runqueue) extra 1269 + * "runqueue active utilization", and Uextra is the (per runqueue) extra 1264 1270 * reclaimable utilization. 1265 - * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations 1266 - * multiplied by 2^BW_SHIFT, the result has to be shifted right by 1267 - * BW_SHIFT. 1268 - * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, 1269 - * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. 1270 - * Since delta is a 64 bit variable, to have an overflow its value 1271 - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. 1272 - * So, overflow is not an issue here. 
1271 + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied 1272 + * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. 1273 + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw 1274 + * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. 1275 + * Since delta is a 64 bit variable, to have an overflow its value should be 1276 + * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is 1277 + * not an issue here. 1273 1278 */ 1274 1279 static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) 1275 1280 { 1276 - u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ 1277 1281 u64 u_act; 1278 - u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; 1282 + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ 1279 1283 1280 1284 /* 1281 - * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, 1282 - * we compare u_inact + rq->dl.extra_bw with 1283 - * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because 1284 - * u_inact + rq->dl.extra_bw can be larger than 1285 - * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative 1286 - * leading to wrong results) 1285 + * Instead of computing max{u, (u_max - u_inact - u_extra)}, we 1286 + * compare u_inact + u_extra with u_max - u, because u_inact + u_extra 1287 + * can be larger than u_max. So, u_max - u_inact - u_extra would be 1288 + * negative leading to wrong results. 
1287 1289 */ 1288 - if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) 1289 - u_act = u_act_min; 1290 + if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw) 1291 + u_act = dl_se->dl_bw; 1290 1292 else 1291 - u_act = BW_UNIT - u_inact - rq->dl.extra_bw; 1293 + u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw; 1292 1294 1295 + u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT; 1293 1296 return (delta * u_act) >> BW_SHIFT; 1294 1297 } 1295 1298 ··· 2784 2795 { 2785 2796 if (global_rt_runtime() == RUNTIME_INF) { 2786 2797 dl_rq->bw_ratio = 1 << RATIO_SHIFT; 2787 - dl_rq->extra_bw = 1 << BW_SHIFT; 2798 + dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT; 2788 2799 } else { 2789 2800 dl_rq->bw_ratio = to_ratio(global_rt_runtime(), 2790 2801 global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); 2791 - dl_rq->extra_bw = to_ratio(global_rt_period(), 2792 - global_rt_runtime()); 2802 + dl_rq->max_bw = dl_rq->extra_bw = 2803 + to_ratio(global_rt_period(), global_rt_runtime()); 2793 2804 } 2794 2805 } 2795 2806
+1 -1
kernel/sched/debug.c
··· 777 777 #define P(x) \ 778 778 do { \ 779 779 if (sizeof(rq->x) == 4) \ 780 - SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ 780 + SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \ 781 781 else \ 782 782 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ 783 783 } while (0)
+203 -126
kernel/sched/fair.c
··· 1064 1064 * Scheduling class queueing methods: 1065 1065 */ 1066 1066 1067 + static inline bool is_core_idle(int cpu) 1068 + { 1069 + #ifdef CONFIG_SCHED_SMT 1070 + int sibling; 1071 + 1072 + for_each_cpu(sibling, cpu_smt_mask(cpu)) { 1073 + if (cpu == sibling) 1074 + continue; 1075 + 1076 + if (!idle_cpu(sibling)) 1077 + return false; 1078 + } 1079 + #endif 1080 + 1081 + return true; 1082 + } 1083 + 1067 1084 #ifdef CONFIG_NUMA 1068 1085 #define NUMA_IMBALANCE_MIN 2 1069 1086 ··· 1716 1699 enum numa_type node_type; 1717 1700 int idle_cpu; 1718 1701 }; 1719 - 1720 - static inline bool is_core_idle(int cpu) 1721 - { 1722 - #ifdef CONFIG_SCHED_SMT 1723 - int sibling; 1724 - 1725 - for_each_cpu(sibling, cpu_smt_mask(cpu)) { 1726 - if (cpu == sibling) 1727 - continue; 1728 - 1729 - if (!idle_cpu(sibling)) 1730 - return false; 1731 - } 1732 - #endif 1733 - 1734 - return true; 1735 - } 1736 1702 1737 1703 struct task_numa_env { 1738 1704 struct task_struct *p; ··· 5577 5577 rq_lock(rq, &rf); 5578 5578 5579 5579 /* 5580 + * Iterating over the list can trigger several call to 5581 + * update_rq_clock() in unthrottle_cfs_rq(). 5582 + * Do it once and skip the potential next ones. 5583 + */ 5584 + update_rq_clock(rq); 5585 + rq_clock_start_loop_update(rq); 5586 + 5587 + /* 5580 5588 * Since we hold rq lock we're safe from concurrent manipulation of 5581 5589 * the CSD list. However, this RCU critical section annotates the 5582 5590 * fact that we pair with sched_free_group_rcu(), so that we cannot ··· 5603 5595 5604 5596 rcu_read_unlock(); 5605 5597 5598 + rq_clock_stop_loop_update(rq); 5606 5599 rq_unlock(rq, &rf); 5607 5600 } 5608 5601 ··· 6124 6115 6125 6116 lockdep_assert_rq_held(rq); 6126 6117 6118 + /* 6119 + * The rq clock has already been updated in the 6120 + * set_rq_offline(), so we should skip updating 6121 + * the rq clock again in unthrottle_cfs_rq(). 
6122 + */ 6123 + rq_clock_start_loop_update(rq); 6124 + 6127 6125 rcu_read_lock(); 6128 6126 list_for_each_entry_rcu(tg, &task_groups, list) { 6129 6127 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; ··· 6153 6137 unthrottle_cfs_rq(cfs_rq); 6154 6138 } 6155 6139 rcu_read_unlock(); 6140 + 6141 + rq_clock_stop_loop_update(rq); 6156 6142 } 6157 6143 6158 6144 #else /* CONFIG_CFS_BANDWIDTH */ ··· 7220 7202 return target; 7221 7203 } 7222 7204 7223 - /* 7224 - * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu 7225 - * (@dst_cpu = -1) or migrated to @dst_cpu. 7205 + /** 7206 + * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. 7207 + * @cpu: the CPU to get the utilization for 7208 + * @p: task for which the CPU utilization should be predicted or NULL 7209 + * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL 7210 + * @boost: 1 to enable boosting, otherwise 0 7211 + * 7212 + * The unit of the return value must be the same as the one of CPU capacity 7213 + * so that CPU utilization can be compared with CPU capacity. 7214 + * 7215 + * CPU utilization is the sum of running time of runnable tasks plus the 7216 + * recent utilization of currently non-runnable tasks on that CPU. 7217 + * It represents the amount of CPU capacity currently used by CFS tasks in 7218 + * the range [0..max CPU capacity] with max CPU capacity being the CPU 7219 + * capacity at f_max. 7220 + * 7221 + * The estimated CPU utilization is defined as the maximum between CPU 7222 + * utilization and sum of the estimated utilization of the currently 7223 + * runnable tasks on that CPU. It preserves a utilization "snapshot" of 7224 + * previously-executed tasks, which helps better deduce how busy a CPU will 7225 + * be when a long-sleeping task wakes up. The contribution to CPU utilization 7226 + * of such a task would be significantly decayed at this point of time. 
7227 + * 7228 + * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). 7229 + * CPU contention for CFS tasks can be detected by CPU runnable > CPU 7230 + * utilization. Boosting is implemented in cpu_util() so that internal 7231 + * users (e.g. EAS) can use it next to external users (e.g. schedutil), 7232 + * latter via cpu_util_cfs_boost(). 7233 + * 7234 + * CPU utilization can be higher than the current CPU capacity 7235 + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because 7236 + * of rounding errors as well as task migrations or wakeups of new tasks. 7237 + * CPU utilization has to be capped to fit into the [0..max CPU capacity] 7238 + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) 7239 + * could be seen as over-utilized even though CPU1 has 20% of spare CPU 7240 + * capacity. CPU utilization is allowed to overshoot current CPU capacity 7241 + * though since this is useful for predicting the CPU capacity required 7242 + * after task migrations (scheduler-driven DVFS). 7243 + * 7244 + * Return: (Boosted) (estimated) utilization for the specified CPU. 7226 7245 */ 7227 - static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) 7246 + static unsigned long 7247 + cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) 7228 7248 { 7229 7249 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; 7230 7250 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); 7251 + unsigned long runnable; 7252 + 7253 + if (boost) { 7254 + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); 7255 + util = max(util, runnable); 7256 + } 7231 7257 7232 7258 /* 7233 7259 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its ··· 7279 7217 * contribution. In all the other cases @cpu is not impacted by the 7280 7218 * migration so its util_avg is already correct. 
7281 7219 */ 7282 - if (task_cpu(p) == cpu && dst_cpu != cpu) 7220 + if (p && task_cpu(p) == cpu && dst_cpu != cpu) 7283 7221 lsub_positive(&util, task_util(p)); 7284 - else if (task_cpu(p) != cpu && dst_cpu == cpu) 7222 + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) 7285 7223 util += task_util(p); 7286 7224 7287 7225 if (sched_feat(UTIL_EST)) { 7288 7226 unsigned long util_est; 7289 7227 7290 7228 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); 7229 + 7230 + if (boost) 7231 + util_est = max(util_est, runnable); 7291 7232 7292 7233 /* 7293 7234 * During wake-up @p isn't enqueued yet and doesn't contribute ··· 7320 7255 */ 7321 7256 if (dst_cpu == cpu) 7322 7257 util_est += _task_util_est(p); 7323 - else if (unlikely(task_on_rq_queued(p) || current == p)) 7258 + else if (p && unlikely(task_on_rq_queued(p) || current == p)) 7324 7259 lsub_positive(&util_est, _task_util_est(p)); 7325 7260 7326 7261 util = max(util, util_est); 7327 7262 } 7328 7263 7329 7264 return min(util, capacity_orig_of(cpu)); 7265 + } 7266 + 7267 + unsigned long cpu_util_cfs(int cpu) 7268 + { 7269 + return cpu_util(cpu, NULL, -1, 0); 7270 + } 7271 + 7272 + unsigned long cpu_util_cfs_boost(int cpu) 7273 + { 7274 + return cpu_util(cpu, NULL, -1, 1); 7330 7275 } 7331 7276 7332 7277 /* ··· 7356 7281 { 7357 7282 /* Task has no contribution or is new */ 7358 7283 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 7359 - return cpu_util_cfs(cpu); 7284 + p = NULL; 7360 7285 7361 - return cpu_util_next(cpu, p, -1); 7286 + return cpu_util(cpu, p, -1, 0); 7362 7287 } 7363 7288 7364 7289 /* ··· 7405 7330 * cpu_capacity. 7406 7331 * 7407 7332 * The contribution of the task @p for which we want to estimate the 7408 - * energy cost is removed (by cpu_util_next()) and must be calculated 7333 + * energy cost is removed (by cpu_util()) and must be calculated 7409 7334 * separately (see eenv_task_busy_time). 
This ensures: 7410 7335 * 7411 7336 * - A stable PD utilization, no matter which CPU of that PD we want to place ··· 7426 7351 int cpu; 7427 7352 7428 7353 for_each_cpu(cpu, pd_cpus) { 7429 - unsigned long util = cpu_util_next(cpu, p, -1); 7354 + unsigned long util = cpu_util(cpu, p, -1, 0); 7430 7355 7431 7356 busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); 7432 7357 } ··· 7450 7375 7451 7376 for_each_cpu(cpu, pd_cpus) { 7452 7377 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; 7453 - unsigned long util = cpu_util_next(cpu, p, dst_cpu); 7454 - unsigned long cpu_util; 7378 + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); 7379 + unsigned long eff_util; 7455 7380 7456 7381 /* 7457 7382 * Performance domain frequency: utilization clamping ··· 7460 7385 * NOTE: in case RT tasks are running, by default the 7461 7386 * FREQUENCY_UTIL's utilization can be max OPP. 7462 7387 */ 7463 - cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); 7464 - max_util = max(max_util, cpu_util); 7388 + eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); 7389 + max_util = max(max_util, eff_util); 7465 7390 } 7466 7391 7467 7392 return min(max_util, eenv->cpu_cap); ··· 7596 7521 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 7597 7522 continue; 7598 7523 7599 - util = cpu_util_next(cpu, p, cpu); 7524 + util = cpu_util(cpu, p, cpu, 0); 7600 7525 cpu_cap = capacity_of(cpu); 7601 7526 7602 7527 /* ··· 9406 9331 } 9407 9332 9408 9333 /** 9409 - * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks 9410 - * @dst_cpu: Destination CPU of the load balancing 9411 - * @sds: Load-balancing data with statistics of the local group 9412 - * @sgs: Load-balancing statistics of the candidate busiest group 9413 - * @sg: The candidate busiest group 9334 + * sched_use_asym_prio - Check whether asym_packing priority must be used 9335 + * @sd: The scheduling domain of the load balancing 9336 + * @cpu: A CPU 9414 9337 * 9415 - * Check the state of 
the SMT siblings of both @sds::local and @sg and decide 9416 - * if @dst_cpu can pull tasks. 9338 + * Always use CPU priority when balancing load between SMT siblings. When 9339 + * balancing load between cores, it is not sufficient that @cpu is idle. Only 9340 + * use CPU priority if the whole core is idle. 9417 9341 * 9418 - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of 9419 - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks 9420 - * only if @dst_cpu has higher priority. 9421 - * 9422 - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more 9423 - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. 9424 - * Bigger imbalances in the number of busy CPUs will be dealt with in 9425 - * update_sd_pick_busiest(). 9426 - * 9427 - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings 9428 - * of @dst_cpu are idle and @sg has lower priority. 9429 - * 9430 - * Return: true if @dst_cpu can pull tasks, false otherwise. 9342 + * Returns: True if the priority of @cpu must be followed. False otherwise. 9431 9343 */ 9432 - static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, 9433 - struct sg_lb_stats *sgs, 9434 - struct sched_group *sg) 9344 + static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) 9435 9345 { 9436 - #ifdef CONFIG_SCHED_SMT 9437 - bool local_is_smt, sg_is_smt; 9438 - int sg_busy_cpus; 9346 + if (!sched_smt_active()) 9347 + return true; 9439 9348 9440 - local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; 9441 - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; 9442 - 9443 - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; 9444 - 9445 - if (!local_is_smt) { 9446 - /* 9447 - * If we are here, @dst_cpu is idle and does not have SMT 9448 - * siblings. Pull tasks if candidate group has two or more 9449 - * busy CPUs. 
9450 - */ 9451 - if (sg_busy_cpus >= 2) /* implies sg_is_smt */ 9452 - return true; 9453 - 9454 - /* 9455 - * @dst_cpu does not have SMT siblings. @sg may have SMT 9456 - * siblings and only one is busy. In such case, @dst_cpu 9457 - * can help if it has higher priority and is idle (i.e., 9458 - * it has no running tasks). 9459 - */ 9460 - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); 9461 - } 9462 - 9463 - /* @dst_cpu has SMT siblings. */ 9464 - 9465 - if (sg_is_smt) { 9466 - int local_busy_cpus = sds->local->group_weight - 9467 - sds->local_stat.idle_cpus; 9468 - int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; 9469 - 9470 - if (busy_cpus_delta == 1) 9471 - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); 9472 - 9473 - return false; 9474 - } 9475 - 9476 - /* 9477 - * @sg does not have SMT siblings. Ensure that @sds::local does not end 9478 - * up with more than one busy SMT sibling and only pull tasks if there 9479 - * are not busy CPUs (i.e., no CPU has running tasks). 9480 - */ 9481 - if (!sds->local_stat.sum_nr_running) 9482 - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); 9483 - 9484 - return false; 9485 - #else 9486 - /* Always return false so that callers deal with non-SMT cases. */ 9487 - return false; 9488 - #endif 9349 + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); 9489 9350 } 9490 9351 9352 + /** 9353 + * sched_asym - Check if the destination CPU can do asym_packing load balance 9354 + * @env: The load balancing environment 9355 + * @sds: Load-balancing data with statistics of the local group 9356 + * @sgs: Load-balancing statistics of the candidate busiest group 9357 + * @group: The candidate busiest group 9358 + * 9359 + * @env::dst_cpu can do asym_packing if it has higher priority than the 9360 + * preferred CPU of @group. 9361 + * 9362 + * SMT is a special case. If we are balancing load between cores, @env::dst_cpu 9363 + * can do asym_packing balance only if all its SMT siblings are idle. 
Also, it 9364 + * can only do it if @group is an SMT group and has exactly on busy CPU. Larger 9365 + * imbalances in the number of CPUS are dealt with in find_busiest_group(). 9366 + * 9367 + * If we are balancing load within an SMT core, or at DIE domain level, always 9368 + * proceed. 9369 + * 9370 + * Return: true if @env::dst_cpu can do with asym_packing load balance. False 9371 + * otherwise. 9372 + */ 9491 9373 static inline bool 9492 9374 sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, 9493 9375 struct sched_group *group) 9494 9376 { 9495 - /* Only do SMT checks if either local or candidate have SMT siblings */ 9496 - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || 9497 - (group->flags & SD_SHARE_CPUCAPACITY)) 9498 - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); 9377 + /* Ensure that the whole local core is idle, if applicable. */ 9378 + if (!sched_use_asym_prio(env->sd, env->dst_cpu)) 9379 + return false; 9380 + 9381 + /* 9382 + * CPU priorities does not make sense for SMT cores with more than one 9383 + * busy sibling. 9384 + */ 9385 + if (group->flags & SD_SHARE_CPUCAPACITY) { 9386 + if (sgs->group_weight - sgs->idle_cpus != 1) 9387 + return false; 9388 + } 9499 9389 9500 9390 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); 9501 9391 } ··· 9650 9610 * contention when accessing shared HW resources. 9651 9611 * 9652 9612 * XXX for now avg_load is not computed and always 0 so we 9653 - * select the 1st one. 9613 + * select the 1st one, except if @sg is composed of SMT 9614 + * siblings. 9654 9615 */ 9655 - if (sgs->avg_load <= busiest->avg_load) 9616 + 9617 + if (sgs->avg_load < busiest->avg_load) 9656 9618 return false; 9619 + 9620 + if (sgs->avg_load == busiest->avg_load) { 9621 + /* 9622 + * SMT sched groups need more help than non-SMT groups. 9623 + * If @sg happens to also be SMT, either choice is good. 
9624 + */ 9625 + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) 9626 + return false; 9627 + } 9628 + 9657 9629 break; 9658 9630 9659 9631 case group_has_spare: ··· 10140 10088 10141 10089 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) 10142 10090 { 10143 - struct sched_domain *child = env->sd->child; 10144 10091 struct sched_group *sg = env->sd->groups; 10145 10092 struct sg_lb_stats *local = &sds->local_stat; 10146 10093 struct sg_lb_stats tmp_sgs; ··· 10180 10129 sg = sg->next; 10181 10130 } while (sg != env->sd->groups); 10182 10131 10183 - /* Tag domain that child domain prefers tasks go to siblings first */ 10184 - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; 10132 + /* 10133 + * Indicate that the child domain of the busiest group prefers tasks 10134 + * go to a child's sibling domains first. NB the flags of a sched group 10135 + * are those of the child domain. 10136 + */ 10137 + if (sds->busiest) 10138 + sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); 10185 10139 10186 10140 10187 10141 if (env->sd->flags & SD_NUMA) ··· 10496 10440 goto out_balanced; 10497 10441 } 10498 10442 10499 - /* Try to move all excess tasks to child's sibling domain */ 10443 + /* 10444 + * Try to move all excess tasks to a sibling domain of the busiest 10445 + * group's child domain. 10446 + */ 10500 10447 if (sds.prefer_sibling && local->group_type == group_has_spare && 10501 10448 busiest->sum_nr_running > local->sum_nr_running + 1) 10502 10449 goto force_balance; ··· 10601 10542 nr_running == 1) 10602 10543 continue; 10603 10544 10604 - /* Make sure we only pull tasks from a CPU of lower priority */ 10545 + /* 10546 + * Make sure we only pull tasks from a CPU of lower priority 10547 + * when balancing between SMT siblings. 10548 + * 10549 + * If balancing between cores, let lower priority CPUs help 10550 + * SMT cores with more than one busy sibling. 
10551 + */ 10605 10552 if ((env->sd->flags & SD_ASYM_PACKING) && 10553 + sched_use_asym_prio(env->sd, i) && 10606 10554 sched_asym_prefer(i, env->dst_cpu) && 10607 10555 nr_running == 1) 10608 10556 continue; ··· 10647 10581 break; 10648 10582 10649 10583 case migrate_util: 10650 - util = cpu_util_cfs(i); 10584 + util = cpu_util_cfs_boost(i); 10651 10585 10652 10586 /* 10653 10587 * Don't try to pull utilization from a CPU with one ··· 10698 10632 asym_active_balance(struct lb_env *env) 10699 10633 { 10700 10634 /* 10701 - * ASYM_PACKING needs to force migrate tasks from busy but 10702 - * lower priority CPUs in order to pack all tasks in the 10703 - * highest priority CPUs. 10635 + * ASYM_PACKING needs to force migrate tasks from busy but lower 10636 + * priority CPUs in order to pack all tasks in the highest priority 10637 + * CPUs. When done between cores, do it only if the whole core if the 10638 + * whole core is idle. 10639 + * 10640 + * If @env::src_cpu is an SMT core with busy siblings, let 10641 + * the lower priority @env::dst_cpu help it. Do not follow 10642 + * CPU priority. 10704 10643 */ 10705 10644 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && 10706 - sched_asym_prefer(env->dst_cpu, env->src_cpu); 10645 + sched_use_asym_prio(env->sd, env->dst_cpu) && 10646 + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || 10647 + !sched_use_asym_prio(env->sd, env->src_cpu)); 10707 10648 } 10708 10649 10709 10650 static inline bool ··· 10817 10744 .sd = sd, 10818 10745 .dst_cpu = this_cpu, 10819 10746 .dst_rq = this_rq, 10820 - .dst_grpmask = sched_group_span(sd->groups), 10747 + .dst_grpmask = group_balance_mask(sd->groups), 10821 10748 .idle = idle, 10822 10749 .loop_break = SCHED_NR_MIGRATE_BREAK, 10823 10750 .cpus = cpus, ··· 11444 11371 * When ASYM_PACKING; see if there's a more preferred CPU 11445 11372 * currently idle; in which case, kick the ILB to move tasks 11446 11373 * around. 
11374 + * 11375 + * When balancing betwen cores, all the SMT siblings of the 11376 + * preferred CPU must be idle. 11447 11377 */ 11448 11378 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { 11449 - if (sched_asym_prefer(i, cpu)) { 11379 + if (sched_use_asym_prio(sd, i) && 11380 + sched_asym_prefer(i, cpu)) { 11450 11381 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; 11451 11382 goto unlock; 11452 11383 }
+11 -8
kernel/sched/psi.c
··· 160 160 #define EXP_300s 2034 /* 1/exp(2s/300s) */ 161 161 162 162 /* PSI trigger definitions */ 163 - #define WINDOW_MIN_US 500000 /* Min window size is 500ms */ 164 163 #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ 165 164 #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ 166 165 ··· 1304 1305 if (state >= PSI_NONIDLE) 1305 1306 return ERR_PTR(-EINVAL); 1306 1307 1307 - if (window_us < WINDOW_MIN_US || 1308 - window_us > WINDOW_MAX_US) 1308 + if (window_us == 0 || window_us > WINDOW_MAX_US) 1309 1309 return ERR_PTR(-EINVAL); 1310 1310 1311 1311 /* ··· 1407 1409 group->rtpoll_nr_triggers[t->state]--; 1408 1410 if (!group->rtpoll_nr_triggers[t->state]) 1409 1411 group->rtpoll_states &= ~(1 << t->state); 1410 - /* reset min update period for the remaining triggers */ 1411 - list_for_each_entry(tmp, &group->rtpoll_triggers, node) 1412 - period = min(period, div_u64(tmp->win.size, 1413 - UPDATES_PER_WINDOW)); 1414 - group->rtpoll_min_period = period; 1412 + /* 1413 + * Reset min update period for the remaining triggers 1414 + * iff the destroying trigger had the min window size. 1415 + */ 1416 + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { 1417 + list_for_each_entry(tmp, &group->rtpoll_triggers, node) 1418 + period = min(period, div_u64(tmp->win.size, 1419 + UPDATES_PER_WINDOW)); 1420 + group->rtpoll_min_period = period; 1421 + } 1415 1422 /* Destroy rtpoll_task when the last trigger is destroyed */ 1416 1423 if (group->rtpoll_states == 0) { 1417 1424 group->rtpoll_until = 0;
+49 -56
kernel/sched/sched.h
··· 286 286 287 287 void __dl_clear_params(struct task_struct *p); 288 288 289 - struct dl_bandwidth { 290 - raw_spinlock_t dl_runtime_lock; 291 - u64 dl_runtime; 292 - u64 dl_period; 293 - }; 294 - 295 289 static inline int dl_bandwidth_enabled(void) 296 290 { 297 291 return sysctl_sched_rt_runtime >= 0; ··· 746 752 */ 747 753 u64 this_bw; 748 754 u64 extra_bw; 755 + 756 + /* 757 + * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM 758 + * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB). 759 + */ 760 + u64 max_bw; 749 761 750 762 /* 751 763 * Inverse of the fraction of CPU utilization that can be reclaimed ··· 1546 1546 rq->clock_update_flags &= ~RQCF_REQ_SKIP; 1547 1547 } 1548 1548 1549 + /* 1550 + * During cpu offlining and rq wide unthrottling, we can trigger 1551 + * an update_rq_clock() for several cfs and rt runqueues (Typically 1552 + * when using list_for_each_entry_*) 1553 + * rq_clock_start_loop_update() can be called after updating the clock 1554 + * once and before iterating over the list to prevent multiple update. 1555 + * After the iterative traversal, we need to call rq_clock_stop_loop_update() 1556 + * to clear RQCF_ACT_SKIP of rq->clock_update_flags. 
1557 + */ 1558 + static inline void rq_clock_start_loop_update(struct rq *rq) 1559 + { 1560 + lockdep_assert_rq_held(rq); 1561 + SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); 1562 + rq->clock_update_flags |= RQCF_ACT_SKIP; 1563 + } 1564 + 1565 + static inline void rq_clock_stop_loop_update(struct rq *rq) 1566 + { 1567 + lockdep_assert_rq_held(rq); 1568 + rq->clock_update_flags &= ~RQCF_ACT_SKIP; 1569 + } 1570 + 1549 1571 struct rq_flags { 1550 1572 unsigned long flags; 1551 1573 struct pin_cookie cookie; ··· 1794 1772 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ 1795 1773 __sd; __sd = __sd->parent) 1796 1774 1775 + /* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ 1776 + #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | 1777 + static const unsigned int SD_SHARED_CHILD_MASK = 1778 + #include <linux/sched/sd_flags.h> 1779 + 0; 1780 + #undef SD_FLAG 1781 + 1797 1782 /** 1798 1783 * highest_flag_domain - Return highest sched_domain containing flag. 1799 1784 * @cpu: The CPU whose highest level of sched domain is to ··· 1808 1779 * @flag: The flag to check for the highest sched_domain 1809 1780 * for the given CPU. 1810 1781 * 1811 - * Returns the highest sched_domain of a CPU which contains the given flag. 1782 + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has 1783 + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. 1812 1784 */ 1813 1785 static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1814 1786 { 1815 1787 struct sched_domain *sd, *hsd = NULL; 1816 1788 1817 1789 for_each_domain(cpu, sd) { 1818 - if (!(sd->flags & flag)) 1790 + if (sd->flags & flag) { 1791 + hsd = sd; 1792 + continue; 1793 + } 1794 + 1795 + /* 1796 + * Stop the search if @flag is known to be shared at lower 1797 + * levels. It will not be found further up. 
1798 + */ 1799 + if (flag & SD_SHARED_CHILD_MASK) 1819 1800 break; 1820 - hsd = sd; 1821 1801 } 1822 1802 1823 1803 return hsd; ··· 2416 2378 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 2417 2379 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 2418 2380 2419 - extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); 2420 2381 extern void init_dl_task_timer(struct sched_dl_entity *dl_se); 2421 2382 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 2422 2383 ··· 2983 2946 return READ_ONCE(rq->avg_dl.util_avg); 2984 2947 } 2985 2948 2986 - /** 2987 - * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. 2988 - * @cpu: the CPU to get the utilization for. 2989 - * 2990 - * The unit of the return value must be the same as the one of CPU capacity 2991 - * so that CPU utilization can be compared with CPU capacity. 2992 - * 2993 - * CPU utilization is the sum of running time of runnable tasks plus the 2994 - * recent utilization of currently non-runnable tasks on that CPU. 2995 - * It represents the amount of CPU capacity currently used by CFS tasks in 2996 - * the range [0..max CPU capacity] with max CPU capacity being the CPU 2997 - * capacity at f_max. 2998 - * 2999 - * The estimated CPU utilization is defined as the maximum between CPU 3000 - * utilization and sum of the estimated utilization of the currently 3001 - * runnable tasks on that CPU. It preserves a utilization "snapshot" of 3002 - * previously-executed tasks, which helps better deduce how busy a CPU will 3003 - * be when a long-sleeping task wakes up. The contribution to CPU utilization 3004 - * of such a task would be significantly decayed at this point of time. 
3005 - * 3006 - * CPU utilization can be higher than the current CPU capacity 3007 - * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because 3008 - * of rounding errors as well as task migrations or wakeups of new tasks. 3009 - * CPU utilization has to be capped to fit into the [0..max CPU capacity] 3010 - * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) 3011 - * could be seen as over-utilized even though CPU1 has 20% of spare CPU 3012 - * capacity. CPU utilization is allowed to overshoot current CPU capacity 3013 - * though since this is useful for predicting the CPU capacity required 3014 - * after task migrations (scheduler-driven DVFS). 3015 - * 3016 - * Return: (Estimated) utilization for the specified CPU. 3017 - */ 3018 - static inline unsigned long cpu_util_cfs(int cpu) 3019 - { 3020 - struct cfs_rq *cfs_rq; 3021 - unsigned long util; 3022 2949 3023 - cfs_rq = &cpu_rq(cpu)->cfs; 3024 - util = READ_ONCE(cfs_rq->avg.util_avg); 3025 - 3026 - if (sched_feat(UTIL_EST)) { 3027 - util = max_t(unsigned long, util, 3028 - READ_ONCE(cfs_rq->avg.util_est.enqueued)); 3029 - } 3030 - 3031 - return min(util, capacity_orig_of(cpu)); 3032 - } 2950 + extern unsigned long cpu_util_cfs(int cpu); 2951 + extern unsigned long cpu_util_cfs_boost(int cpu); 3033 2952 3034 2953 static inline unsigned long cpu_util_rt(struct rq *rq) 3035 2954 {
+10 -5
kernel/sched/topology.c
··· 487 487 void rq_attach_root(struct rq *rq, struct root_domain *rd) 488 488 { 489 489 struct root_domain *old_rd = NULL; 490 - unsigned long flags; 490 + struct rq_flags rf; 491 491 492 - raw_spin_rq_lock_irqsave(rq, flags); 492 + rq_lock_irqsave(rq, &rf); 493 493 494 494 if (rq->rd) { 495 495 old_rd = rq->rd; ··· 515 515 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 516 516 set_rq_online(rq); 517 517 518 - raw_spin_rq_unlock_irqrestore(rq, flags); 518 + rq_unlock_irqrestore(rq, &rf); 519 519 520 520 if (old_rd) 521 521 call_rcu(&old_rd->rcu, free_rootdomain); ··· 719 719 720 720 if (sd_parent_degenerate(tmp, parent)) { 721 721 tmp->parent = parent->parent; 722 - if (parent->parent) 722 + 723 + if (parent->parent) { 723 724 parent->parent->child = tmp; 725 + if (tmp->flags & SD_SHARE_CPUCAPACITY) 726 + parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY; 727 + } 728 + 724 729 /* 725 730 * Transfer SD_PREFER_SIBLING down in case of a 726 731 * degenerate parent; the spans match for this ··· 1681 1676 #define for_each_sd_topology(tl) \ 1682 1677 for (tl = sched_domain_topology; tl->mask; tl++) 1683 1678 1684 - void set_sched_topology(struct sched_domain_topology_level *tl) 1679 + void __init set_sched_topology(struct sched_domain_topology_level *tl) 1685 1680 { 1686 1681 if (WARN_ON_ONCE(sched_smp_initialized)) 1687 1682 return;
+1 -6
kernel/sched/wait.c
··· 425 425 } 426 426 EXPORT_SYMBOL(autoremove_wake_function); 427 427 428 - static inline bool is_kthread_should_stop(void) 429 - { 430 - return (current->flags & PF_KTHREAD) && kthread_should_stop(); 431 - } 432 - 433 428 /* 434 429 * DEFINE_WAIT_FUNC(wait, woken_wake_func); 435 430 * ··· 454 459 * or woken_wake_function() sees our store to current->state. 455 460 */ 456 461 set_current_state(mode); /* A */ 457 - if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) 462 + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park()) 458 463 timeout = schedule_timeout(timeout); 459 464 __set_current_state(TASK_RUNNING); 460 465
+17 -7
kernel/time/sched_clock.c
··· 64 64 .actual_read_sched_clock = jiffy_sched_clock_read, 65 65 }; 66 66 67 - static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 67 + static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) 68 68 { 69 69 return (cyc * mult) >> shift; 70 70 } ··· 77 77 78 78 notrace int sched_clock_read_retry(unsigned int seq) 79 79 { 80 - return read_seqcount_latch_retry(&cd.seq, seq); 80 + return raw_read_seqcount_latch_retry(&cd.seq, seq); 81 81 } 82 82 83 - unsigned long long notrace sched_clock(void) 83 + unsigned long long noinstr sched_clock_noinstr(void) 84 84 { 85 - u64 cyc, res; 86 - unsigned int seq; 87 85 struct clock_read_data *rd; 86 + unsigned int seq; 87 + u64 cyc, res; 88 88 89 89 do { 90 - rd = sched_clock_read_begin(&seq); 90 + seq = raw_read_seqcount_latch(&cd.seq); 91 + rd = cd.read_data + (seq & 1); 91 92 92 93 cyc = (rd->read_sched_clock() - rd->epoch_cyc) & 93 94 rd->sched_clock_mask; 94 95 res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); 95 - } while (sched_clock_read_retry(seq)); 96 + } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); 96 97 97 98 return res; 99 + } 100 + 101 + unsigned long long notrace sched_clock(void) 102 + { 103 + unsigned long long ns; 104 + preempt_disable_notrace(); 105 + ns = sched_clock_noinstr(); 106 + preempt_enable_notrace(); 107 + return ns; 98 108 } 99 109 100 110 /*
+2 -2
kernel/time/timekeeping.c
··· 450 450 tkr = tkf->base + (seq & 0x01); 451 451 now = ktime_to_ns(tkr->base); 452 452 now += fast_tk_get_delta_ns(tkr); 453 - } while (read_seqcount_latch_retry(&tkf->seq, seq)); 453 + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); 454 454 455 455 return now; 456 456 } ··· 566 566 basem = ktime_to_ns(tkr->base); 567 567 baser = ktime_to_ns(tkr->base_real); 568 568 delta = fast_tk_get_delta_ns(tkr); 569 - } while (read_seqcount_latch_retry(&tkf->seq, seq)); 569 + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); 570 570 571 571 if (mono) 572 572 *mono = basem + delta;