Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Core scheduler changes:

- Make migrate_{en,dis}able() inline, to improve performance
(Menglong Dong)

- Move SDTL_INIT() functions out-of-line (Peter Zijlstra)

- Unify the SCHED_{SMT,CLUSTER,MC} Kconfig (Peter Zijlstra)

Fair scheduling:

- Defer throttling to when tasks exit to user-space, to reduce the
chance & impact of throttle-preemption with held locks and other
resources (Aaron Lu, Valentin Schneider)

- Get rid of sched_domains_curr_level hack for tl->cpumask(), as the
warning was getting triggered on certain topologies (Peter
Zijlstra)

Misc cleanups & fixes:

- Header cleanups (Menglong Dong)

- Fix race in push_dl_task() (Harshit Agarwal)"

* tag 'sched-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Fix some typos in include/linux/preempt.h
sched: Make migrate_{en,dis}able() inline
rcu: Replace preempt.h with sched.h in include/linux/rcupdate.h
arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c
sched/fair: Do not balance task to a throttled cfs_rq
sched/fair: Do not special case tasks in throttled hierarchy
sched/fair: update_cfs_group() for throttled cfs_rqs
sched/fair: Propagate load for throttled cfs_rq
sched/fair: Get rid of throttled_lb_pair()
sched/fair: Task based throttle time accounting
sched/fair: Switch to task based throttle model
sched/fair: Implement throttle task work and related helpers
sched/fair: Add related data structure for task based throttle
sched: Unify the SCHED_{SMT,CLUSTER,MC} Kconfig
sched: Move SDTL_INIT() functions out-of-line
sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask()
sched/deadline: Fix race in push_dl_task()

+696 -506
+12 -1
Kbuild
··· 34 34 $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE 35 35 $(call filechk,offsets,__ASM_OFFSETS_H__) 36 36 37 + # Generate rq-offsets.h 38 + 39 + rq-offsets-file := include/generated/rq-offsets.h 40 + 41 + targets += kernel/sched/rq-offsets.s 42 + 43 + kernel/sched/rq-offsets.s: $(offsets-file) 44 + 45 + $(rq-offsets-file): kernel/sched/rq-offsets.s FORCE 46 + $(call filechk,offsets,__RQ_OFFSETS_H__) 47 + 37 48 # Check for missing system calls 38 49 39 50 quiet_cmd_syscalls = CALL $< 40 51 cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags) 41 52 42 53 PHONY += missing-syscalls 43 - missing-syscalls: scripts/checksyscalls.sh $(offsets-file) 54 + missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file) 44 55 $(call cmd,syscalls) 45 56 46 57 # Check the manual modification of atomic headers
+38
arch/Kconfig
··· 41 41 config SMT_NUM_THREADS_DYNAMIC 42 42 bool 43 43 44 + config ARCH_SUPPORTS_SCHED_SMT 45 + bool 46 + 47 + config ARCH_SUPPORTS_SCHED_CLUSTER 48 + bool 49 + 50 + config ARCH_SUPPORTS_SCHED_MC 51 + bool 52 + 53 + config SCHED_SMT 54 + bool "SMT (Hyperthreading) scheduler support" 55 + depends on ARCH_SUPPORTS_SCHED_SMT 56 + default y 57 + help 58 + Improves the CPU scheduler's decision making when dealing with 59 + MultiThreading at a cost of slightly increased overhead in some 60 + places. If unsure say N here. 61 + 62 + config SCHED_CLUSTER 63 + bool "Cluster scheduler support" 64 + depends on ARCH_SUPPORTS_SCHED_CLUSTER 65 + default y 66 + help 67 + Cluster scheduler support improves the CPU scheduler's decision 68 + making when dealing with machines that have clusters of CPUs. 69 + Cluster usually means a couple of CPUs which are placed closely 70 + by sharing mid-level caches, last-level cache tags or internal 71 + busses. 72 + 73 + config SCHED_MC 74 + bool "Multi-Core Cache (MC) scheduler support" 75 + depends on ARCH_SUPPORTS_SCHED_MC 76 + default y 77 + help 78 + Multi-core scheduler support improves the CPU scheduler's decision 79 + making when dealing with multi-core CPU chips at a cost of slightly 80 + increased overhead in some places. If unsure say N here. 81 + 44 82 # Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL 45 83 config HOTPLUG_CORE_SYNC 46 84 bool
+1
arch/alpha/kernel/asm-offsets.c
··· 4 4 * This code generates raw asm output which is post-processed to extract 5 5 * and format the required data. 6 6 */ 7 + #define COMPILE_OFFSETS 7 8 8 9 #include <linux/types.h> 9 10 #include <linux/stddef.h>
+1
arch/arc/kernel/asm-offsets.c
··· 2 2 /* 3 3 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) 4 4 */ 5 + #define COMPILE_OFFSETS 5 6 6 7 #include <linux/sched.h> 7 8 #include <linux/mm.h>
+2 -16
arch/arm/Kconfig
··· 941 941 config ARM_CPU_TOPOLOGY 942 942 bool "Support cpu topology definition" 943 943 depends on SMP && CPU_V7 944 + select ARCH_SUPPORTS_SCHED_MC 945 + select ARCH_SUPPORTS_SCHED_SMT 944 946 default y 945 947 help 946 948 Support ARM cpu topology definition. The MPIDR register defines 947 949 affinity between processors which is then used to describe the cpu 948 950 topology of an ARM System. 949 - 950 - config SCHED_MC 951 - bool "Multi-core scheduler support" 952 - depends on ARM_CPU_TOPOLOGY 953 - help 954 - Multi-core scheduler support improves the CPU scheduler's decision 955 - making when dealing with multi-core CPU chips at a cost of slightly 956 - increased overhead in some places. If unsure say N here. 957 - 958 - config SCHED_SMT 959 - bool "SMT scheduler support" 960 - depends on ARM_CPU_TOPOLOGY 961 - help 962 - Improves the CPU scheduler's decision making when dealing with 963 - MultiThreading at a cost of slightly increased overhead in some 964 - places. If unsure say N here. 965 951 966 952 config HAVE_ARM_SCU 967 953 bool
+2
arch/arm/kernel/asm-offsets.c
··· 7 7 * This code generates raw asm output which is post-processed to extract 8 8 * and format the required data. 9 9 */ 10 + #define COMPILE_OFFSETS 11 + 10 12 #include <linux/compiler.h> 11 13 #include <linux/sched.h> 12 14 #include <linux/mm.h>
+3 -23
arch/arm64/Kconfig
··· 108 108 select ARCH_SUPPORTS_PER_VMA_LOCK 109 109 select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE 110 110 select ARCH_SUPPORTS_RT 111 + select ARCH_SUPPORTS_SCHED_SMT 112 + select ARCH_SUPPORTS_SCHED_CLUSTER 113 + select ARCH_SUPPORTS_SCHED_MC 111 114 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 112 115 select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT 113 116 select ARCH_WANT_DEFAULT_BPF_JIT ··· 1509 1506 This is usually the case for distributions targeting arm64. 1510 1507 1511 1508 endchoice 1512 - 1513 - config SCHED_MC 1514 - bool "Multi-core scheduler support" 1515 - help 1516 - Multi-core scheduler support improves the CPU scheduler's decision 1517 - making when dealing with multi-core CPU chips at a cost of slightly 1518 - increased overhead in some places. If unsure say N here. 1519 - 1520 - config SCHED_CLUSTER 1521 - bool "Cluster scheduler support" 1522 - help 1523 - Cluster scheduler support improves the CPU scheduler's decision 1524 - making when dealing with machines that have clusters of CPUs. 1525 - Cluster usually means a couple of CPUs which are placed closely 1526 - by sharing mid-level caches, last-level cache tags or internal 1527 - busses. 1528 - 1529 - config SCHED_SMT 1530 - bool "SMT scheduler support" 1531 - help 1532 - Improves the CPU scheduler's decision making when dealing with 1533 - MultiThreading at a cost of slightly increased overhead in some 1534 - places. If unsure say N here. 1535 1509 1536 1510 config NR_CPUS 1537 1511 int "Maximum number of CPUs (2-4096)"
+1
arch/arm64/kernel/asm-offsets.c
··· 6 6 * 2001-2002 Keith Owens 7 7 * Copyright (C) 2012 ARM Ltd. 8 8 */ 9 + #define COMPILE_OFFSETS 9 10 10 11 #include <linux/arm_sdei.h> 11 12 #include <linux/sched.h>
+1
arch/csky/kernel/asm-offsets.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 3 + #define COMPILE_OFFSETS 3 4 4 5 #include <linux/sched.h> 5 6 #include <linux/kernel_stat.h>
+1
arch/hexagon/kernel/asm-offsets.c
··· 8 8 * 9 9 * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. 10 10 */ 11 + #define COMPILE_OFFSETS 11 12 12 13 #include <linux/compat.h> 13 14 #include <linux/types.h>
+2 -17
arch/loongarch/Kconfig
··· 70 70 select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS 71 71 select ARCH_SUPPORTS_NUMA_BALANCING 72 72 select ARCH_SUPPORTS_RT 73 + select ARCH_SUPPORTS_SCHED_SMT if SMP 74 + select ARCH_SUPPORTS_SCHED_MC if SMP 73 75 select ARCH_USE_BUILTIN_BSWAP 74 76 select ARCH_USE_CMPXCHG_LOCKREF 75 77 select ARCH_USE_MEMTEST ··· 453 451 help 454 452 This kernel feature allows the kernel to be loaded directly by 455 453 EFI firmware without the use of a bootloader. 456 - 457 - config SCHED_SMT 458 - bool "SMT scheduler support" 459 - depends on SMP 460 - default y 461 - help 462 - Improves scheduler's performance when there are multiple 463 - threads in one physical core. 464 - 465 - config SCHED_MC 466 - bool "Multi-core scheduler support" 467 - depends on SMP 468 - default y 469 - help 470 - Multi-core scheduler support improves the CPU scheduler's decision 471 - making when dealing with multi-core CPU chips at a cost of slightly 472 - increased overhead in some places. 473 454 474 455 config SMP 475 456 bool "Multi-Processing support"
+2
arch/loongarch/kernel/asm-offsets.c
··· 4 4 * 5 5 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited 6 6 */ 7 + #define COMPILE_OFFSETS 8 + 7 9 #include <linux/types.h> 8 10 #include <linux/sched.h> 9 11 #include <linux/mm.h>
+1
arch/m68k/kernel/asm-offsets.c
··· 9 9 * #defines from the assembly-language output. 10 10 */ 11 11 12 + #define COMPILE_OFFSETS 12 13 #define ASM_OFFSETS_C 13 14 14 15 #include <linux/stddef.h>
+1
arch/microblaze/kernel/asm-offsets.c
··· 7 7 * License. See the file "COPYING" in the main directory of this archive 8 8 * for more details. 9 9 */ 10 + #define COMPILE_OFFSETS 10 11 11 12 #include <linux/init.h> 12 13 #include <linux/stddef.h>
+2 -14
arch/mips/Kconfig
··· 2223 2223 select SMP 2224 2224 select SMP_UP 2225 2225 select SYS_SUPPORTS_SMP 2226 - select SYS_SUPPORTS_SCHED_SMT 2226 + select ARCH_SUPPORTS_SCHED_SMT 2227 2227 select MIPS_PERF_SHARED_TC_COUNTERS 2228 2228 help 2229 2229 This is a kernel model which is known as SMVP. This is supported ··· 2233 2233 <http://www.imgtec.com/mips/mips-multithreading.asp>. 2234 2234 2235 2235 config MIPS_MT 2236 - bool 2237 - 2238 - config SCHED_SMT 2239 - bool "SMT (multithreading) scheduler support" 2240 - depends on SYS_SUPPORTS_SCHED_SMT 2241 - default n 2242 - help 2243 - SMT scheduler support improves the CPU scheduler's decision making 2244 - when dealing with MIPS MT enabled cores at a cost of slightly 2245 - increased overhead in some places. If unsure say N here. 2246 - 2247 - config SYS_SUPPORTS_SCHED_SMT 2248 2236 bool 2249 2237 2250 2238 config SYS_SUPPORTS_MULTITHREADING ··· 2306 2318 select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU 2307 2319 select SYNC_R4K if (CEVT_R4K || CSRC_R4K) 2308 2320 select SYS_SUPPORTS_HOTPLUG_CPU 2309 - select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6 2321 + select ARCH_SUPPORTS_SCHED_SMT if CPU_MIPSR6 2310 2322 select SYS_SUPPORTS_SMP 2311 2323 select WEAK_ORDERING 2312 2324 select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU
+2
arch/mips/kernel/asm-offsets.c
··· 9 9 * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com 10 10 * Copyright (C) 2000 MIPS Technologies, Inc. 11 11 */ 12 + #define COMPILE_OFFSETS 13 + 12 14 #include <linux/compat.h> 13 15 #include <linux/types.h> 14 16 #include <linux/sched.h>
+1
arch/nios2/kernel/asm-offsets.c
··· 2 2 /* 3 3 * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch> 4 4 */ 5 + #define COMPILE_OFFSETS 5 6 6 7 #include <linux/stddef.h> 7 8 #include <linux/sched.h>
+1
arch/openrisc/kernel/asm-offsets.c
··· 18 18 * compile this file to assembler, and then extract the 19 19 * #defines from the assembly-language output. 20 20 */ 21 + #define COMPILE_OFFSETS 21 22 22 23 #include <linux/signal.h> 23 24 #include <linux/sched.h>
+1 -8
arch/parisc/Kconfig
··· 44 44 select ARCH_HAVE_NMI_SAFE_CMPXCHG 45 45 select GENERIC_SMP_IDLE_THREAD 46 46 select GENERIC_ARCH_TOPOLOGY if SMP 47 + select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00 47 48 select GENERIC_CPU_DEVICES if !SMP 48 49 select GENERIC_LIB_DEVMEM_IS_ALLOWED 49 50 select SYSCTL_ARCH_UNALIGN_ALLOW ··· 319 318 available at <https://www.tldp.org/docs.html#howto>. 320 319 321 320 If you don't know what to do here, say N. 322 - 323 - config SCHED_MC 324 - bool "Multi-core scheduler support" 325 - depends on GENERIC_ARCH_TOPOLOGY && PA8X00 326 - help 327 - Multi-core scheduler support improves the CPU scheduler's decision 328 - making when dealing with multi-core CPU chips at a cost of slightly 329 - increased overhead in some places. If unsure say N here. 330 321 331 322 config IRQSTACKS 332 323 bool "Use separate kernel stacks when processing interrupts"
+1
arch/parisc/kernel/asm-offsets.c
··· 13 13 * Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org> 14 14 * Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org> 15 15 */ 16 + #define COMPILE_OFFSETS 16 17 17 18 #include <linux/types.h> 18 19 #include <linux/sched.h>
+3 -8
arch/powerpc/Kconfig
··· 170 170 select ARCH_STACKWALK 171 171 select ARCH_SUPPORTS_ATOMIC_RMW 172 172 select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx 173 + select ARCH_SUPPORTS_SCHED_MC if SMP 174 + select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP 175 + select SCHED_MC if ARCH_SUPPORTS_SCHED_MC 173 176 select ARCH_USE_BUILTIN_BSWAP 174 177 select ARCH_USE_CMPXCHG_LOCKREF if PPC64 175 178 select ARCH_USE_MEMTEST ··· 967 964 968 965 config PPC_COPRO_BASE 969 966 bool 970 - 971 - config SCHED_SMT 972 - bool "SMT (Hyperthreading) scheduler support" 973 - depends on PPC64 && SMP 974 - help 975 - SMT scheduler support improves the CPU scheduler's decision making 976 - when dealing with POWER5 cpus at a cost of slightly increased 977 - overhead in some places. If unsure say N here. 978 967 979 968 config PPC_DENORMALISATION 980 969 bool "PowerPC denormalisation exception handling"
+2
arch/powerpc/include/asm/topology.h
··· 131 131 #ifdef CONFIG_SMP 132 132 #include <asm/cputable.h> 133 133 134 + struct cpumask *cpu_coregroup_mask(int cpu); 135 + 134 136 #ifdef CONFIG_PPC64 135 137 #include <asm/smp.h> 136 138
+1
arch/powerpc/kernel/asm-offsets.c
··· 8 8 * compile this file to assembler, and then extract the 9 9 * #defines from the assembly-language output. 10 10 */ 11 + #define COMPILE_OFFSETS 11 12 12 13 #include <linux/compat.h> 13 14 #include <linux/signal.h>
+11 -16
arch/powerpc/kernel/smp.c
··· 1028 1028 * We can't just pass cpu_l2_cache_mask() directly because 1029 1029 * returns a non-const pointer and the compiler barfs on that. 1030 1030 */ 1031 - static const struct cpumask *shared_cache_mask(int cpu) 1031 + static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) 1032 1032 { 1033 1033 return per_cpu(cpu_l2_cache_map, cpu); 1034 1034 } 1035 1035 1036 1036 #ifdef CONFIG_SCHED_SMT 1037 - static const struct cpumask *smallcore_smt_mask(int cpu) 1037 + static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) 1038 1038 { 1039 1039 return cpu_smallcore_mask(cpu); 1040 1040 } 1041 1041 #endif 1042 1042 1043 - static struct cpumask *cpu_coregroup_mask(int cpu) 1043 + struct cpumask *cpu_coregroup_mask(int cpu) 1044 1044 { 1045 1045 return per_cpu(cpu_coregroup_map, cpu); 1046 1046 } ··· 1052 1052 return 0; 1053 1053 1054 1054 return coregroup_enabled; 1055 - } 1056 - 1057 - static const struct cpumask *cpu_mc_mask(int cpu) 1058 - { 1059 - return cpu_coregroup_mask(cpu); 1060 1055 } 1061 1056 1062 1057 static int __init init_big_cores(void) ··· 1443 1448 return false; 1444 1449 } 1445 1450 1446 - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); 1451 + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); 1447 1452 1448 1453 /* Update l2-cache mask with all the CPUs that are part of submask */ 1449 1454 or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); ··· 1533 1538 return; 1534 1539 } 1535 1540 1536 - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); 1541 + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); 1537 1542 1538 1543 /* Update coregroup mask with all the CPUs that are part of submask */ 1539 1544 or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); ··· 1596 1601 1597 1602 /* If chip_id is -1; limit the cpu_core_mask to within PKG */ 1598 1603 if (chip_id == -1) 1599 - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); 1604 + 
cpumask_and(mask, mask, cpu_node_mask(cpu)); 1600 1605 1601 1606 for_each_cpu(i, mask) { 1602 1607 if (chip_id == cpu_to_chip_id(i)) { ··· 1696 1701 if (has_big_cores) { 1697 1702 pr_info("Big cores detected but using small core scheduling\n"); 1698 1703 powerpc_topology[i++] = 1699 - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); 1704 + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); 1700 1705 } else { 1701 - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); 1706 + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); 1702 1707 } 1703 1708 #endif 1704 1709 if (shared_caches) { 1705 1710 powerpc_topology[i++] = 1706 - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); 1711 + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); 1707 1712 } 1708 1713 1709 1714 if (has_coregroup_support()) { 1710 1715 powerpc_topology[i++] = 1711 - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); 1716 + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); 1712 1717 } 1713 1718 1714 - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); 1719 + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); 1715 1720 1716 1721 /* There must be one trailing NULL entry left. */ 1717 1722 BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
+1 -8
arch/riscv/Kconfig
··· 74 74 select ARCH_SUPPORTS_PER_VMA_LOCK if MMU 75 75 select ARCH_SUPPORTS_RT 76 76 select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK 77 + select ARCH_SUPPORTS_SCHED_MC if SMP 77 78 select ARCH_USE_CMPXCHG_LOCKREF if 64BIT 78 79 select ARCH_USE_MEMTEST 79 80 select ARCH_USE_QUEUED_RWLOCKS ··· 455 454 here. 456 455 457 456 If you don't know what to do here, say N. 458 - 459 - config SCHED_MC 460 - bool "Multi-core scheduler support" 461 - depends on SMP 462 - help 463 - Multi-core scheduler support improves the CPU scheduler's decision 464 - making when dealing with multi-core CPU chips at a cost of slightly 465 - increased overhead in some places. If unsure say N here. 466 457 467 458 config NR_CPUS 468 459 int "Maximum number of CPUs (2-512)"
+1
arch/riscv/kernel/asm-offsets.c
··· 3 3 * Copyright (C) 2012 Regents of the University of California 4 4 * Copyright (C) 2017 SiFive 5 5 */ 6 + #define COMPILE_OFFSETS 6 7 7 8 #include <linux/kbuild.h> 8 9 #include <linux/mm.h>
+2 -6
arch/s390/Kconfig
··· 554 554 depends on NUMA 555 555 default "1" 556 556 557 - config SCHED_SMT 558 - def_bool n 559 - 560 - config SCHED_MC 561 - def_bool n 562 - 563 557 config SCHED_TOPOLOGY 564 558 def_bool y 565 559 prompt "Topology scheduler support" 560 + select ARCH_SUPPORTS_SCHED_SMT 561 + select ARCH_SUPPORTS_SCHED_MC 566 562 select SCHED_SMT 567 563 select SCHED_MC 568 564 help
+1
arch/s390/kernel/asm-offsets.c
··· 4 4 * This code generates raw asm output which is post-processed to extract 5 5 * and format the required data. 6 6 */ 7 + #define COMPILE_OFFSETS 7 8 8 9 #include <linux/kbuild.h> 9 10 #include <linux/sched.h>
+7 -13
arch/s390/kernel/topology.c
··· 509 509 return rc; 510 510 } 511 511 512 - static const struct cpumask *cpu_thread_mask(int cpu) 513 - { 514 - return &cpu_topology[cpu].thread_mask; 515 - } 516 - 517 - 518 512 const struct cpumask *cpu_coregroup_mask(int cpu) 519 513 { 520 514 return &cpu_topology[cpu].core_mask; 521 515 } 522 516 523 - static const struct cpumask *cpu_book_mask(int cpu) 517 + static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) 524 518 { 525 519 return &cpu_topology[cpu].book_mask; 526 520 } 527 521 528 - static const struct cpumask *cpu_drawer_mask(int cpu) 522 + static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) 529 523 { 530 524 return &cpu_topology[cpu].drawer_mask; 531 525 } 532 526 533 527 static struct sched_domain_topology_level s390_topology[] = { 534 - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), 535 - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), 536 - SDTL_INIT(cpu_book_mask, NULL, BOOK), 537 - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), 538 - SDTL_INIT(cpu_cpu_mask, NULL, PKG), 528 + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 529 + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), 530 + SDTL_INIT(tl_book_mask, NULL, BOOK), 531 + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), 532 + SDTL_INIT(tl_pkg_mask, NULL, PKG), 539 533 { NULL, }, 540 534 }; 541 535
+1
arch/sh/kernel/asm-offsets.c
··· 8 8 * compile this file to assembler, and then extract the 9 9 * #defines from the assembly-language output. 10 10 */ 11 + #define COMPILE_OFFSETS 11 12 12 13 #include <linux/stddef.h> 13 14 #include <linux/types.h>
+2 -18
arch/sparc/Kconfig
··· 110 110 select HAVE_SETUP_PER_CPU_AREA 111 111 select NEED_PER_CPU_EMBED_FIRST_CHUNK 112 112 select NEED_PER_CPU_PAGE_FIRST_CHUNK 113 + select ARCH_SUPPORTS_SCHED_SMT if SMP 114 + select ARCH_SUPPORTS_SCHED_MC if SMP 113 115 114 116 config ARCH_PROC_KCORE_TEXT 115 117 def_bool y ··· 289 287 if SPARC64 || COMPILE_TEST 290 288 source "kernel/power/Kconfig" 291 289 endif 292 - 293 - config SCHED_SMT 294 - bool "SMT (Hyperthreading) scheduler support" 295 - depends on SPARC64 && SMP 296 - default y 297 - help 298 - SMT scheduler support improves the CPU scheduler's decision making 299 - when dealing with SPARC cpus at a cost of slightly increased overhead 300 - in some places. If unsure say N here. 301 - 302 - config SCHED_MC 303 - bool "Multi-core scheduler support" 304 - depends on SPARC64 && SMP 305 - default y 306 - help 307 - Multi-core scheduler support improves the CPU scheduler's decision 308 - making when dealing with multi-core CPU chips at a cost of slightly 309 - increased overhead in some places. If unsure say N here. 310 290 311 291 config CMDLINE_BOOL 312 292 bool "Default bootloader kernel arguments"
+1
arch/sparc/kernel/asm-offsets.c
··· 10 10 * 11 11 * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. 12 12 */ 13 + #define COMPILE_OFFSETS 13 14 14 15 #include <linux/sched.h> 15 16 #include <linux/mm_types.h>
+2
arch/um/kernel/asm-offsets.c
··· 1 + #define COMPILE_OFFSETS 2 + 1 3 #include <sysdep/kernel-offsets.h>
+4 -23
arch/x86/Kconfig
··· 330 330 imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI 331 331 select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE 332 332 select ARCH_SUPPORTS_PT_RECLAIM if X86_64 333 + select ARCH_SUPPORTS_SCHED_SMT if SMP 334 + select SCHED_SMT if SMP 335 + select ARCH_SUPPORTS_SCHED_CLUSTER if SMP 336 + select ARCH_SUPPORTS_SCHED_MC if SMP 333 337 334 338 config INSTRUCTION_DECODER 335 339 def_bool y ··· 1034 1030 1035 1031 This is purely to save memory: each supported CPU adds about 8KB 1036 1032 to the kernel image. 1037 - 1038 - config SCHED_CLUSTER 1039 - bool "Cluster scheduler support" 1040 - depends on SMP 1041 - default y 1042 - help 1043 - Cluster scheduler support improves the CPU scheduler's decision 1044 - making when dealing with machines that have clusters of CPUs. 1045 - Cluster usually means a couple of CPUs which are placed closely 1046 - by sharing mid-level caches, last-level cache tags or internal 1047 - busses. 1048 - 1049 - config SCHED_SMT 1050 - def_bool y if SMP 1051 - 1052 - config SCHED_MC 1053 - def_bool y 1054 - prompt "Multi-core scheduler support" 1055 - depends on SMP 1056 - help 1057 - Multi-core scheduler support improves the CPU scheduler's decision 1058 - making when dealing with multi-core CPU chips at a cost of slightly 1059 - increased overhead in some places. If unsure say N here. 1060 1033 1061 1034 config SCHED_MC_PRIO 1062 1035 bool "CPU core priorities scheduler support"
+4 -4
arch/x86/kernel/smpboot.c
··· 479 479 static bool x86_has_numa_in_package; 480 480 481 481 static struct sched_domain_topology_level x86_topology[] = { 482 - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), 482 + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 483 483 #ifdef CONFIG_SCHED_CLUSTER 484 - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), 484 + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), 485 485 #endif 486 486 #ifdef CONFIG_SCHED_MC 487 - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), 487 + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), 488 488 #endif 489 - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), 489 + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), 490 490 { NULL }, 491 491 }; 492 492
+1
arch/xtensa/kernel/asm-offsets.c
··· 11 11 * 12 12 * Chris Zankel <chris@zankel.net> 13 13 */ 14 + #define COMPILE_OFFSETS 14 15 15 16 #include <asm/processor.h> 16 17 #include <asm/coprocessor.h>
+4 -7
include/linux/preempt.h
··· 372 372 /* 373 373 * Migrate-Disable and why it is undesired. 374 374 * 375 - * When a preempted task becomes elegible to run under the ideal model (IOW it 375 + * When a preempted task becomes eligible to run under the ideal model (IOW it 376 376 * becomes one of the M highest priority tasks), it might still have to wait 377 377 * for the preemptee's migrate_disable() section to complete. Thereby suffering 378 378 * a reduction in bandwidth in the exact duration of the migrate_disable() ··· 387 387 * - a lower priority tasks; which under preempt_disable() could've instantly 388 388 * migrated away when another CPU becomes available, is now constrained 389 389 * by the ability to push the higher priority task away, which might itself be 390 - * in a migrate_disable() section, reducing it's available bandwidth. 390 + * in a migrate_disable() section, reducing its available bandwidth. 391 391 * 392 392 * IOW it trades latency / moves the interference term, but it stays in the 393 393 * system, and as long as it remains unbounded, the system is not fully ··· 399 399 * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a 400 400 * number of primitives into becoming preemptible, they would also allow 401 401 * migration. This turns out to break a bunch of per-cpu usage. To this end, 402 - * all these primitives employ migirate_disable() to restore this implicit 402 + * all these primitives employ migrate_disable() to restore this implicit 403 403 * assumption. 404 404 * 405 405 * This is a 'temporary' work-around at best. The correct solution is getting ··· 407 407 * per-cpu locking or short preempt-disable regions. 408 408 * 409 409 * The end goal must be to get rid of migrate_disable(), alternatively we need 410 - * a schedulability theory that does not depend on abritrary migration. 410 + * a schedulability theory that does not depend on arbitrary migration. 411 411 * 412 412 * 413 413 * Notes on the implementation. 
··· 424 424 * work-conserving schedulers. 425 425 * 426 426 */ 427 - extern void migrate_disable(void); 428 - extern void migrate_enable(void); 429 427 430 428 /** 431 429 * preempt_disable_nested - Disable preemption inside a normally preempt disabled section ··· 469 471 470 472 DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) 471 473 DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) 472 - DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) 473 474 474 475 #ifdef CONFIG_PREEMPT_DYNAMIC 475 476
+1 -1
include/linux/rcupdate.h
··· 24 24 #include <linux/compiler.h> 25 25 #include <linux/atomic.h> 26 26 #include <linux/irqflags.h> 27 - #include <linux/preempt.h> 27 + #include <linux/sched.h> 28 28 #include <linux/bottom_half.h> 29 29 #include <linux/lockdep.h> 30 30 #include <linux/cleanup.h>
+118
include/linux/sched.h
··· 49 49 #include <linux/tracepoint-defs.h> 50 50 #include <linux/unwind_deferred_types.h> 51 51 #include <asm/kmap_size.h> 52 + #ifndef COMPILE_OFFSETS 53 + #include <generated/rq-offsets.h> 54 + #endif 52 55 53 56 /* task_struct member predeclarations (sorted alphabetically): */ 54 57 struct audit_context; ··· 884 881 885 882 #ifdef CONFIG_CGROUP_SCHED 886 883 struct task_group *sched_task_group; 884 + #ifdef CONFIG_CFS_BANDWIDTH 885 + struct callback_head sched_throttle_work; 886 + struct list_head throttle_node; 887 + bool throttled; 888 + #endif 887 889 #endif 888 890 889 891 ··· 2317 2309 #define alloc_tag_save(_tag) NULL 2318 2310 #define alloc_tag_restore(_tag, _old) do {} while (0) 2319 2311 #endif 2312 + 2313 + #ifndef MODULE 2314 + #ifndef COMPILE_OFFSETS 2315 + 2316 + extern void ___migrate_enable(void); 2317 + 2318 + struct rq; 2319 + DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 2320 + 2321 + /* 2322 + * The "struct rq" is not available here, so we can't access the 2323 + * "runqueues" with this_cpu_ptr(), as the compilation will fail in 2324 + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): 2325 + * typeof((ptr) + 0) 2326 + * 2327 + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. 2328 + */ 2329 + #ifdef CONFIG_SMP 2330 + #define this_rq_raw() arch_raw_cpu_ptr(&runqueues) 2331 + #else 2332 + #define this_rq_raw() PERCPU_PTR(&runqueues) 2333 + #endif 2334 + #define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) 2335 + 2336 + static inline void __migrate_enable(void) 2337 + { 2338 + struct task_struct *p = current; 2339 + 2340 + #ifdef CONFIG_DEBUG_PREEMPT 2341 + /* 2342 + * Check both overflow from migrate_disable() and superfluous 2343 + * migrate_enable(). 
2344 + */ 2345 + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) 2346 + return; 2347 + #endif 2348 + 2349 + if (p->migration_disabled > 1) { 2350 + p->migration_disabled--; 2351 + return; 2352 + } 2353 + 2354 + /* 2355 + * Ensure stop_task runs either before or after this, and that 2356 + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 2357 + */ 2358 + guard(preempt)(); 2359 + if (unlikely(p->cpus_ptr != &p->cpus_mask)) 2360 + ___migrate_enable(); 2361 + /* 2362 + * Mustn't clear migration_disabled() until cpus_ptr points back at the 2363 + * regular cpus_mask, otherwise things that race (eg. 2364 + * select_fallback_rq) get confused. 2365 + */ 2366 + barrier(); 2367 + p->migration_disabled = 0; 2368 + this_rq_pinned()--; 2369 + } 2370 + 2371 + static inline void __migrate_disable(void) 2372 + { 2373 + struct task_struct *p = current; 2374 + 2375 + if (p->migration_disabled) { 2376 + #ifdef CONFIG_DEBUG_PREEMPT 2377 + /* 2378 + *Warn about overflow half-way through the range. 2379 + */ 2380 + WARN_ON_ONCE((s16)p->migration_disabled < 0); 2381 + #endif 2382 + p->migration_disabled++; 2383 + return; 2384 + } 2385 + 2386 + guard(preempt)(); 2387 + this_rq_pinned()++; 2388 + p->migration_disabled = 1; 2389 + } 2390 + #else /* !COMPILE_OFFSETS */ 2391 + static inline void __migrate_disable(void) { } 2392 + static inline void __migrate_enable(void) { } 2393 + #endif /* !COMPILE_OFFSETS */ 2394 + 2395 + /* 2396 + * So that it is possible to not export the runqueues variable, define and 2397 + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use 2398 + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will 2399 + * be defined in kernel/sched/core.c. 
2400 + */ 2401 + #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE 2402 + static inline void migrate_disable(void) 2403 + { 2404 + __migrate_disable(); 2405 + } 2406 + 2407 + static inline void migrate_enable(void) 2408 + { 2409 + __migrate_enable(); 2410 + } 2411 + #else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ 2412 + extern void migrate_disable(void); 2413 + extern void migrate_enable(void); 2414 + #endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ 2415 + 2416 + #else /* MODULE */ 2417 + extern void migrate_disable(void); 2418 + extern void migrate_enable(void); 2419 + #endif /* MODULE */ 2420 + 2421 + DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) 2320 2422 2321 2423 #endif
+10 -19
include/linux/sched/topology.h
··· 30 30 }; 31 31 extern const struct sd_flag_debug sd_flag_debug[]; 32 32 33 + struct sched_domain_topology_level; 34 + 33 35 #ifdef CONFIG_SCHED_SMT 34 - static inline int cpu_smt_flags(void) 35 - { 36 - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; 37 - } 36 + extern int cpu_smt_flags(void); 37 + extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); 38 38 #endif 39 39 40 40 #ifdef CONFIG_SCHED_CLUSTER 41 - static inline int cpu_cluster_flags(void) 42 - { 43 - return SD_CLUSTER | SD_SHARE_LLC; 44 - } 41 + extern int cpu_cluster_flags(void); 42 + extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); 45 43 #endif 46 44 47 45 #ifdef CONFIG_SCHED_MC 48 - static inline int cpu_core_flags(void) 49 - { 50 - return SD_SHARE_LLC; 51 - } 46 + extern int cpu_core_flags(void); 47 + extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); 52 48 #endif 53 49 54 - #ifdef CONFIG_NUMA 55 - static inline int cpu_numa_flags(void) 56 - { 57 - return SD_NUMA; 58 - } 59 - #endif 50 + extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); 60 51 61 52 extern int arch_asym_cpu_priority(int cpu); 62 53 ··· 163 172 bool cpus_share_cache(int this_cpu, int that_cpu); 164 173 bool cpus_share_resources(int this_cpu, int that_cpu); 165 174 166 - typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 175 + typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); 167 176 typedef int (*sched_domain_flags_f)(void); 168 177 169 178 struct sd_data {
+1 -1
include/linux/topology.h
··· 260 260 261 261 #endif 262 262 263 - static inline const struct cpumask *cpu_cpu_mask(int cpu) 263 + static inline const struct cpumask *cpu_node_mask(int cpu) 264 264 { 265 265 return cpumask_of_node(cpu_to_node(cpu)); 266 266 }
+1
kernel/bpf/verifier.c
··· 23859 23859 BTF_SET_START(btf_id_deny) 23860 23860 BTF_ID_UNUSED 23861 23861 #ifdef CONFIG_SMP 23862 + BTF_ID(func, ___migrate_enable) 23862 23863 BTF_ID(func, migrate_disable) 23863 23864 BTF_ID(func, migrate_enable) 23864 23865 #endif
+17 -49
kernel/sched/core.c
··· 7 7 * Copyright (C) 1991-2002 Linus Torvalds 8 8 * Copyright (C) 1998-2024 Ingo Molnar, Red Hat 9 9 */ 10 + #define INSTANTIATE_EXPORTED_MIGRATE_DISABLE 11 + #include <linux/sched.h> 10 12 #include <linux/highmem.h> 11 13 #include <linux/hrtimer_api.h> 12 14 #include <linux/ktime_api.h> ··· 2383 2381 __do_set_cpus_allowed(p, &ac); 2384 2382 } 2385 2383 2386 - void migrate_disable(void) 2387 - { 2388 - struct task_struct *p = current; 2389 - 2390 - if (p->migration_disabled) { 2391 - #ifdef CONFIG_DEBUG_PREEMPT 2392 - /* 2393 - *Warn about overflow half-way through the range. 2394 - */ 2395 - WARN_ON_ONCE((s16)p->migration_disabled < 0); 2396 - #endif 2397 - p->migration_disabled++; 2398 - return; 2399 - } 2400 - 2401 - guard(preempt)(); 2402 - this_rq()->nr_pinned++; 2403 - p->migration_disabled = 1; 2404 - } 2405 - EXPORT_SYMBOL_GPL(migrate_disable); 2406 - 2407 - void migrate_enable(void) 2384 + void ___migrate_enable(void) 2408 2385 { 2409 2386 struct task_struct *p = current; 2410 2387 struct affinity_context ac = { ··· 2391 2410 .flags = SCA_MIGRATE_ENABLE, 2392 2411 }; 2393 2412 2394 - #ifdef CONFIG_DEBUG_PREEMPT 2395 - /* 2396 - * Check both overflow from migrate_disable() and superfluous 2397 - * migrate_enable(). 2398 - */ 2399 - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) 2400 - return; 2401 - #endif 2413 + __set_cpus_allowed_ptr(p, &ac); 2414 + } 2415 + EXPORT_SYMBOL_GPL(___migrate_enable); 2402 2416 2403 - if (p->migration_disabled > 1) { 2404 - p->migration_disabled--; 2405 - return; 2406 - } 2417 + void migrate_disable(void) 2418 + { 2419 + __migrate_disable(); 2420 + } 2421 + EXPORT_SYMBOL_GPL(migrate_disable); 2407 2422 2408 - /* 2409 - * Ensure stop_task runs either before or after this, and that 2410 - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 
2411 - */ 2412 - guard(preempt)(); 2413 - if (p->cpus_ptr != &p->cpus_mask) 2414 - __set_cpus_allowed_ptr(p, &ac); 2415 - /* 2416 - * Mustn't clear migration_disabled() until cpus_ptr points back at the 2417 - * regular cpus_mask, otherwise things that race (eg. 2418 - * select_fallback_rq) get confused. 2419 - */ 2420 - barrier(); 2421 - p->migration_disabled = 0; 2422 - this_rq()->nr_pinned--; 2423 + void migrate_enable(void) 2424 + { 2425 + __migrate_enable(); 2423 2426 } 2424 2427 EXPORT_SYMBOL_GPL(migrate_enable); 2425 2428 ··· 4455 4490 4456 4491 #ifdef CONFIG_FAIR_GROUP_SCHED 4457 4492 p->se.cfs_rq = NULL; 4493 + #ifdef CONFIG_CFS_BANDWIDTH 4494 + init_cfs_throttle_work(p); 4495 + #endif 4458 4496 #endif 4459 4497 4460 4498 #ifdef CONFIG_SCHEDSTATS
+49 -24
kernel/sched/deadline.c
··· 2551 2551 return -1; 2552 2552 } 2553 2553 2554 + static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) 2555 + { 2556 + struct task_struct *p; 2557 + 2558 + if (!has_pushable_dl_tasks(rq)) 2559 + return NULL; 2560 + 2561 + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); 2562 + 2563 + WARN_ON_ONCE(rq->cpu != task_cpu(p)); 2564 + WARN_ON_ONCE(task_current(rq, p)); 2565 + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); 2566 + 2567 + WARN_ON_ONCE(!task_on_rq_queued(p)); 2568 + WARN_ON_ONCE(!dl_task(p)); 2569 + 2570 + return p; 2571 + } 2572 + 2554 2573 /* Locks the rq it finds */ 2555 2574 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) 2556 2575 { ··· 2597 2578 2598 2579 /* Retry if something changed. */ 2599 2580 if (double_lock_balance(rq, later_rq)) { 2600 - if (unlikely(task_rq(task) != rq || 2581 + /* 2582 + * double_lock_balance had to release rq->lock, in the 2583 + * meantime, task may no longer be fit to be migrated. 2584 + * Check the following to ensure that the task is 2585 + * still suitable for migration: 2586 + * 1. It is possible the task was scheduled, 2587 + * migrate_disabled was set and then got preempted, 2588 + * so we must check the task migration disable 2589 + * flag. 2590 + * 2. The CPU picked is in the task's affinity. 2591 + * 3. For throttled task (dl_task_offline_migration), 2592 + * check the following: 2593 + * - the task is not on the rq anymore (it was 2594 + * migrated) 2595 + * - the task is not on CPU anymore 2596 + * - the task is still a dl task 2597 + * - the task is not queued on the rq anymore 2598 + * 4. For the non-throttled task (push_dl_task), the 2599 + * check to ensure that this task is still at the 2600 + * head of the pushable tasks list is enough. 
2601 + */ 2602 + if (unlikely(is_migration_disabled(task) || 2601 2603 !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || 2602 - task_on_cpu(rq, task) || 2603 - !dl_task(task) || 2604 - is_migration_disabled(task) || 2605 - !task_on_rq_queued(task))) { 2604 + (task->dl.dl_throttled && 2605 + (task_rq(task) != rq || 2606 + task_on_cpu(rq, task) || 2607 + !dl_task(task) || 2608 + !task_on_rq_queued(task))) || 2609 + (!task->dl.dl_throttled && 2610 + task != pick_next_pushable_dl_task(rq)))) { 2611 + 2606 2612 double_unlock_balance(rq, later_rq); 2607 2613 later_rq = NULL; 2608 2614 break; ··· 2648 2604 } 2649 2605 2650 2606 return later_rq; 2651 - } 2652 - 2653 - static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) 2654 - { 2655 - struct task_struct *p; 2656 - 2657 - if (!has_pushable_dl_tasks(rq)) 2658 - return NULL; 2659 - 2660 - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); 2661 - 2662 - WARN_ON_ONCE(rq->cpu != task_cpu(p)); 2663 - WARN_ON_ONCE(task_current(rq, p)); 2664 - WARN_ON_ONCE(p->nr_cpus_allowed <= 1); 2665 - 2666 - WARN_ON_ONCE(!task_on_rq_queued(p)); 2667 - WARN_ON_ONCE(!dl_task(p)); 2668 - 2669 - return p; 2670 2607 } 2671 2608 2672 2609 /*
+299 -208
kernel/sched/fair.c
··· 3957 3957 if (!gcfs_rq || !gcfs_rq->load.weight) 3958 3958 return; 3959 3959 3960 - if (throttled_hierarchy(gcfs_rq)) 3961 - return; 3962 - 3963 3960 shares = calc_group_shares(gcfs_rq); 3964 3961 if (unlikely(se->load.weight != shares)) 3965 3962 reweight_entity(cfs_rq_of(se), se, shares); ··· 5288 5291 5289 5292 if (cfs_rq->nr_queued == 1) { 5290 5293 check_enqueue_throttle(cfs_rq); 5291 - if (!throttled_hierarchy(cfs_rq)) { 5292 - list_add_leaf_cfs_rq(cfs_rq); 5293 - } else { 5294 + list_add_leaf_cfs_rq(cfs_rq); 5294 5295 #ifdef CONFIG_CFS_BANDWIDTH 5296 + if (cfs_rq->pelt_clock_throttled) { 5295 5297 struct rq *rq = rq_of(cfs_rq); 5296 5298 5297 - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) 5298 - cfs_rq->throttled_clock = rq_clock(rq); 5299 - if (!cfs_rq->throttled_clock_self) 5300 - cfs_rq->throttled_clock_self = rq_clock(rq); 5301 - #endif 5299 + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5300 + cfs_rq->throttled_clock_pelt; 5301 + cfs_rq->pelt_clock_throttled = 0; 5302 5302 } 5303 + #endif 5303 5304 } 5304 5305 } 5305 5306 ··· 5336 5341 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5337 5342 5338 5343 cfs_rq->h_nr_runnable--; 5339 - if (cfs_rq_throttled(cfs_rq)) 5340 - break; 5341 5344 } 5342 5345 } 5343 5346 ··· 5356 5363 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5357 5364 5358 5365 cfs_rq->h_nr_runnable++; 5359 - if (cfs_rq_throttled(cfs_rq)) 5360 - break; 5361 5366 } 5362 5367 } 5363 5368 ··· 5383 5392 * DELAY_DEQUEUE relies on spurious wakeups, special task 5384 5393 * states must not suffer spurious wakeups, excempt them. 
5385 5394 */ 5386 - if (flags & DEQUEUE_SPECIAL) 5395 + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) 5387 5396 delay = false; 5388 5397 5389 5398 WARN_ON_ONCE(delay && se->sched_delayed); ··· 5441 5450 if (flags & DEQUEUE_DELAYED) 5442 5451 finish_delayed_dequeue_entity(se); 5443 5452 5444 - if (cfs_rq->nr_queued == 0) 5453 + if (cfs_rq->nr_queued == 0) { 5445 5454 update_idle_cfs_rq_clock_pelt(cfs_rq); 5455 + #ifdef CONFIG_CFS_BANDWIDTH 5456 + if (throttled_hierarchy(cfs_rq)) { 5457 + struct rq *rq = rq_of(cfs_rq); 5458 + 5459 + list_del_leaf_cfs_rq(cfs_rq); 5460 + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5461 + cfs_rq->pelt_clock_throttled = 1; 5462 + } 5463 + #endif 5464 + } 5446 5465 5447 5466 return true; 5448 5467 } ··· 5726 5725 return cfs_bandwidth_used() && cfs_rq->throttled; 5727 5726 } 5728 5727 5728 + static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) 5729 + { 5730 + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; 5731 + } 5732 + 5729 5733 /* check whether cfs_rq, or any parent, is throttled */ 5730 5734 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 5731 5735 { 5732 5736 return cfs_bandwidth_used() && cfs_rq->throttle_count; 5733 5737 } 5734 5738 5735 - /* 5736 - * Ensure that neither of the group entities corresponding to src_cpu or 5737 - * dest_cpu are members of a throttled hierarchy when performing group 5738 - * load-balance operations. 
5739 - */ 5740 - static inline int throttled_lb_pair(struct task_group *tg, 5741 - int src_cpu, int dest_cpu) 5739 + static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) 5742 5740 { 5743 - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; 5744 - 5745 - src_cfs_rq = tg->cfs_rq[src_cpu]; 5746 - dest_cfs_rq = tg->cfs_rq[dest_cpu]; 5747 - 5748 - return throttled_hierarchy(src_cfs_rq) || 5749 - throttled_hierarchy(dest_cfs_rq); 5741 + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); 5750 5742 } 5751 5743 5744 + static inline bool task_is_throttled(struct task_struct *p) 5745 + { 5746 + return cfs_bandwidth_used() && p->throttled; 5747 + } 5748 + 5749 + static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); 5750 + static void throttle_cfs_rq_work(struct callback_head *work) 5751 + { 5752 + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); 5753 + struct sched_entity *se; 5754 + struct cfs_rq *cfs_rq; 5755 + struct rq *rq; 5756 + 5757 + WARN_ON_ONCE(p != current); 5758 + p->sched_throttle_work.next = &p->sched_throttle_work; 5759 + 5760 + /* 5761 + * If task is exiting, then there won't be a return to userspace, so we 5762 + * don't have to bother with any of this. 5763 + */ 5764 + if ((p->flags & PF_EXITING)) 5765 + return; 5766 + 5767 + scoped_guard(task_rq_lock, p) { 5768 + se = &p->se; 5769 + cfs_rq = cfs_rq_of(se); 5770 + 5771 + /* Raced, forget */ 5772 + if (p->sched_class != &fair_sched_class) 5773 + return; 5774 + 5775 + /* 5776 + * If not in limbo, then either replenish has happened or this 5777 + * task got migrated out of the throttled cfs_rq, move along. 
5778 + */ 5779 + if (!cfs_rq->throttle_count) 5780 + return; 5781 + rq = scope.rq; 5782 + update_rq_clock(rq); 5783 + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); 5784 + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); 5785 + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); 5786 + /* 5787 + * Must not set throttled before dequeue or dequeue will 5788 + * mistakenly regard this task as an already throttled one. 5789 + */ 5790 + p->throttled = true; 5791 + resched_curr(rq); 5792 + } 5793 + } 5794 + 5795 + void init_cfs_throttle_work(struct task_struct *p) 5796 + { 5797 + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); 5798 + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ 5799 + p->sched_throttle_work.next = &p->sched_throttle_work; 5800 + INIT_LIST_HEAD(&p->throttle_node); 5801 + } 5802 + 5803 + /* 5804 + * Task is throttled and someone wants to dequeue it again: 5805 + * it could be sched/core when core needs to do things like 5806 + * task affinity change, task group change, task sched class 5807 + * change etc. and in these cases, DEQUEUE_SLEEP is not set; 5808 + * or the task is blocked after throttled due to freezer etc. 5809 + * and in these cases, DEQUEUE_SLEEP is set. 5810 + */ 5811 + static void detach_task_cfs_rq(struct task_struct *p); 5812 + static void dequeue_throttled_task(struct task_struct *p, int flags) 5813 + { 5814 + WARN_ON_ONCE(p->se.on_rq); 5815 + list_del_init(&p->throttle_node); 5816 + 5817 + /* task blocked after throttled */ 5818 + if (flags & DEQUEUE_SLEEP) { 5819 + p->throttled = false; 5820 + return; 5821 + } 5822 + 5823 + /* 5824 + * task is migrating off its old cfs_rq, detach 5825 + * the task's load from its old cfs_rq. 
5826 + */ 5827 + if (task_on_rq_migrating(p)) 5828 + detach_task_cfs_rq(p); 5829 + } 5830 + 5831 + static bool enqueue_throttled_task(struct task_struct *p) 5832 + { 5833 + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 5834 + 5835 + /* @p should have gone through dequeue_throttled_task() first */ 5836 + WARN_ON_ONCE(!list_empty(&p->throttle_node)); 5837 + 5838 + /* 5839 + * If the throttled task @p is enqueued to a throttled cfs_rq, 5840 + * take the fast path by directly putting the task on the 5841 + * target cfs_rq's limbo list. 5842 + * 5843 + * Do not do that when @p is current because the following race can 5844 + * cause @p's group_node to be incorectly re-insterted in its rq's 5845 + * cfs_tasks list, despite being throttled: 5846 + * 5847 + * cpuX cpuY 5848 + * p ret2user 5849 + * throttle_cfs_rq_work() sched_move_task(p) 5850 + * LOCK task_rq_lock 5851 + * dequeue_task_fair(p) 5852 + * UNLOCK task_rq_lock 5853 + * LOCK task_rq_lock 5854 + * task_current_donor(p) == true 5855 + * task_on_rq_queued(p) == true 5856 + * dequeue_task(p) 5857 + * put_prev_task(p) 5858 + * sched_change_group() 5859 + * enqueue_task(p) -> p's new cfs_rq 5860 + * is throttled, go 5861 + * fast path and skip 5862 + * actual enqueue 5863 + * set_next_task(p) 5864 + * list_move(&se->group_node, &rq->cfs_tasks); // bug 5865 + * schedule() 5866 + * 5867 + * In the above race case, @p current cfs_rq is in the same rq as 5868 + * its previous cfs_rq because sched_move_task() only moves a task 5869 + * to a different group from the same rq, so we can use its current 5870 + * cfs_rq to derive rq and test if the task is current. 
5871 + */ 5872 + if (throttled_hierarchy(cfs_rq) && 5873 + !task_current_donor(rq_of(cfs_rq), p)) { 5874 + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); 5875 + return true; 5876 + } 5877 + 5878 + /* we can't take the fast path, do an actual enqueue*/ 5879 + p->throttled = false; 5880 + return false; 5881 + } 5882 + 5883 + static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); 5752 5884 static int tg_unthrottle_up(struct task_group *tg, void *data) 5753 5885 { 5754 5886 struct rq *rq = data; 5755 5887 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5888 + struct task_struct *p, *tmp; 5756 5889 5757 - cfs_rq->throttle_count--; 5758 - if (!cfs_rq->throttle_count) { 5890 + if (--cfs_rq->throttle_count) 5891 + return 0; 5892 + 5893 + if (cfs_rq->pelt_clock_throttled) { 5759 5894 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5760 5895 cfs_rq->throttled_clock_pelt; 5761 - 5762 - /* Add cfs_rq with load or one or more already running entities to the list */ 5763 - if (!cfs_rq_is_decayed(cfs_rq)) 5764 - list_add_leaf_cfs_rq(cfs_rq); 5765 - 5766 - if (cfs_rq->throttled_clock_self) { 5767 - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; 5768 - 5769 - cfs_rq->throttled_clock_self = 0; 5770 - 5771 - if (WARN_ON_ONCE((s64)delta < 0)) 5772 - delta = 0; 5773 - 5774 - cfs_rq->throttled_clock_self_time += delta; 5775 - } 5896 + cfs_rq->pelt_clock_throttled = 0; 5776 5897 } 5777 5898 5899 + if (cfs_rq->throttled_clock_self) { 5900 + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; 5901 + 5902 + cfs_rq->throttled_clock_self = 0; 5903 + 5904 + if (WARN_ON_ONCE((s64)delta < 0)) 5905 + delta = 0; 5906 + 5907 + cfs_rq->throttled_clock_self_time += delta; 5908 + } 5909 + 5910 + /* Re-enqueue the tasks that have been throttled at this level. 
*/ 5911 + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { 5912 + list_del_init(&p->throttle_node); 5913 + p->throttled = false; 5914 + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); 5915 + } 5916 + 5917 + /* Add cfs_rq with load or one or more already running entities to the list */ 5918 + if (!cfs_rq_is_decayed(cfs_rq)) 5919 + list_add_leaf_cfs_rq(cfs_rq); 5920 + 5778 5921 return 0; 5922 + } 5923 + 5924 + static inline bool task_has_throttle_work(struct task_struct *p) 5925 + { 5926 + return p->sched_throttle_work.next != &p->sched_throttle_work; 5927 + } 5928 + 5929 + static inline void task_throttle_setup_work(struct task_struct *p) 5930 + { 5931 + if (task_has_throttle_work(p)) 5932 + return; 5933 + 5934 + /* 5935 + * Kthreads and exiting tasks don't return to userspace, so adding the 5936 + * work is pointless 5937 + */ 5938 + if ((p->flags & (PF_EXITING | PF_KTHREAD))) 5939 + return; 5940 + 5941 + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); 5942 + } 5943 + 5944 + static void record_throttle_clock(struct cfs_rq *cfs_rq) 5945 + { 5946 + struct rq *rq = rq_of(cfs_rq); 5947 + 5948 + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) 5949 + cfs_rq->throttled_clock = rq_clock(rq); 5950 + 5951 + if (!cfs_rq->throttled_clock_self) 5952 + cfs_rq->throttled_clock_self = rq_clock(rq); 5779 5953 } 5780 5954 5781 5955 static int tg_throttle_down(struct task_group *tg, void *data) ··· 5958 5782 struct rq *rq = data; 5959 5783 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5960 5784 5961 - /* group is entering throttled state, stop time */ 5962 - if (!cfs_rq->throttle_count) { 5963 - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5785 + if (cfs_rq->throttle_count++) 5786 + return 0; 5787 + 5788 + /* 5789 + * For cfs_rqs that still have entities enqueued, PELT clock 5790 + * stop happens at dequeue time when all entities are dequeued. 
5791 + */ 5792 + if (!cfs_rq->nr_queued) { 5964 5793 list_del_leaf_cfs_rq(cfs_rq); 5965 - 5966 - WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5967 - if (cfs_rq->nr_queued) 5968 - cfs_rq->throttled_clock_self = rq_clock(rq); 5794 + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5795 + cfs_rq->pelt_clock_throttled = 1; 5969 5796 } 5970 - cfs_rq->throttle_count++; 5971 5797 5798 + WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5799 + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); 5972 5800 return 0; 5973 5801 } 5974 5802 ··· 5980 5800 { 5981 5801 struct rq *rq = rq_of(cfs_rq); 5982 5802 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5983 - struct sched_entity *se; 5984 - long queued_delta, runnable_delta, idle_delta, dequeue = 1; 5803 + int dequeue = 1; 5985 5804 5986 5805 raw_spin_lock(&cfs_b->lock); 5987 5806 /* This will start the period timer if necessary */ ··· 6003 5824 if (!dequeue) 6004 5825 return false; /* Throttle no longer required. */ 6005 5826 6006 - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 6007 - 6008 5827 /* freeze hierarchy runnable averages while throttled */ 6009 5828 rcu_read_lock(); 6010 5829 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 6011 5830 rcu_read_unlock(); 6012 5831 6013 - queued_delta = cfs_rq->h_nr_queued; 6014 - runnable_delta = cfs_rq->h_nr_runnable; 6015 - idle_delta = cfs_rq->h_nr_idle; 6016 - for_each_sched_entity(se) { 6017 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6018 - int flags; 6019 - 6020 - /* throttled entity or throttle-on-deactivate */ 6021 - if (!se->on_rq) 6022 - goto done; 6023 - 6024 - /* 6025 - * Abuse SPECIAL to avoid delayed dequeue in this instance. 6026 - * This avoids teaching dequeue_entities() about throttled 6027 - * entities and keeps things relatively simple. 
6028 - */ 6029 - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; 6030 - if (se->sched_delayed) 6031 - flags |= DEQUEUE_DELAYED; 6032 - dequeue_entity(qcfs_rq, se, flags); 6033 - 6034 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6035 - idle_delta = cfs_rq->h_nr_queued; 6036 - 6037 - qcfs_rq->h_nr_queued -= queued_delta; 6038 - qcfs_rq->h_nr_runnable -= runnable_delta; 6039 - qcfs_rq->h_nr_idle -= idle_delta; 6040 - 6041 - if (qcfs_rq->load.weight) { 6042 - /* Avoid re-evaluating load for this entity: */ 6043 - se = parent_entity(se); 6044 - break; 6045 - } 6046 - } 6047 - 6048 - for_each_sched_entity(se) { 6049 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6050 - /* throttled entity or throttle-on-deactivate */ 6051 - if (!se->on_rq) 6052 - goto done; 6053 - 6054 - update_load_avg(qcfs_rq, se, 0); 6055 - se_update_runnable(se); 6056 - 6057 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6058 - idle_delta = cfs_rq->h_nr_queued; 6059 - 6060 - qcfs_rq->h_nr_queued -= queued_delta; 6061 - qcfs_rq->h_nr_runnable -= runnable_delta; 6062 - qcfs_rq->h_nr_idle -= idle_delta; 6063 - } 6064 - 6065 - /* At this point se is NULL and we are at root level*/ 6066 - sub_nr_running(rq, queued_delta); 6067 - done: 6068 5832 /* 6069 5833 * Note: distribution will already see us throttled via the 6070 5834 * throttled-list. rq->lock protects completion. 
6071 5835 */ 6072 5836 cfs_rq->throttled = 1; 6073 5837 WARN_ON_ONCE(cfs_rq->throttled_clock); 6074 - if (cfs_rq->nr_queued) 6075 - cfs_rq->throttled_clock = rq_clock(rq); 6076 5838 return true; 6077 5839 } 6078 5840 ··· 6021 5901 { 6022 5902 struct rq *rq = rq_of(cfs_rq); 6023 5903 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 6024 - struct sched_entity *se; 6025 - long queued_delta, runnable_delta, idle_delta; 6026 - long rq_h_nr_queued = rq->cfs.h_nr_queued; 5904 + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; 5905 + 5906 + /* 5907 + * It's possible we are called with !runtime_remaining due to things 5908 + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async 5909 + * unthrottled us with a positive runtime_remaining but other still 5910 + * running entities consumed those runtime before we reached here. 5911 + * 5912 + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining 5913 + * because any enqueue in tg_unthrottle_up() will immediately trigger a 5914 + * throttle, which is not supposed to happen on unthrottle path. 5915 + */ 5916 + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) 5917 + return; 6027 5918 6028 5919 se = cfs_rq->tg->se[cpu_of(rq)]; 6029 5920 ··· 6064 5933 if (list_add_leaf_cfs_rq(cfs_rq_of(se))) 6065 5934 break; 6066 5935 } 6067 - goto unthrottle_throttle; 6068 5936 } 6069 5937 6070 - queued_delta = cfs_rq->h_nr_queued; 6071 - runnable_delta = cfs_rq->h_nr_runnable; 6072 - idle_delta = cfs_rq->h_nr_idle; 6073 - for_each_sched_entity(se) { 6074 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6075 - 6076 - /* Handle any unfinished DELAY_DEQUEUE business first. 
*/ 6077 - if (se->sched_delayed) { 6078 - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; 6079 - 6080 - dequeue_entity(qcfs_rq, se, flags); 6081 - } else if (se->on_rq) 6082 - break; 6083 - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); 6084 - 6085 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6086 - idle_delta = cfs_rq->h_nr_queued; 6087 - 6088 - qcfs_rq->h_nr_queued += queued_delta; 6089 - qcfs_rq->h_nr_runnable += runnable_delta; 6090 - qcfs_rq->h_nr_idle += idle_delta; 6091 - 6092 - /* end evaluation on encountering a throttled cfs_rq */ 6093 - if (cfs_rq_throttled(qcfs_rq)) 6094 - goto unthrottle_throttle; 6095 - } 6096 - 6097 - for_each_sched_entity(se) { 6098 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6099 - 6100 - update_load_avg(qcfs_rq, se, UPDATE_TG); 6101 - se_update_runnable(se); 6102 - 6103 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6104 - idle_delta = cfs_rq->h_nr_queued; 6105 - 6106 - qcfs_rq->h_nr_queued += queued_delta; 6107 - qcfs_rq->h_nr_runnable += runnable_delta; 6108 - qcfs_rq->h_nr_idle += idle_delta; 6109 - 6110 - /* end evaluation on encountering a throttled cfs_rq */ 6111 - if (cfs_rq_throttled(qcfs_rq)) 6112 - goto unthrottle_throttle; 6113 - } 6114 - 6115 - /* Start the fair server if un-throttling resulted in new runnable tasks */ 6116 - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) 6117 - dl_server_start(&rq->fair_server); 6118 - 6119 - /* At this point se is NULL and we are at root level*/ 6120 - add_nr_running(rq, queued_delta); 6121 - 6122 - unthrottle_throttle: 6123 5938 assert_list_leaf_cfs_rq(rq); 6124 5939 6125 5940 /* Determine whether we need to wake up potentially idle CPU: */ ··· 6549 6472 cfs_rq->runtime_enabled = 0; 6550 6473 INIT_LIST_HEAD(&cfs_rq->throttled_list); 6551 6474 INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); 6475 + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); 6552 6476 } 6553 6477 6554 6478 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) ··· 6717 6639 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 6718 
6640 static inline void sync_throttle(struct task_group *tg, int cpu) {} 6719 6641 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 6642 + static void task_throttle_setup_work(struct task_struct *p) {} 6643 + static bool task_is_throttled(struct task_struct *p) { return false; } 6644 + static void dequeue_throttled_task(struct task_struct *p, int flags) {} 6645 + static bool enqueue_throttled_task(struct task_struct *p) { return false; } 6646 + static void record_throttle_clock(struct cfs_rq *cfs_rq) {} 6720 6647 6721 6648 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 6722 6649 { 6723 6650 return 0; 6651 + } 6652 + 6653 + static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) 6654 + { 6655 + return false; 6724 6656 } 6725 6657 6726 6658 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) ··· 6738 6650 return 0; 6739 6651 } 6740 6652 6741 - static inline int throttled_lb_pair(struct task_group *tg, 6742 - int src_cpu, int dest_cpu) 6653 + static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) 6743 6654 { 6744 6655 return 0; 6745 6656 } ··· 6918 6831 int rq_h_nr_queued = rq->cfs.h_nr_queued; 6919 6832 u64 slice = 0; 6920 6833 6834 + if (task_is_throttled(p) && enqueue_throttled_task(p)) 6835 + return; 6836 + 6921 6837 /* 6922 6838 * The code below (indirectly) updates schedutil which looks at 6923 6839 * the cfs_rq utilization to select a frequency. 
··· 6973 6883 if (cfs_rq_is_idle(cfs_rq)) 6974 6884 h_nr_idle = 1; 6975 6885 6976 - /* end evaluation on encountering a throttled cfs_rq */ 6977 - if (cfs_rq_throttled(cfs_rq)) 6978 - goto enqueue_throttle; 6979 - 6980 6886 flags = ENQUEUE_WAKEUP; 6981 6887 } 6982 6888 ··· 6994 6908 6995 6909 if (cfs_rq_is_idle(cfs_rq)) 6996 6910 h_nr_idle = 1; 6997 - 6998 - /* end evaluation on encountering a throttled cfs_rq */ 6999 - if (cfs_rq_throttled(cfs_rq)) 7000 - goto enqueue_throttle; 7001 6911 } 7002 6912 7003 6913 if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { ··· 7023 6941 if (!task_new) 7024 6942 check_update_overutilized_status(rq); 7025 6943 7026 - enqueue_throttle: 7027 6944 assert_list_leaf_cfs_rq(rq); 7028 6945 7029 6946 hrtick_update(rq); ··· 7044 6963 bool was_sched_idle = sched_idle_rq(rq); 7045 6964 bool task_sleep = flags & DEQUEUE_SLEEP; 7046 6965 bool task_delayed = flags & DEQUEUE_DELAYED; 6966 + bool task_throttled = flags & DEQUEUE_THROTTLE; 7047 6967 struct task_struct *p = NULL; 7048 6968 int h_nr_idle = 0; 7049 6969 int h_nr_queued = 0; ··· 7078 6996 if (cfs_rq_is_idle(cfs_rq)) 7079 6997 h_nr_idle = h_nr_queued; 7080 6998 7081 - /* end evaluation on encountering a throttled cfs_rq */ 7082 - if (cfs_rq_throttled(cfs_rq)) 7083 - return 0; 6999 + if (throttled_hierarchy(cfs_rq) && task_throttled) 7000 + record_throttle_clock(cfs_rq); 7084 7001 7085 7002 /* Don't dequeue parent if it has other entities besides us */ 7086 7003 if (cfs_rq->load.weight) { ··· 7091 7010 * Bias pick_next to pick a task from this cfs_rq, as 7092 7011 * p is sleeping when it is within its sched_slice. 
7093 7012 */ 7094 - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) 7013 + if (task_sleep && se) 7095 7014 set_next_buddy(se); 7096 7015 break; 7097 7016 } ··· 7118 7037 if (cfs_rq_is_idle(cfs_rq)) 7119 7038 h_nr_idle = h_nr_queued; 7120 7039 7121 - /* end evaluation on encountering a throttled cfs_rq */ 7122 - if (cfs_rq_throttled(cfs_rq)) 7123 - return 0; 7040 + if (throttled_hierarchy(cfs_rq) && task_throttled) 7041 + record_throttle_clock(cfs_rq); 7124 7042 } 7125 7043 7126 7044 sub_nr_running(rq, h_nr_queued); ··· 7153 7073 */ 7154 7074 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 7155 7075 { 7076 + if (task_is_throttled(p)) { 7077 + dequeue_throttled_task(p, flags); 7078 + return true; 7079 + } 7080 + 7156 7081 if (!p->se.sched_delayed) 7157 7082 util_est_dequeue(&rq->cfs, p); 7158 7083 ··· 8745 8660 * lead to a throttle). This both saves work and prevents false 8746 8661 * next-buddy nomination below. 8747 8662 */ 8748 - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) 8663 + if (task_is_throttled(p)) 8749 8664 return; 8750 8665 8751 8666 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { ··· 8826 8741 { 8827 8742 struct sched_entity *se; 8828 8743 struct cfs_rq *cfs_rq; 8744 + struct task_struct *p; 8745 + bool throttled; 8829 8746 8830 8747 again: 8831 8748 cfs_rq = &rq->cfs; 8832 8749 if (!cfs_rq->nr_queued) 8833 8750 return NULL; 8834 8751 8752 + throttled = false; 8753 + 8835 8754 do { 8836 8755 /* Might not have done put_prev_entity() */ 8837 8756 if (cfs_rq->curr && cfs_rq->curr->on_rq) 8838 8757 update_curr(cfs_rq); 8839 8758 8840 - if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8841 - goto again; 8759 + throttled |= check_cfs_rq_runtime(cfs_rq); 8842 8760 8843 8761 se = pick_next_entity(rq, cfs_rq); 8844 8762 if (!se) ··· 8849 8761 cfs_rq = group_cfs_rq(se); 8850 8762 } while (cfs_rq); 8851 8763 8852 - return task_of(se); 8764 + p = task_of(se); 8765 + if (unlikely(throttled)) 8766 
+ task_throttle_setup_work(p); 8767 + return p; 8853 8768 } 8854 8769 8855 8770 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); ··· 9014 8923 { 9015 8924 struct sched_entity *se = &p->se; 9016 8925 9017 - /* throttled hierarchies are not runnable */ 9018 - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) 8926 + /* !se->on_rq also covers throttled task */ 8927 + if (!se->on_rq) 9019 8928 return false; 9020 8929 9021 8930 /* Tell the scheduler that we'd really like se to run next. */ ··· 9374 9283 /* 9375 9284 * We do not migrate tasks that are: 9376 9285 * 1) delayed dequeued unless we migrate load, or 9377 - * 2) throttled_lb_pair, or 9286 + * 2) target cfs_rq is in throttled hierarchy, or 9378 9287 * 3) cannot be migrated to this CPU due to cpus_ptr, or 9379 9288 * 4) running (obviously), or 9380 9289 * 5) are cache-hot on their current CPU, or ··· 9383 9292 if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) 9384 9293 return 0; 9385 9294 9386 - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 9295 + if (lb_throttled_hierarchy(p, env->dst_cpu)) 9387 9296 return 0; 9388 9297 9389 9298 /* ··· 13167 13076 { 13168 13077 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13169 13078 13170 - if (cfs_rq_throttled(cfs_rq)) 13171 - return; 13172 - 13173 - if (!throttled_hierarchy(cfs_rq)) 13079 + /* 13080 + * If a task gets attached to this cfs_rq and before being queued, 13081 + * it gets migrated to another CPU due to reasons like affinity 13082 + * change, make sure this cfs_rq stays on leaf cfs_rq list to have 13083 + * that removed load decayed or it can cause faireness problem. 
13084 + */ 13085 + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) 13174 13086 list_add_leaf_cfs_rq(cfs_rq); 13175 13087 13176 13088 /* Start to propagate at parent */ ··· 13184 13090 13185 13091 update_load_avg(cfs_rq, se, UPDATE_TG); 13186 13092 13187 - if (cfs_rq_throttled(cfs_rq)) 13188 - break; 13189 - 13190 - if (!throttled_hierarchy(cfs_rq)) 13093 + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) 13191 13094 list_add_leaf_cfs_rq(cfs_rq); 13192 13095 } 13193 13096 }
+2 -2
kernel/sched/pelt.h
··· 162 162 { 163 163 u64 throttled; 164 164 165 - if (unlikely(cfs_rq->throttle_count)) 165 + if (unlikely(cfs_rq->pelt_clock_throttled)) 166 166 throttled = U64_MAX; 167 167 else 168 168 throttled = cfs_rq->throttled_clock_pelt_time; ··· 173 173 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ 174 174 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) 175 175 { 176 - if (unlikely(cfs_rq->throttle_count)) 176 + if (unlikely(cfs_rq->pelt_clock_throttled)) 177 177 return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; 178 178 179 179 return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
+12
kernel/sched/rq-offsets.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define COMPILE_OFFSETS 3 + #include <linux/kbuild.h> 4 + #include <linux/types.h> 5 + #include "sched.h" 6 + 7 + int main(void) 8 + { 9 + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); 10 + 11 + return 0; 12 + }
+6 -1
kernel/sched/sched.h
··· 760 760 u64 throttled_clock_pelt_time; 761 761 u64 throttled_clock_self; 762 762 u64 throttled_clock_self_time; 763 - int throttled; 763 + bool throttled:1; 764 + bool pelt_clock_throttled:1; 764 765 int throttle_count; 765 766 struct list_head throttled_list; 766 767 struct list_head throttled_csd_list; 768 + struct list_head throttled_limbo_list; 767 769 #endif /* CONFIG_CFS_BANDWIDTH */ 768 770 #endif /* CONFIG_FAIR_GROUP_SCHED */ 769 771 }; ··· 2369 2367 #define DEQUEUE_SPECIAL 0x10 2370 2368 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ 2371 2369 #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ 2370 + #define DEQUEUE_THROTTLE 0x800 2372 2371 2373 2372 #define ENQUEUE_WAKEUP 0x01 2374 2373 #define ENQUEUE_RESTORE 0x02 ··· 2685 2682 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 2686 2683 2687 2684 extern void init_dl_entity(struct sched_dl_entity *dl_se); 2685 + 2686 + extern void init_cfs_throttle_work(struct task_struct *p); 2688 2687 2689 2688 #define BW_SHIFT 20 2690 2689 #define BW_UNIT (1 << BW_SHIFT)
+56 -19
kernel/sched/topology.c
··· 1591 1591 enum numa_topology_type sched_numa_topology_type; 1592 1592 1593 1593 static int sched_domains_numa_levels; 1594 - static int sched_domains_curr_level; 1595 1594 1596 1595 int sched_max_numa_distance; 1597 1596 static int *sched_domains_numa_distance; ··· 1631 1632 int sd_id, sd_weight, sd_flags = 0; 1632 1633 struct cpumask *sd_span; 1633 1634 1634 - #ifdef CONFIG_NUMA 1635 - /* 1636 - * Ugly hack to pass state to sd_numa_mask()... 1637 - */ 1638 - sched_domains_curr_level = tl->numa_level; 1639 - #endif 1640 - 1641 - sd_weight = cpumask_weight(tl->mask(cpu)); 1635 + sd_weight = cpumask_weight(tl->mask(tl, cpu)); 1642 1636 1643 1637 if (tl->sd_flags) 1644 1638 sd_flags = (*tl->sd_flags)(); ··· 1669 1677 }; 1670 1678 1671 1679 sd_span = sched_domain_span(sd); 1672 - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); 1680 + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); 1673 1681 sd_id = cpumask_first(sd_span); 1674 1682 1675 1683 sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); ··· 1724 1732 return sd; 1725 1733 } 1726 1734 1735 + #ifdef CONFIG_SCHED_SMT 1736 + int cpu_smt_flags(void) 1737 + { 1738 + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; 1739 + } 1740 + 1741 + const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) 1742 + { 1743 + return cpu_smt_mask(cpu); 1744 + } 1745 + #endif 1746 + 1747 + #ifdef CONFIG_SCHED_CLUSTER 1748 + int cpu_cluster_flags(void) 1749 + { 1750 + return SD_CLUSTER | SD_SHARE_LLC; 1751 + } 1752 + 1753 + const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) 1754 + { 1755 + return cpu_clustergroup_mask(cpu); 1756 + } 1757 + #endif 1758 + 1759 + #ifdef CONFIG_SCHED_MC 1760 + int cpu_core_flags(void) 1761 + { 1762 + return SD_SHARE_LLC; 1763 + } 1764 + 1765 + const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) 1766 + { 1767 + return cpu_coregroup_mask(cpu); 1768 + } 1769 + #endif 1770 + 1771 + const struct cpumask *tl_pkg_mask(struct 
sched_domain_topology_level *tl, int cpu) 1772 + { 1773 + return cpu_node_mask(cpu); 1774 + } 1775 + 1727 1776 /* 1728 1777 * Topology list, bottom-up. 1729 1778 */ 1730 1779 static struct sched_domain_topology_level default_topology[] = { 1731 1780 #ifdef CONFIG_SCHED_SMT 1732 - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), 1781 + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 1733 1782 #endif 1734 1783 1735 1784 #ifdef CONFIG_SCHED_CLUSTER 1736 - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), 1785 + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), 1737 1786 #endif 1738 1787 1739 1788 #ifdef CONFIG_SCHED_MC 1740 - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), 1789 + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), 1741 1790 #endif 1742 - SDTL_INIT(cpu_cpu_mask, NULL, PKG), 1791 + SDTL_INIT(tl_pkg_mask, NULL, PKG), 1743 1792 { NULL, }, 1744 1793 }; 1745 1794 ··· 1801 1768 } 1802 1769 1803 1770 #ifdef CONFIG_NUMA 1804 - 1805 - static const struct cpumask *sd_numa_mask(int cpu) 1771 + static int cpu_numa_flags(void) 1806 1772 { 1807 - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 1773 + return SD_NUMA; 1774 + } 1775 + 1776 + static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) 1777 + { 1778 + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; 1808 1779 } 1809 1780 1810 1781 static void sched_numa_warn(const char *str) ··· 2450 2413 * breaks the linking done for an earlier span. 
2451 2414 */ 2452 2415 for_each_cpu(cpu, cpu_map) { 2453 - const struct cpumask *tl_cpu_mask = tl->mask(cpu); 2416 + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); 2454 2417 int id; 2455 2418 2456 2419 /* lowest bit set in this mask is used as a unique id */ ··· 2458 2421 2459 2422 if (cpumask_test_cpu(id, id_seen)) { 2460 2423 /* First CPU has already been seen, ensure identical spans */ 2461 - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) 2424 + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) 2462 2425 return false; 2463 2426 } else { 2464 2427 /* First CPU hasn't been seen before, ensure it's a completely new span */