Merge tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

- Avoid touching ~100 config files in order to be able to select the
preemption model

- Clear cluster CPU masks too, on the CPU unplug path

- Prevent a use-after-free in CFS

- Prevent a race condition when updating CPU cache domains

- Factor out the shared part of smp_prepare_cpus() into a common helper
which can be called by both baremetal and Xen, in order to fix booting
of Xen PV guests

* tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
preempt: Restore preemption model selection configs
arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology()
sched/fair: Prevent dead task groups from regaining cfs_rq's
sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain()
x86/smp: Factor out parts of native_smp_prepare_cpus()

13 files changed, 96 insertions(+), 59 deletions(-)

arch/x86/include/asm/smp.h | +1

 void cpu_disable_common(void);
 void native_smp_prepare_boot_cpu(void);
+void smp_prepare_cpus_common(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void calculate_max_logical_packages(void);
 void native_smp_cpus_done(unsigned int max_cpus);

arch/x86/kernel/smpboot.c | +12 -6

         cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
 }
 
-/*
- * Prepare for SMP bootup.
- * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
- * for common interface support.
- */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus_common(void)
 {
         unsigned int i;
 
···
         set_sched_topology(x86_topology);
 
         set_cpu_sibling_map(0);
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+        smp_prepare_cpus_common();
+
         init_freq_invariance(false, false);
         smp_sanity_check();

arch/x86/xen/smp_pv.c | +2 -10

 static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
 {
         unsigned cpu;
-        unsigned int i;
 
         if (skip_ioapic_setup) {
                 char *m = (max_cpus == 0) ?
···
         }
         xen_init_lock_cpu(0);
 
-        smp_store_boot_cpu_info();
-        cpu_data(0).x86_max_cores = 1;
+        smp_prepare_cpus_common();
 
-        for_each_possible_cpu(i) {
-                zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-                zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-                zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
-                zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
-        }
-        set_cpu_sibling_map(0);
+        cpu_data(0).x86_max_cores = 1;
 
         speculative_store_bypass_ht_init();

drivers/base/arch_topology.c | +2

                 cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
         for_each_cpu(sibling, topology_sibling_cpumask(cpu))
                 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+        for_each_cpu(sibling, topology_cluster_cpumask(cpu))
+                cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
         for_each_cpu(sibling, topology_llc_cpumask(cpu))
                 cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));

include/linux/kernel.h | +1 -1

 struct completion;
 struct user;
 
-#ifdef CONFIG_PREEMPT_VOLUNTARY
+#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD
 
 extern int __cond_resched(void);
 # define might_resched() __cond_resched()
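
For context on why this particular #ifdef matters: might_sleep() expands to a debug check plus might_resched(), so only a voluntary-preemption build turns might_sleep() annotations into real rescheduling points. A simplified paraphrase of the surrounding header logic (the CONFIG_PREEMPT_DYNAMIC static-call variant that also lives in this header is omitted):

/* Simplified paraphrase of include/linux/kernel.h, not the literal hunk:
 * the CONFIG_PREEMPT_DYNAMIC static_call/static_key branches are left out. */
#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD
extern int __cond_resched(void);
# define might_resched() __cond_resched()        /* a voluntary preemption point */
#else
# define might_resched() do { } while (0)        /* annotation only, no resched */
#endif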

include/linux/vermagic.h | +1 -1

 #else
 #define MODULE_VERMAGIC_SMP ""
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_BUILD
 #define MODULE_VERMAGIC_PREEMPT "preempt "
 #elif defined(CONFIG_PREEMPT_RT)
 #define MODULE_VERMAGIC_PREEMPT "preempt_rt "

init/Makefile | +1 -1

 quiet_cmd_compile.h = CHK     $@
       cmd_compile.h = \
         $(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@       \
-        "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)"    \
+        "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT_BUILD)"      \
         "$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)"
 
 include/generated/compile.h: FORCE

kernel/Kconfig.preempt | +21 -21

 # SPDX-License-Identifier: GPL-2.0-only
 
+config PREEMPT_NONE_BUILD
+        bool
+
+config PREEMPT_VOLUNTARY_BUILD
+        bool
+
+config PREEMPT_BUILD
+        bool
+        select PREEMPTION
+        select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
 choice
         prompt "Preemption Model"
-        default PREEMPT_NONE_BEHAVIOUR
+        default PREEMPT_NONE
 
-config PREEMPT_NONE_BEHAVIOUR
+config PREEMPT_NONE
         bool "No Forced Preemption (Server)"
-        select PREEMPT_NONE if !PREEMPT_DYNAMIC
+        select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
         help
           This is the traditional Linux preemption model, geared towards
           throughput. It will still provide good latencies most of the
···
           raw processing power of the kernel, irrespective of scheduling
           latencies.
 
-config PREEMPT_VOLUNTARY_BEHAVIOUR
+config PREEMPT_VOLUNTARY
         bool "Voluntary Kernel Preemption (Desktop)"
         depends on !ARCH_NO_PREEMPT
-        select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC
+        select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
         help
           This option reduces the latency of the kernel by adding more
           "explicit preemption points" to the kernel code. These new
···
 
           Select this if you are building a kernel for a desktop system.
 
-config PREEMPT_BEHAVIOUR
+config PREEMPT
         bool "Preemptible Kernel (Low-Latency Desktop)"
         depends on !ARCH_NO_PREEMPT
-        select PREEMPT
+        select PREEMPT_BUILD
         help
           This option reduces the latency of the kernel by making
           all kernel code (that is not executing in a critical section)
···
 
 config PREEMPT_RT
         bool "Fully Preemptible Kernel (Real-Time)"
-        depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC
+        depends on EXPERT && ARCH_SUPPORTS_RT
         select PREEMPTION
         help
           This option turns the kernel into a real-time kernel by replacing
···
 
 endchoice
 
-config PREEMPT_NONE
-        bool
-
-config PREEMPT_VOLUNTARY
-        bool
-
-config PREEMPT
-        bool
-        select PREEMPTION
-        select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
-
 config PREEMPT_COUNT
         bool
···
 
 config PREEMPT_DYNAMIC
         bool "Preemption behaviour defined on boot"
-        depends on HAVE_PREEMPT_DYNAMIC
-        select PREEMPT
+        depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+        select PREEMPT_BUILD
         default y
         help
           This option allows to define the preemption model on the kernel
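
The reshuffle above (together with the kernel.h, vermagic.h and init/Makefile hunks) is what avoids touching the ~100 config files mentioned in the summary: the user-visible choice symbols keep their historical CONFIG_PREEMPT* names, so existing Kconfig selects and defconfigs keep working, while code that cares about what was actually compiled in now tests the *_BUILD symbols. A small hypothetical helper, not part of this series, showing the intended split:

#include <linux/kconfig.h>

/*
 * Hypothetical helper (illustration only): the *_BUILD symbols describe the
 * behaviour that was built in, while CONFIG_PREEMPT{_NONE,_VOLUNTARY,} keep
 * naming the model the user selected, which PREEMPT_DYNAMIC may override at
 * boot (see the preempt_dynamic_init() hunk in kernel/sched/core.c below).
 */
static inline const char *preempt_build_flavour(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                return "preempt_rt";
        if (IS_ENABLED(CONFIG_PREEMPT_BUILD))
                return "preempt";        /* full preemption compiled in */
        if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
                return "voluntary";
        return "none";
}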

kernel/sched/autogroup.c | +1 -1

         ag->tg->rt_se = NULL;
         ag->tg->rt_rq = NULL;
 #endif
-        sched_offline_group(ag->tg);
+        sched_release_group(ag->tg);
         sched_destroy_group(ag->tg);
 }

kernel/sched/core.c | +41 -12

 
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
+        if (this_cpu == that_cpu)
+                return true;
+
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
···
 static void __init preempt_dynamic_init(void)
 {
         if (preempt_dynamic_mode == preempt_dynamic_undefined) {
-                if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) {
+                if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
                         sched_dynamic_update(preempt_dynamic_none);
-                } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) {
+                } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
                         sched_dynamic_update(preempt_dynamic_voluntary);
                 } else {
                         /* Default static call setting, nothing to do */
-                        WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR));
+                        WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
                         preempt_dynamic_mode = preempt_dynamic_full;
                         pr_info("Dynamic Preempt: full\n");
                 }
···
         kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+        sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+        unregister_fair_sched_group(tg);
+        unregister_rt_sched_group(tg);
+        /*
+         * We have to wait for yet another RCU grace period to expire, as
+         * print_cfs_stats() might run concurrently.
+         */
+        call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
···
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
 {
         /* Now it should be safe to free those cfs_rqs: */
-        sched_free_group(container_of(rhp, struct task_group, rcu));
+        sched_unregister_group(container_of(rhp, struct task_group, rcu));
 }
 
 void sched_destroy_group(struct task_group *tg)
 {
         /* Wait for possible concurrent references to cfs_rqs complete: */
-        call_rcu(&tg->rcu, sched_free_group_rcu);
+        call_rcu(&tg->rcu, sched_unregister_group_rcu);
 }
 
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
 {
         unsigned long flags;
 
-        /* End participation in shares distribution: */
-        unregister_fair_sched_group(tg);
-
+        /*
+         * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+         * sched_cfs_period_timer()).
+         *
+         * For this to be effective, we have to wait for all pending users of
+         * this task group to leave their RCU critical section to ensure no new
+         * user will see our dying task group any more. Specifically ensure
+         * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+         *
+         * We therefore defer calling unregister_fair_sched_group() to
+         * sched_unregister_group() which is guarantied to get called only after the
+         * current RCU grace period has expired.
+         */
         spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
···
 {
         struct task_group *tg = css_tg(css);
 
-        sched_offline_group(tg);
+        sched_release_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
···
         /*
          * Relies on the RCU grace period between css_released() and this.
          */
-        sched_free_group(tg);
+        sched_unregister_group(tg);
 }
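
On the cpus_share_cache() hunk above: update_top_cache_domain() rewrites per_cpu(sd_llc_id, cpu) while other CPUs may be reading it, so during a domain rebuild a self-comparison could transiently come back false. The early return makes the degenerate "same CPU" case unconditionally true. A hypothetical caller pattern (illustration only, not from this series) of the kind that depends on that guarantee:

/*
 * Illustration only: a wakeup-path style decision of whether a remote CPU
 * needs to be poked. With the early return in cpus_share_cache(), asking
 * whether a CPU shares a cache with itself can no longer race to "false"
 * while update_top_cache_domain() is rewriting sd_llc_id on another CPU.
 */
static bool wakeup_needs_remote_queue(int target_cpu)
{
        return !cpus_share_cache(smp_processor_id(), target_cpu);
}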

kernel/sched/fair.c | +2 -2

 {
         int i;
 
-        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
···
         unsigned long flags;
         struct rq *rq;
         int cpu;
 
+        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
         for_each_possible_cpu(cpu) {
                 if (tg->se[cpu])

kernel/sched/rt.c | +9 -3

         return rt_rq->rq;
 }
 
+void unregister_rt_sched_group(struct task_group *tg)
+{
+        if (tg->rt_se)
+                destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+}
+
 void free_rt_sched_group(struct task_group *tg)
 {
         int i;
 
-        if (tg->rt_se)
-                destroy_rt_bandwidth(&tg->rt_bandwidth);
-
         for_each_possible_cpu(i) {
                 if (tg->rt_rq)
···
 
         return &rq->rt;
 }
+
+void unregister_rt_sched_group(struct task_group *tg) { }
 
 void free_rt_sched_group(struct task_group *tg) { }

kernel/sched/sched.h | +2 -1

 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
+extern void unregister_rt_sched_group(struct task_group *tg);
 extern void free_rt_sched_group(struct task_group *tg);
 extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
···
 extern void sched_online_group(struct task_group *tg,
                                struct task_group *parent);
 extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
 
 extern void sched_move_task(struct task_struct *tsk);
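
Taken together, the autogroup.c, core.c, fair.c, rt.c and sched.h hunks split task-group teardown into two RCU-deferred stages so that concurrent walkers such as tg_unthrottle_up() and print_cfs_stats() can never see a half-torn-down group. A rough sketch of the resulting sequence for the cgroup release path (the autogroup path goes through sched_destroy_group() instead), written as C comments that paraphrase the hunks above rather than literal kernel code:

/*
 * Dying task_group lifecycle after this series (paraphrased sketch):
 *
 *   cpu_cgroup_css_released()
 *     -> sched_release_group(tg)      only unlinks tg from the task_group
 *                                     lists, so walk_tg_tree_from() stops
 *                                     finding it
 *   ... RCU grace period between css_released() and css_free() ...
 *   cpu_cgroup_css_free()
 *     -> sched_unregister_group(tg)   unregister_fair_sched_group() and
 *                                     unregister_rt_sched_group(), then
 *                                     call_rcu(&tg->rcu, sched_free_group_rcu)
 *   ... one more RCU grace period ...
 *   sched_free_group_rcu()
 *     -> sched_free_group(tg)         kfree() of the per-CPU cfs_rq/rt_rq/
 *                                     sched_entity arrays
 */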