Merge tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

- Avoid touching ~100 config files in order to be able to select the
preemption model

- Also clear cluster CPU masks on the CPU unplug path

- Prevent a use-after-free in CFS task-group teardown

- Prevent a race condition when updating CPU cache domains (illustrated by
the sketch after this list)

- Factor out the common part of smp_prepare_cpus() into a helper which can
be called by both bare metal and Xen, in order to fix booting of Xen PV
guests
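
As a side note on the cpus_share_cache() fix: the sketch below is a minimal
userspace model (plain C, not kernel code; NR_CPUS and the sd_llc_id array
are stand-ins for the kernel's per-CPU data) of why answering the
self-comparison before consulting the LLC ids means a self-query no longer
depends on values that a concurrent, hotplug-driven domain rebuild may be
rewriting.

/*
 * Userspace sketch of the guard added to cpus_share_cache(): a CPU always
 * shares cache with itself, so that case is decided before reading the
 * per-CPU LLC ids, which a domain rebuild may be updating concurrently.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Stand-in for the kernel's per_cpu(sd_llc_id, cpu). */
static int sd_llc_id[NR_CPUS] = { 0, 0, 2, 2 };

static bool cpus_share_cache(int this_cpu, int that_cpu)
{
    /* Self-comparison is always true, independent of domain state. */
    if (this_cpu == that_cpu)
        return true;

    return sd_llc_id[this_cpu] == sd_llc_id[that_cpu];
}

int main(void)
{
    /* Simulate a rebuild that has transiently reset CPU 1's LLC id. */
    sd_llc_id[1] = -1;

    printf("cpu1 vs cpu1: %d\n", cpus_share_cache(1, 1)); /* 1: guarded */
    printf("cpu0 vs cpu1: %d\n", cpus_share_cache(0, 1)); /* 0 mid-rebuild */
    return 0;
}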

* tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
preempt: Restore preemption model selection configs
arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology()
sched/fair: Prevent dead task groups from regaining cfs_rq's
sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain()
x86/smp: Factor out parts of native_smp_prepare_cpus()

13 files changed, 96 insertions(+), 59 deletions(-)

arch/x86/include/asm/smp.h (+1)
@@ -126,6 +126,7 @@
 
 void cpu_disable_common(void);
 void native_smp_prepare_boot_cpu(void);
+void smp_prepare_cpus_common(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void calculate_max_logical_packages(void);
 void native_smp_cpus_done(unsigned int max_cpus);

arch/x86/kernel/smpboot.c (+12 -6)
@@ -1350,12 +1350,7 @@
 	cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
 }
 
-/*
- * Prepare for SMP bootup.
- * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
- * for common interface support.
- */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus_common(void)
 {
 	unsigned int i;
 
@@ -1381,6 +1386,17 @@
 	set_sched_topology(x86_topology);
 
 	set_cpu_sibling_map(0);
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+	smp_prepare_cpus_common();
+
 	init_freq_invariance(false, false);
 	smp_sanity_check();
 

arch/x86/xen/smp_pv.c (+2 -10)
@@ -225,7 +225,6 @@
 static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
 {
 	unsigned cpu;
-	unsigned int i;
 
 	if (skip_ioapic_setup) {
 		char *m = (max_cpus == 0) ?
@@ -237,16 +238,9 @@
 	}
 	xen_init_lock_cpu(0);
 
-	smp_store_boot_cpu_info();
-	cpu_data(0).x86_max_cores = 1;
+	smp_prepare_cpus_common();
 
-	for_each_possible_cpu(i) {
-		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
-	}
-	set_cpu_sibling_map(0);
+	cpu_data(0).x86_max_cores = 1;
 
 	speculative_store_bypass_ht_init();
 

drivers/base/arch_topology.c (+2)
@@ -677,6 +677,8 @@
 		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+	for_each_cpu(sibling, topology_cluster_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
 	for_each_cpu(sibling, topology_llc_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
 
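
For readers unfamiliar with the topology masks, the following is a small
userspace model of the symmetric clearing that remove_cpu_topology() now also
applies to cluster siblings. It uses glibc's cpu_set_t instead of the kernel's
cpumask API, and the cluster_mask array and unplug_cpu() helper are
illustrative only.

/*
 * Model of removing an unplugged CPU from its cluster siblings' masks,
 * mirroring the cpumask_clear_cpu() loop added above. cpu_set_t stands in
 * for the kernel's cpumask; nothing below is kernel API.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#define NCPU 4

static cpu_set_t cluster_mask[NCPU];  /* cluster_mask[i]: CPUs in i's cluster */

static void unplug_cpu(int cpu)
{
    /* Clear @cpu from every sibling listed in its own cluster mask ... */
    for (int sibling = 0; sibling < NCPU; sibling++)
        if (CPU_ISSET(sibling, &cluster_mask[cpu]))
            CPU_CLR(cpu, &cluster_mask[sibling]);
    /* ... then empty the departing CPU's own mask. */
    CPU_ZERO(&cluster_mask[cpu]);
}

int main(void)
{
    /* CPUs 0-3 all start out in one cluster. */
    for (int i = 0; i < NCPU; i++)
        for (int j = 0; j < NCPU; j++)
            CPU_SET(j, &cluster_mask[i]);

    unplug_cpu(2);

    for (int i = 0; i < NCPU; i++)
        printf("cpu%d cluster still contains cpu2: %d\n",
               i, CPU_ISSET(2, &cluster_mask[i]) ? 1 : 0);
    return 0;
}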

include/linux/kernel.h (+1 -1)
@@ -85,7 +85,7 @@
 struct completion;
 struct user;
 
-#ifdef CONFIG_PREEMPT_VOLUNTARY
+#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD
 
 extern int __cond_resched(void);
 # define might_resched() __cond_resched()

include/linux/vermagic.h (+1 -1)
@@ -15,7 +15,7 @@
 #else
 #define MODULE_VERMAGIC_SMP ""
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_BUILD
 #define MODULE_VERMAGIC_PREEMPT "preempt "
 #elif defined(CONFIG_PREEMPT_RT)
 #define MODULE_VERMAGIC_PREEMPT "preempt_rt "

init/Makefile (+1 -1)
@@ -30,7 +30,7 @@
 quiet_cmd_compile.h = CHK $@
 cmd_compile.h = \
 	$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
-	"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \
+	"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT_BUILD)" \
 	"$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)"
 
 include/generated/compile.h: FORCE

kernel/Kconfig.preempt (+21 -21)
@@ -1,12 +1,23 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+config PREEMPT_NONE_BUILD
+	bool
+
+config PREEMPT_VOLUNTARY_BUILD
+	bool
+
+config PREEMPT_BUILD
+	bool
+	select PREEMPTION
+	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
 choice
 	prompt "Preemption Model"
-	default PREEMPT_NONE_BEHAVIOUR
+	default PREEMPT_NONE
 
-config PREEMPT_NONE_BEHAVIOUR
+config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
-	select PREEMPT_NONE if !PREEMPT_DYNAMIC
+	select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This is the traditional Linux preemption model, geared towards
 	  throughput. It will still provide good latencies most of the
@@ -18,10 +29,10 @@
 	  raw processing power of the kernel, irrespective of scheduling
 	  latencies.
 
-config PREEMPT_VOLUNTARY_BEHAVIOUR
+config PREEMPT_VOLUNTARY
 	bool "Voluntary Kernel Preemption (Desktop)"
 	depends on !ARCH_NO_PREEMPT
-	select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC
+	select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
 	help
 	  This option reduces the latency of the kernel by adding more
 	  "explicit preemption points" to the kernel code. These new
@@ -37,10 +48,10 @@
 
 	  Select this if you are building a kernel for a desktop system.
 
-config PREEMPT_BEHAVIOUR
+config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	depends on !ARCH_NO_PREEMPT
-	select PREEMPT
+	select PREEMPT_BUILD
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -58,7 +69,7 @@
 
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
-	depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC
+	depends on EXPERT && ARCH_SUPPORTS_RT
 	select PREEMPTION
 	help
 	  This option turns the kernel into a real-time kernel by replacing
@@ -75,16 +86,5 @@
 
 endchoice
 
-config PREEMPT_NONE
-	bool
-
-config PREEMPT_VOLUNTARY
-	bool
-
-config PREEMPT
-	bool
-	select PREEMPTION
-	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
-
 config PREEMPT_COUNT
 	bool
@@ -95,8 +95,8 @@
 
 config PREEMPT_DYNAMIC
 	bool "Preemption behaviour defined on boot"
-	depends on HAVE_PREEMPT_DYNAMIC
-	select PREEMPT
+	depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+	select PREEMPT_BUILD
 	default y
 	help
 	  This option allows to define the preemption model on the kernel

kernel/sched/autogroup.c (+1 -1)
@@ -31,7 +31,7 @@
 	ag->tg->rt_se = NULL;
 	ag->tg->rt_rq = NULL;
 #endif
-	sched_offline_group(ag->tg);
+	sched_release_group(ag->tg);
 	sched_destroy_group(ag->tg);
 }
 

kernel/sched/core.c (+41 -12)
@@ -3726,6 +3726,9 @@
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
+	if (this_cpu == that_cpu)
+		return true;
+
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
@@ -6625,13 +6628,13 @@
 static void __init preempt_dynamic_init(void)
 {
 	if (preempt_dynamic_mode == preempt_dynamic_undefined) {
-		if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) {
+		if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
 			sched_dynamic_update(preempt_dynamic_none);
-		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) {
+		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
 			sched_dynamic_update(preempt_dynamic_voluntary);
 		} else {
 			/* Default static call setting, nothing to do */
-			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR));
+			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
 			preempt_dynamic_mode = preempt_dynamic_full;
 			pr_info("Dynamic Preempt: full\n");
 		}
@@ -9716,6 +9719,22 @@
 	kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+	sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+	unregister_fair_sched_group(tg);
+	unregister_rt_sched_group(tg);
+	/*
+	 * We have to wait for yet another RCU grace period to expire, as
+	 * print_cfs_stats() might run concurrently.
+	 */
+	call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -9759,25 +9778,35 @@
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
 {
 	/* Now it should be safe to free those cfs_rqs: */
-	sched_free_group(container_of(rhp, struct task_group, rcu));
+	sched_unregister_group(container_of(rhp, struct task_group, rcu));
 }
 
 void sched_destroy_group(struct task_group *tg)
 {
 	/* Wait for possible concurrent references to cfs_rqs complete: */
-	call_rcu(&tg->rcu, sched_free_group_rcu);
+	call_rcu(&tg->rcu, sched_unregister_group_rcu);
 }
 
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
 {
 	unsigned long flags;
 
-	/* End participation in shares distribution: */
-	unregister_fair_sched_group(tg);
-
+	/*
+	 * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+	 * sched_cfs_period_timer()).
+	 *
+	 * For this to be effective, we have to wait for all pending users of
+	 * this task group to leave their RCU critical section to ensure no new
+	 * user will see our dying task group any more. Specifically ensure
+	 * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+	 *
+	 * We therefore defer calling unregister_fair_sched_group() to
+	 * sched_unregister_group() which is guarantied to get called only after the
+	 * current RCU grace period has expired.
+	 */
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
@@ -9896,7 +9925,7 @@
 {
 	struct task_group *tg = css_tg(css);
 
-	sched_offline_group(tg);
+	sched_release_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9906,7 +9935,7 @@
 	/*
 	 * Relies on the RCU grace period between css_released() and this.
 	 */
-	sched_free_group(tg);
+	sched_unregister_group(tg);
 }
 
 /*

kernel/sched/fair.c (+2 -2)
@@ -11456,8 +11456,6 @@
 {
 	int i;
 
-	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -11531,6 +11533,8 @@
 	unsigned long flags;
 	struct rq *rq;
 	int cpu;
+
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(cpu) {
 		if (tg->se[cpu])

kernel/sched/rt.c (+9 -3)
@@ -137,12 +137,16 @@
 	return rt_rq->rq;
 }
 
+void unregister_rt_sched_group(struct task_group *tg)
+{
+	if (tg->rt_se)
+		destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+}
+
 void free_rt_sched_group(struct task_group *tg)
 {
 	int i;
-
-	if (tg->rt_se)
-		destroy_rt_bandwidth(&tg->rt_bandwidth);
 
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
@@ -249,6 +253,8 @@
 
 	return &rq->rt;
 }
+
+void unregister_rt_sched_group(struct task_group *tg) { }
 
 void free_rt_sched_group(struct task_group *tg) { }
 

kernel/sched/sched.h (+2 -1)
@@ -488,6 +488,7 @@
 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
+extern void unregister_rt_sched_group(struct task_group *tg);
 extern void free_rt_sched_group(struct task_group *tg);
 extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
@@ -503,6 +504,6 @@
 extern void sched_online_group(struct task_group *tg,
 			       struct task_group *parent);
 extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 