Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Thomas Gleixner:

- migrate_disable/enable() support which originates from the RT tree
and is now a prerequisite for the new preemptible kmap_local() API
which aims to replace kmap_atomic().

- A fair amount of topology and NUMA related improvements

- Improvements for the frequency invariant calculations

- Enhanced robustness for the global CPU priority tracking and decision
making

- The usual small fixes and enhancements all over the place

* tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
sched/fair: Trivial correction of the newidle_balance() comment
sched/fair: Clear SMT siblings after determining the core is not idle
sched: Fix kernel-doc markup
x86: Print ratio freq_max/freq_base used in frequency invariance calculations
x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC
x86, sched: Calculate frequency invariance for AMD systems
irq_work: Optimize irq_work_single()
smp: Cleanup smp_call_function*()
irq_work: Cleanup
sched: Limit the amount of NUMA imbalance that can exist at fork time
sched/numa: Allow a floating imbalance between NUMA nodes
sched: Avoid unnecessary calculation of load imbalance at clone time
sched/numa: Rename nr_running and break out the magic number
sched: Make migrate_disable/enable() independent of RT
sched/topology: Condition EAS enablement on FIE support
arm64: Rebuild sched domains on invariance status changes
sched/topology,schedutil: Wrap sched domains rebuild
sched/uclamp: Allow to reset a task uclamp constraint value
sched/core: Fix typos in comments
Documentation: scheduler: fix information on arch SD flags, sched_domain and sched_debug
...

+1929 -642
+11 -15
Documentation/scheduler/sched-domains.rst
··· 65 65 cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example, 66 66 might have just one domain covering its one NUMA level. 67 67 68 - The implementor should read comments in include/linux/sched.h: 69 - struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of 70 - the specifics and what to tune. 68 + The implementor should read comments in include/linux/sched/sd_flags.h: 69 + SD_* to get an idea of the specifics and what to tune for the SD flags 70 + of a sched_domain. 71 71 72 - Architectures may retain the regular override the default SD_*_INIT flags 73 - while using the generic domain builder in kernel/sched/core.c if they wish to 74 - retain the traditional SMT->SMP->NUMA topology (or some subset of that). This 75 - can be done by #define'ing ARCH_HASH_SCHED_TUNE. 76 - 77 - Alternatively, the architecture may completely override the generic domain 78 - builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your 79 - arch_init_sched_domains function. This function will attach domains to all 80 - CPUs using cpu_attach_domain. 72 + Architectures may override the generic domain builder and the default SD flags 73 + for a given topology level by creating a sched_domain_topology_level array and 74 + calling set_sched_topology() with this array as the parameter. 81 75 82 76 The sched-domains debugging infrastructure can be enabled by enabling 83 - CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains 84 - which should catch most possible errors (described above). It also prints out 85 - the domain structure in a visual format. 77 + CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to 78 + tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug 79 + knob. This enables an error checking parse of the sched domains which should 80 + catch most possible errors (described above). It also prints out the domain 81 + structure in a visual format.
+10
arch/arm64/kernel/topology.c
··· 223 223 224 224 static int __init init_amu_fie(void) 225 225 { 226 + bool invariance_status = topology_scale_freq_invariant(); 226 227 cpumask_var_t valid_cpus; 227 228 bool have_policy = false; 228 229 int ret = 0; ··· 269 268 */ 270 269 if (!topology_scale_freq_invariant()) 271 270 static_branch_disable(&amu_fie_key); 271 + 272 + /* 273 + * Task scheduler behavior depends on frequency invariance support, 274 + * either cpufreq or counter driven. If the support status changes as 275 + * a result of counter initialisation and use, retrigger the build of 276 + * scheduling domains to ensure the information is propagated properly. 277 + */ 278 + if (invariance_status != topology_scale_freq_invariant()) 279 + rebuild_sched_domains_energy(); 272 280 273 281 free_valid_mask: 274 282 free_cpumask_var(valid_cpus);
+3 -2
arch/mips/kernel/process.c
··· 702 702 return sp & ALMASK; 703 703 } 704 704 705 - static DEFINE_PER_CPU(call_single_data_t, backtrace_csd); 706 705 static struct cpumask backtrace_csd_busy; 707 706 708 707 static void handle_backtrace(void *info) ··· 709 710 nmi_cpu_backtrace(get_irq_regs()); 710 711 cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy); 711 712 } 713 + 714 + static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) = 715 + CSD_INIT(handle_backtrace, NULL); 712 716 713 717 static void raise_backtrace(cpumask_t *mask) 714 718 { ··· 732 730 } 733 731 734 732 csd = &per_cpu(backtrace_csd, cpu); 735 - csd->func = handle_backtrace; 736 733 smp_call_function_single_async(cpu, csd); 737 734 } 738 735 }
+7 -20
arch/mips/kernel/smp.c
··· 687 687 688 688 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 689 689 690 - static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); 690 + static void tick_broadcast_callee(void *info) 691 + { 692 + tick_receive_broadcast(); 693 + } 694 + 695 + static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) = 696 + CSD_INIT(tick_broadcast_callee, NULL); 691 697 692 698 void tick_broadcast(const struct cpumask *mask) 693 699 { ··· 705 699 smp_call_function_single_async(cpu, csd); 706 700 } 707 701 } 708 - 709 - static void tick_broadcast_callee(void *info) 710 - { 711 - tick_receive_broadcast(); 712 - } 713 - 714 - static int __init tick_broadcast_init(void) 715 - { 716 - call_single_data_t *csd; 717 - int cpu; 718 - 719 - for (cpu = 0; cpu < NR_CPUS; cpu++) { 720 - csd = &per_cpu(tick_broadcast_csd, cpu); 721 - csd->func = tick_broadcast_callee; 722 - } 723 - 724 - return 0; 725 - } 726 - early_initcall(tick_broadcast_init); 727 702 728 703 #endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+1 -3
arch/s390/pci/pci_irq.c
··· 179 179 if (atomic_inc_return(&cpu_data->scheduled) > 1) 180 180 continue; 181 181 182 - cpu_data->csd.func = zpci_handle_remote_irq; 183 - cpu_data->csd.info = &cpu_data->scheduled; 184 - cpu_data->csd.flags = 0; 182 + INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled); 185 183 smp_call_function_single_async(cpu, &cpu_data->csd); 186 184 } 187 185 }
+5
arch/x86/include/asm/topology.h
··· 218 218 } 219 219 #endif 220 220 221 + #ifdef CONFIG_ACPI_CPPC_LIB 222 + void init_freq_invariance_cppc(void); 223 + #define init_freq_invariance_cppc init_freq_invariance_cppc 224 + #endif 225 + 221 226 #endif /* _ASM_X86_TOPOLOGY_H */
+3 -4
arch/x86/kernel/cpuid.c
··· 74 74 75 75 init_completion(&cmd.done); 76 76 for (; count; count -= 16) { 77 - call_single_data_t csd = { 78 - .func = cpuid_smp_cpuid, 79 - .info = &cmd, 80 - }; 77 + call_single_data_t csd; 78 + 79 + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); 81 80 82 81 cmd.regs.eax = pos; 83 82 cmd.regs.ecx = pos >> 32;
+74 -5
arch/x86/kernel/smpboot.c
··· 82 82 #include <asm/hw_irq.h> 83 83 #include <asm/stackprotector.h> 84 84 85 + #ifdef CONFIG_ACPI_CPPC_LIB 86 + #include <acpi/cppc_acpi.h> 87 + #endif 88 + 85 89 /* representing HT siblings of each logical CPU */ 86 90 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); 87 91 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); ··· 152 148 *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; 153 149 } 154 150 155 - static void init_freq_invariance(bool secondary); 151 + static void init_freq_invariance(bool secondary, bool cppc_ready); 156 152 157 153 /* 158 154 * Report back to the Boot Processor during boot time or to the caller processor ··· 190 186 */ 191 187 set_cpu_sibling_map(raw_smp_processor_id()); 192 188 193 - init_freq_invariance(true); 189 + init_freq_invariance(true, false); 194 190 195 191 /* 196 192 * Get our bogomips. ··· 1345 1341 set_sched_topology(x86_topology); 1346 1342 1347 1343 set_cpu_sibling_map(0); 1348 - init_freq_invariance(false); 1344 + init_freq_invariance(false, false); 1349 1345 smp_sanity_check(); 1350 1346 1351 1347 switch (apic_intr_mode) { ··· 2032 2028 return true; 2033 2029 } 2034 2030 2031 + #ifdef CONFIG_ACPI_CPPC_LIB 2032 + static bool amd_set_max_freq_ratio(void) 2033 + { 2034 + struct cppc_perf_caps perf_caps; 2035 + u64 highest_perf, nominal_perf; 2036 + u64 perf_ratio; 2037 + int rc; 2038 + 2039 + rc = cppc_get_perf_caps(0, &perf_caps); 2040 + if (rc) { 2041 + pr_debug("Could not retrieve perf counters (%d)\n", rc); 2042 + return false; 2043 + } 2044 + 2045 + highest_perf = perf_caps.highest_perf; 2046 + nominal_perf = perf_caps.nominal_perf; 2047 + 2048 + if (!highest_perf || !nominal_perf) { 2049 + pr_debug("Could not retrieve highest or nominal performance\n"); 2050 + return false; 2051 + } 2052 + 2053 + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); 2054 + /* midpoint between max_boost and max_P */ 2055 + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; 2056 + if (!perf_ratio) { 
2057 + pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); 2058 + return false; 2059 + } 2060 + 2061 + arch_turbo_freq_ratio = perf_ratio; 2062 + arch_set_max_freq_ratio(false); 2063 + 2064 + return true; 2065 + } 2066 + #else 2067 + static bool amd_set_max_freq_ratio(void) 2068 + { 2069 + return false; 2070 + } 2071 + #endif 2072 + 2035 2073 static void init_counter_refs(void) 2036 2074 { 2037 2075 u64 aperf, mperf; ··· 2085 2039 this_cpu_write(arch_prev_mperf, mperf); 2086 2040 } 2087 2041 2088 - static void init_freq_invariance(bool secondary) 2042 + static void init_freq_invariance(bool secondary, bool cppc_ready) 2089 2043 { 2090 2044 bool ret = false; 2091 2045 ··· 2101 2055 2102 2056 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) 2103 2057 ret = intel_set_max_freq_ratio(); 2058 + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 2059 + if (!cppc_ready) { 2060 + return; 2061 + } 2062 + ret = amd_set_max_freq_ratio(); 2063 + } 2104 2064 2105 2065 if (ret) { 2106 2066 init_counter_refs(); 2107 2067 static_branch_enable(&arch_scale_freq_key); 2068 + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); 2108 2069 } else { 2109 2070 pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); 2110 2071 } 2111 2072 } 2073 + 2074 + #ifdef CONFIG_ACPI_CPPC_LIB 2075 + static DEFINE_MUTEX(freq_invariance_lock); 2076 + 2077 + void init_freq_invariance_cppc(void) 2078 + { 2079 + static bool secondary; 2080 + 2081 + mutex_lock(&freq_invariance_lock); 2082 + 2083 + init_freq_invariance(secondary, true); 2084 + secondary = true; 2085 + 2086 + mutex_unlock(&freq_invariance_lock); 2087 + } 2088 + #endif 2112 2089 2113 2090 static void disable_freq_invariance_workfn(struct work_struct *work) 2114 2091 { ··· 2182 2113 schedule_work(&disable_freq_invariance_work); 2183 2114 } 2184 2115 #else 2185 - static inline void init_freq_invariance(bool secondary) 2116 + 
static inline void init_freq_invariance(bool secondary, bool cppc_ready) 2186 2117 { 2187 2118 } 2188 2119 #endif /* CONFIG_X86_64 */
+3 -4
arch/x86/lib/msr-smp.c
··· 169 169 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 170 170 { 171 171 struct msr_info_completion rv; 172 - call_single_data_t csd = { 173 - .func = __rdmsr_safe_on_cpu, 174 - .info = &rv, 175 - }; 172 + call_single_data_t csd; 176 173 int err; 174 + 175 + INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); 177 176 178 177 memset(&rv, 0, sizeof(rv)); 179 178 init_completion(&rv.done);
+1 -3
block/blk-mq.c
··· 671 671 return false; 672 672 673 673 if (blk_mq_complete_need_ipi(rq)) { 674 - rq->csd.func = __blk_mq_complete_request_remote; 675 - rq->csd.info = rq; 676 - rq->csd.flags = 0; 674 + INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); 677 675 smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); 678 676 } else { 679 677 if (rq->q->nr_hw_queues > 1)
+7
drivers/acpi/cppc_acpi.c
··· 39 39 #include <linux/ktime.h> 40 40 #include <linux/rwsem.h> 41 41 #include <linux/wait.h> 42 + #include <linux/topology.h> 42 43 43 44 #include <acpi/cppc_acpi.h> 44 45 ··· 689 688 * } 690 689 */ 691 690 691 + #ifndef init_freq_invariance_cppc 692 + static inline void init_freq_invariance_cppc(void) { } 693 + #endif 694 + 692 695 /** 693 696 * acpi_cppc_processor_probe - Search for per CPU _CPC objects. 694 697 * @pr: Ptr to acpi_processor containing this CPU's logical ID. ··· 854 849 kobject_put(&cpc_ptr->kobj); 855 850 goto out_free; 856 851 } 852 + 853 + init_freq_invariance_cppc(); 857 854 858 855 kfree(output.pointer); 859 856 return 0;
+1 -2
drivers/cpuidle/coupled.c
··· 674 674 coupled->refcnt++; 675 675 676 676 csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); 677 - csd->func = cpuidle_coupled_handle_poke; 678 - csd->info = (void *)(unsigned long)dev->cpu; 677 + INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu); 679 678 680 679 return 0; 681 680 }
+2 -2
drivers/gpu/drm/i915/i915_request.c
··· 197 197 198 198 llist_for_each_entry_safe(cb, cn, 199 199 llist_del_all(&rq->execute_cb), 200 - work.llnode) 200 + work.node.llist) 201 201 fn(&cb->work); 202 202 } 203 203 ··· 460 460 * callback first, then checking the ACTIVE bit, we serialise with 461 461 * the completed/retired request. 462 462 */ 463 - if (llist_add(&cb->work.llnode, &signal->execute_cb)) { 463 + if (llist_add(&cb->work.node.llist, &signal->execute_cb)) { 464 464 if (i915_request_is_active(signal) || 465 465 __request_in_flight(signal)) 466 466 __notify_execute_cb_imm(signal);
+2 -7
drivers/net/ethernet/cavium/liquidio/lio_core.c
··· 729 729 droq->cpu_id == this_cpu) { 730 730 napi_schedule_irqoff(&droq->napi); 731 731 } else { 732 - call_single_data_t *csd = &droq->csd; 733 - 734 - csd->func = napi_schedule_wrapper; 735 - csd->info = &droq->napi; 736 - csd->flags = 0; 737 - 738 - smp_call_function_single_async(droq->cpu_id, csd); 732 + INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi); 733 + smp_call_function_single_async(droq->cpu_id, &droq->csd); 739 734 } 740 735 } 741 736
+2 -2
fs/proc/array.c
··· 383 383 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 384 384 { 385 385 seq_printf(m, "Cpus_allowed:\t%*pb\n", 386 - cpumask_pr_args(task->cpus_ptr)); 386 + cpumask_pr_args(&task->cpus_mask)); 387 387 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", 388 - cpumask_pr_args(task->cpus_ptr)); 388 + cpumask_pr_args(&task->cpus_mask)); 389 389 } 390 390 391 391 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+1
include/linux/cpuhotplug.h
··· 152 152 CPUHP_AP_ONLINE, 153 153 CPUHP_TEARDOWN_CPU, 154 154 CPUHP_AP_ONLINE_IDLE, 155 + CPUHP_AP_SCHED_WAIT_EMPTY, 155 156 CPUHP_AP_SMPBOOT_THREADS, 156 157 CPUHP_AP_X86_VDSO_VMA_ONLINE, 157 158 CPUHP_AP_IRQ_AFFINITY_ONLINE,
+6
include/linux/cpumask.h
··· 199 199 return cpumask_next_and(-1, src1p, src2p); 200 200 } 201 201 202 + static inline int cpumask_any_distribute(const struct cpumask *srcp) 203 + { 204 + return cpumask_first(srcp); 205 + } 206 + 202 207 #define for_each_cpu(cpu, mask) \ 203 208 for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) 204 209 #define for_each_cpu_not(cpu, mask) \ ··· 257 252 unsigned int cpumask_local_spread(unsigned int i, int node); 258 253 int cpumask_any_and_distribute(const struct cpumask *src1p, 259 254 const struct cpumask *src2p); 255 + int cpumask_any_distribute(const struct cpumask *srcp); 260 256 261 257 /** 262 258 * for_each_cpu - iterate over every cpu in a mask
+21 -12
include/linux/irq_work.h
··· 14 14 */ 15 15 16 16 struct irq_work { 17 - union { 18 - struct __call_single_node node; 19 - struct { 20 - struct llist_node llnode; 21 - atomic_t flags; 22 - }; 23 - }; 17 + struct __call_single_node node; 24 18 void (*func)(struct irq_work *); 25 19 }; 20 + 21 + #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ 22 + .node = { .u_flags = (_flags), }, \ 23 + .func = (_func), \ 24 + } 25 + 26 + #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) 27 + #define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY) 28 + #define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ) 29 + 30 + #define DEFINE_IRQ_WORK(name, _f) \ 31 + struct irq_work name = IRQ_WORK_INIT(_f) 26 32 27 33 static inline 28 34 void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) 29 35 { 30 - atomic_set(&work->flags, 0); 31 - work->func = func; 36 + *work = IRQ_WORK_INIT(func); 32 37 } 33 38 34 - #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \ 35 - .flags = ATOMIC_INIT(0), \ 36 - .func = (_f) \ 39 + static inline bool irq_work_is_pending(struct irq_work *work) 40 + { 41 + return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING; 37 42 } 38 43 44 + static inline bool irq_work_is_busy(struct irq_work *work) 45 + { 46 + return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; 47 + } 39 48 40 49 bool irq_work_queue(struct irq_work *work); 41 50 bool irq_work_queue_on(struct irq_work *work, int cpu);
+4 -4
include/linux/irqflags.h
··· 107 107 current->irq_config = 0; \ 108 108 } while (0) 109 109 110 - # define lockdep_irq_work_enter(__work) \ 110 + # define lockdep_irq_work_enter(_flags) \ 111 111 do { \ 112 - if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\ 112 + if (!((_flags) & IRQ_WORK_HARD_IRQ)) \ 113 113 current->irq_config = 1; \ 114 114 } while (0) 115 - # define lockdep_irq_work_exit(__work) \ 115 + # define lockdep_irq_work_exit(_flags) \ 116 116 do { \ 117 - if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\ 117 + if (!((_flags) & IRQ_WORK_HARD_IRQ)) \ 118 118 current->irq_config = 0; \ 119 119 } while (0) 120 120
+14 -7
include/linux/kernel.h
··· 204 204 extern void ___might_sleep(const char *file, int line, int preempt_offset); 205 205 extern void __might_sleep(const char *file, int line, int preempt_offset); 206 206 extern void __cant_sleep(const char *file, int line, int preempt_offset); 207 + extern void __cant_migrate(const char *file, int line); 207 208 208 209 /** 209 210 * might_sleep - annotation for functions that can sleep ··· 228 227 # define cant_sleep() \ 229 228 do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) 230 229 # define sched_annotate_sleep() (current->task_state_change = 0) 230 + 231 + /** 232 + * cant_migrate - annotation for functions that cannot migrate 233 + * 234 + * Will print a stack trace if executed in code which is migratable 235 + */ 236 + # define cant_migrate() \ 237 + do { \ 238 + if (IS_ENABLED(CONFIG_SMP)) \ 239 + __cant_migrate(__FILE__, __LINE__); \ 240 + } while (0) 241 + 231 242 /** 232 243 * non_block_start - annotate the start of section where sleeping is prohibited 233 244 * ··· 264 251 int preempt_offset) { } 265 252 # define might_sleep() do { might_resched(); } while (0) 266 253 # define cant_sleep() do { } while (0) 254 + # define cant_migrate() do { } while (0) 267 255 # define sched_annotate_sleep() do { } while (0) 268 256 # define non_block_start() do { } while (0) 269 257 # define non_block_end() do { } while (0) 270 258 #endif 271 259 272 260 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) 273 - 274 - #ifndef CONFIG_PREEMPT_RT 275 - # define cant_migrate() cant_sleep() 276 - #else 277 - /* Placeholder for now */ 278 - # define cant_migrate() do { } while (0) 279 - #endif 280 261 281 262 /** 282 263 * abs - return absolute value of an argument
+61 -24
include/linux/preempt.h
··· 322 322 323 323 #endif 324 324 325 - /** 326 - * migrate_disable - Prevent migration of the current task 327 - * 328 - * Maps to preempt_disable() which also disables preemption. Use 329 - * migrate_disable() to annotate that the intent is to prevent migration, 330 - * but not necessarily preemption. 331 - * 332 - * Can be invoked nested like preempt_disable() and needs the corresponding 333 - * number of migrate_enable() invocations. 334 - */ 335 - static __always_inline void migrate_disable(void) 336 - { 337 - preempt_disable(); 338 - } 325 + #ifdef CONFIG_SMP 339 326 340 - /** 341 - * migrate_enable - Allow migration of the current task 327 + /* 328 + * Migrate-Disable and why it is undesired. 342 329 * 343 - * Counterpart to migrate_disable(). 330 + * When a preempted task becomes elegible to run under the ideal model (IOW it 331 + * becomes one of the M highest priority tasks), it might still have to wait 332 + * for the preemptee's migrate_disable() section to complete. Thereby suffering 333 + * a reduction in bandwidth in the exact duration of the migrate_disable() 334 + * section. 344 335 * 345 - * As migrate_disable() can be invoked nested, only the outermost invocation 346 - * reenables migration. 336 + * Per this argument, the change from preempt_disable() to migrate_disable() 337 + * gets us: 347 338 * 348 - * Currently mapped to preempt_enable(). 339 + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() 340 + * it would have had to wait for the lower priority task. 341 + * 342 + * - a lower priority tasks; which under preempt_disable() could've instantly 343 + * migrated away when another CPU becomes available, is now constrained 344 + * by the ability to push the higher priority task away, which might itself be 345 + * in a migrate_disable() section, reducing it's available bandwidth. 
346 + * 347 + * IOW it trades latency / moves the interference term, but it stays in the 348 + * system, and as long as it remains unbounded, the system is not fully 349 + * deterministic. 350 + * 351 + * 352 + * The reason we have it anyway. 353 + * 354 + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a 355 + * number of primitives into becoming preemptible, they would also allow 356 + * migration. This turns out to break a bunch of per-cpu usage. To this end, 357 + * all these primitives employ migrate_disable() to restore this implicit 358 + * assumption. 359 + * 360 + * This is a 'temporary' work-around at best. The correct solution is getting 361 + * rid of the above assumptions and reworking the code to employ explicit 362 + * per-cpu locking or short preempt-disable regions. 363 + * 364 + * The end goal must be to get rid of migrate_disable(), alternatively we need 365 + * a schedulability theory that does not depend on arbitrary migration. 366 + * 367 + * 368 + * Notes on the implementation. 369 + * 370 + * The implementation is particularly tricky since existing code patterns 371 + * dictate neither migrate_disable() nor migrate_enable() is allowed to block. 372 + * This means that it cannot use cpus_read_lock() to serialize against hotplug, 373 + * nor can it easily migrate itself into a pending affinity mask change on 374 + * migrate_enable(). 375 + * 376 + * 377 + * Note: even non-work-conserving schedulers like semi-partitioned depend on 378 + * migration, so migrate_disable() is not only a problem for 379 + * work-conserving schedulers. 
380 + * 349 381 */ 350 - static __always_inline void migrate_enable(void) 351 - { 352 - preempt_enable(); 353 - } 382 + extern void migrate_disable(void); 383 + extern void migrate_enable(void); 384 + 385 + #else 386 + 387 + static inline void migrate_disable(void) { } 388 + static inline void migrate_enable(void) { } 389 + 390 + #endif /* CONFIG_SMP */ 354 391 355 392 #endif /* __LINUX_PREEMPT_H */
+5
include/linux/sched.h
··· 723 723 int nr_cpus_allowed; 724 724 const cpumask_t *cpus_ptr; 725 725 cpumask_t cpus_mask; 726 + void *migration_pending; 727 + #ifdef CONFIG_SMP 728 + unsigned short migration_disabled; 729 + #endif 730 + unsigned short migration_flags; 726 731 727 732 #ifdef CONFIG_PREEMPT_RCU 728 733 int rcu_read_lock_nesting;
+2
include/linux/sched/hotplug.h
··· 11 11 extern int sched_cpu_deactivate(unsigned int cpu); 12 12 13 13 #ifdef CONFIG_HOTPLUG_CPU 14 + extern int sched_cpu_wait_empty(unsigned int cpu); 14 15 extern int sched_cpu_dying(unsigned int cpu); 15 16 #else 17 + # define sched_cpu_wait_empty NULL 16 18 # define sched_cpu_dying NULL 17 19 #endif 18 20
+5
include/linux/sched/mm.h
··· 347 347 348 348 extern void membarrier_exec_mmap(struct mm_struct *mm); 349 349 350 + extern void membarrier_update_current_mm(struct mm_struct *next_mm); 351 + 350 352 #else 351 353 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS 352 354 static inline void membarrier_arch_switch_mm(struct mm_struct *prev, ··· 361 359 { 362 360 } 363 361 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) 362 + { 363 + } 364 + static inline void membarrier_update_current_mm(struct mm_struct *next_mm) 364 365 { 365 366 } 366 367 #endif
+8
include/linux/sched/topology.h
··· 225 225 226 226 #endif /* !CONFIG_SMP */ 227 227 228 + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 229 + extern void rebuild_sched_domains_energy(void); 230 + #else 231 + static inline void rebuild_sched_domains_energy(void) 232 + { 233 + } 234 + #endif 235 + 228 236 #ifndef arch_scale_cpu_capacity 229 237 /** 230 238 * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
+9 -10
include/linux/smp.h
··· 21 21 * structure shares (partial) layout with struct irq_work 22 22 */ 23 23 struct __call_single_data { 24 - union { 25 - struct __call_single_node node; 26 - struct { 27 - struct llist_node llist; 28 - unsigned int flags; 29 - #ifdef CONFIG_64BIT 30 - u16 src, dst; 31 - #endif 32 - }; 33 - }; 24 + struct __call_single_node node; 34 25 smp_call_func_t func; 35 26 void *info; 36 27 }; 37 28 29 + #define CSD_INIT(_func, _info) \ 30 + (struct __call_single_data){ .func = (_func), .info = (_info), } 31 + 38 32 /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ 39 33 typedef struct __call_single_data call_single_data_t 40 34 __aligned(sizeof(struct __call_single_data)); 35 + 36 + #define INIT_CSD(_csd, _func, _info) \ 37 + do { \ 38 + *(_csd) = CSD_INIT((_func), (_info)); \ 39 + } while (0) 41 40 42 41 /* 43 42 * Enqueue a llist_node on the call_single_queue; be very careful, read
+5
include/linux/stop_machine.h
··· 24 24 struct cpu_stop_work { 25 25 struct list_head list; /* cpu_stopper->works */ 26 26 cpu_stop_fn_t fn; 27 + unsigned long caller; 27 28 void *arg; 28 29 struct cpu_stop_done *done; 29 30 }; ··· 36 35 void stop_machine_park(int cpu); 37 36 void stop_machine_unpark(int cpu); 38 37 void stop_machine_yield(const struct cpumask *cpumask); 38 + 39 + extern void print_stop_info(const char *log_lvl, struct task_struct *task); 39 40 40 41 #else /* CONFIG_SMP */ 41 42 ··· 82 79 83 80 return false; 84 81 } 82 + 83 + static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } 85 84 86 85 #endif /* CONFIG_SMP */ 87 86
+2
include/uapi/linux/sched/types.h
··· 96 96 * on a CPU with a capacity big enough to fit the specified value. 97 97 * A task with a max utilization value smaller than 1024 is more likely 98 98 * scheduled on a CPU with no more capacity than the specified value. 99 + * 100 + * A task utilization boundary can be reset by setting the attribute to -1. 99 101 */ 100 102 struct sched_attr { 101 103 __u32 size;
+1 -1
kernel/bpf/stackmap.c
··· 298 298 if (irqs_disabled()) { 299 299 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { 300 300 work = this_cpu_ptr(&up_read_work); 301 - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { 301 + if (irq_work_is_busy(&work->irq_work)) { 302 302 /* cannot queue more up_read, fallback */ 303 303 irq_work_busy = true; 304 304 }
+28 -5
kernel/cgroup/cpuset.c
··· 983 983 */ 984 984 static void rebuild_sched_domains_locked(void) 985 985 { 986 + struct cgroup_subsys_state *pos_css; 986 987 struct sched_domain_attr *attr; 987 988 cpumask_var_t *doms; 989 + struct cpuset *cs; 988 990 int ndoms; 989 991 990 992 lockdep_assert_cpus_held(); 991 993 percpu_rwsem_assert_held(&cpuset_rwsem); 992 994 993 995 /* 994 - * We have raced with CPU hotplug. Don't do anything to avoid 996 + * If we have raced with CPU hotplug, return early to avoid 995 997 * passing doms with offlined cpu to partition_sched_domains(). 996 - * Anyways, hotplug work item will rebuild sched domains. 998 + * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. 999 + * 1000 + * With no CPUs in any subpartitions, top_cpuset's effective CPUs 1001 + * should be the same as the active CPUs, so checking only top_cpuset 1002 + * is enough to detect racing CPU offlines. 997 1003 */ 998 1004 if (!top_cpuset.nr_subparts_cpus && 999 1005 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) 1000 1006 return; 1001 1007 1002 - if (top_cpuset.nr_subparts_cpus && 1003 - !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) 1004 - return; 1008 + /* 1009 + * With subpartition CPUs, however, the effective CPUs of a partition 1010 + * root should be only a subset of the active CPUs. Since a CPU in any 1011 + * partition root could be offlined, all must be checked. 1012 + */ 1013 + if (top_cpuset.nr_subparts_cpus) { 1014 + rcu_read_lock(); 1015 + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 1016 + if (!is_partition_root(cs)) { 1017 + pos_css = css_rightmost_descendant(pos_css); 1018 + continue; 1019 + } 1020 + if (!cpumask_subset(cs->effective_cpus, 1021 + cpu_active_mask)) { 1022 + rcu_read_unlock(); 1023 + return; 1024 + } 1025 + } 1026 + rcu_read_unlock(); 1027 + } 1005 1028 1006 1029 /* Generate domain masks and attrs */ 1007 1030 ndoms = generate_sched_domains(&doms, &attr);
+8 -1
kernel/cpu.c
··· 1606 1606 .name = "ap:online", 1607 1607 }, 1608 1608 /* 1609 - * Handled on controll processor until the plugged processor manages 1609 + * Handled on control processor until the plugged processor manages 1610 1610 * this itself. 1611 1611 */ 1612 1612 [CPUHP_TEARDOWN_CPU] = { ··· 1615 1615 .teardown.single = takedown_cpu, 1616 1616 .cant_stop = true, 1617 1617 }, 1618 + 1619 + [CPUHP_AP_SCHED_WAIT_EMPTY] = { 1620 + .name = "sched:waitempty", 1621 + .startup.single = NULL, 1622 + .teardown.single = sched_cpu_wait_empty, 1623 + }, 1624 + 1618 1625 /* Handle smpboot threads park/unpark */ 1619 1626 [CPUHP_AP_SMPBOOT_THREADS] = { 1620 1627 .name = "smpboot/threads:online",
+3 -3
kernel/debug/debug_core.c
··· 225 225 * Default (weak) implementation for kgdb_roundup_cpus 226 226 */ 227 227 228 - static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd); 229 - 230 228 void __weak kgdb_call_nmi_hook(void *ignored) 231 229 { 232 230 /* ··· 238 240 kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); 239 241 } 240 242 NOKPROBE_SYMBOL(kgdb_call_nmi_hook); 243 + 244 + static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = 245 + CSD_INIT(kgdb_call_nmi_hook, NULL); 241 246 242 247 void __weak kgdb_roundup_cpus(void) 243 248 { ··· 268 267 continue; 269 268 kgdb_info[cpu].rounding_up = true; 270 269 271 - csd->func = kgdb_call_nmi_hook; 272 270 ret = smp_call_function_single_async(cpu, csd); 273 271 if (ret) 274 272 kgdb_info[cpu].rounding_up = false;
+15 -1
kernel/exit.c
··· 478 478 BUG_ON(mm != current->active_mm); 479 479 /* more a memory barrier than a real lock */ 480 480 task_lock(current); 481 + /* 482 + * When a thread stops operating on an address space, the loop 483 + * in membarrier_private_expedited() may not observe that 484 + * tsk->mm, and the loop in membarrier_global_expedited() may 485 + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED 486 + * rq->membarrier_state, so those would not issue an IPI. 487 + * Membarrier requires a memory barrier after accessing 488 + * user-space memory, before clearing tsk->mm or the 489 + * rq->membarrier_state. 490 + */ 491 + smp_mb__after_spinlock(); 492 + local_irq_disable(); 481 493 current->mm = NULL; 482 - mmap_read_unlock(mm); 494 + membarrier_update_current_mm(NULL); 483 495 enter_lazy_tlb(mm, current); 496 + local_irq_enable(); 484 497 task_unlock(current); 498 + mmap_read_unlock(mm); 485 499 mm_update_next_owner(mm); 486 500 mmput(mm); 487 501 if (test_thread_flag(TIF_MEMDIE))
+28 -23
kernel/irq_work.c
··· 31 31 { 32 32 int oflags; 33 33 34 - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); 34 + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); 35 35 /* 36 36 * If the work is already pending, no need to raise the IPI. 37 - * The pairing atomic_fetch_andnot() in irq_work_run() makes sure 37 + * The pairing smp_mb() in irq_work_single() makes sure 38 38 * everything we did before is visible. 39 39 */ 40 40 if (oflags & IRQ_WORK_PENDING) ··· 53 53 static void __irq_work_queue_local(struct irq_work *work) 54 54 { 55 55 /* If the work is "lazy", handle it from next tick if any */ 56 - if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { 57 - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && 56 + if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { 57 + if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && 58 58 tick_nohz_tick_stopped()) 59 59 arch_irq_work_raise(); 60 60 } else { 61 - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) 61 + if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) 62 62 arch_irq_work_raise(); 63 63 } 64 64 } ··· 102 102 if (cpu != smp_processor_id()) { 103 103 /* Arch remote IPI send/receive backend aren't NMI safe */ 104 104 WARN_ON_ONCE(in_nmi()); 105 - __smp_call_single_queue(cpu, &work->llnode); 105 + __smp_call_single_queue(cpu, &work->node.llist); 106 106 } else { 107 107 __irq_work_queue_local(work); 108 108 } ··· 136 136 int flags; 137 137 138 138 /* 139 - * Clear the PENDING bit, after this point the @work 140 - * can be re-used. 141 - * Make it immediately visible so that other CPUs trying 142 - * to claim that work don't rely on us to handle their data 143 - * while we are in the middle of the func. 139 + * Clear the PENDING bit, after this point the @work can be re-used. 140 + * The PENDING bit acts as a lock, and we own it, so we can clear it 141 + * without atomic ops. 
144 142 */ 145 - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); 146 - 147 - lockdep_irq_work_enter(work); 148 - work->func(work); 149 - lockdep_irq_work_exit(work); 150 - /* 151 - * Clear the BUSY bit and return to the free state if 152 - * no-one else claimed it meanwhile. 153 - */ 143 + flags = atomic_read(&work->node.a_flags); 154 144 flags &= ~IRQ_WORK_PENDING; 155 - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); 145 + atomic_set(&work->node.a_flags, flags); 146 + 147 + /* 148 + * See irq_work_claim(). 149 + */ 150 + smp_mb(); 151 + 152 + lockdep_irq_work_enter(flags); 153 + work->func(work); 154 + lockdep_irq_work_exit(flags); 155 + 156 + /* 157 + * Clear the BUSY bit, if set, and return to the free state if no-one 158 + * else claimed it meanwhile. 159 + */ 160 + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); 156 161 } 157 162 158 163 static void irq_work_run_list(struct llist_head *list) ··· 171 166 return; 172 167 173 168 llnode = llist_del_all(list); 174 - llist_for_each_entry_safe(work, tmp, llnode, llnode) 169 + llist_for_each_entry_safe(work, tmp, llnode, node.llist) 175 170 irq_work_single(work); 176 171 } 177 172 ··· 203 198 { 204 199 lockdep_assert_irqs_enabled(); 205 200 206 - while (atomic_read(&work->flags) & IRQ_WORK_BUSY) 201 + while (irq_work_is_busy(work)) 207 202 cpu_relax(); 208 203 } 209 204 EXPORT_SYMBOL_GPL(irq_work_sync);
+21
kernel/kthread.c
··· 1249 1249 tsk->active_mm = mm; 1250 1250 } 1251 1251 tsk->mm = mm; 1252 + membarrier_update_current_mm(mm); 1252 1253 switch_mm_irqs_off(active_mm, mm, tsk); 1253 1254 local_irq_enable(); 1254 1255 task_unlock(tsk); ··· 1257 1256 finish_arch_post_lock_switch(); 1258 1257 #endif 1259 1258 1259 + /* 1260 + * When a kthread starts operating on an address space, the loop 1261 + * in membarrier_{private,global}_expedited() may not observe 1262 + * that tsk->mm, and not issue an IPI. Membarrier requires a 1263 + * memory barrier after storing to tsk->mm, before accessing 1264 + * user-space memory. A full memory barrier for membarrier 1265 + * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by 1266 + * mmdrop(), or explicitly with smp_mb(). 1267 + */ 1260 1268 if (active_mm != mm) 1261 1269 mmdrop(active_mm); 1270 + else 1271 + smp_mb(); 1262 1272 1263 1273 to_kthread(tsk)->oldfs = force_uaccess_begin(); 1264 1274 } ··· 1289 1277 force_uaccess_end(to_kthread(tsk)->oldfs); 1290 1278 1291 1279 task_lock(tsk); 1280 + /* 1281 + * When a kthread stops operating on an address space, the loop 1282 + * in membarrier_{private,global}_expedited() may not observe 1283 + * that tsk->mm, and not issue an IPI. Membarrier requires a 1284 + * memory barrier after accessing user-space memory, before 1285 + * clearing tsk->mm. 1286 + */ 1287 + smp_mb__after_spinlock(); 1292 1288 sync_mm_rss(mm); 1293 1289 local_irq_disable(); 1294 1290 tsk->mm = NULL; 1291 + membarrier_update_current_mm(NULL); 1295 1292 /* active_mm is still 'mm' */ 1296 1293 enter_lazy_tlb(mm, tsk); 1297 1294 local_irq_enable();
+2 -4
kernel/printk/printk.c
··· 3025 3025 wake_up_interruptible(&log_wait); 3026 3026 } 3027 3027 3028 - static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { 3029 - .func = wake_up_klogd_work_func, 3030 - .flags = ATOMIC_INIT(IRQ_WORK_LAZY), 3031 - }; 3028 + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = 3029 + IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); 3032 3030 3033 3031 void wake_up_klogd(void) 3034 3032 {
+1 -2
kernel/rcu/tree.c
··· 1322 1322 if (IS_ENABLED(CONFIG_IRQ_WORK) && 1323 1323 !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && 1324 1324 (rnp->ffmask & rdp->grpmask)) { 1325 - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); 1326 - atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); 1327 1325 rdp->rcu_iw_pending = true; 1328 1326 rdp->rcu_iw_gp_seq = rnp->gp_seq; 1329 1327 irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); ··· 4021 4023 rdp->cpu_no_qs.b.norm = true; 4022 4024 rdp->core_needs_qs = false; 4023 4025 rdp->rcu_iw_pending = false; 4026 + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); 4024 4027 rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; 4025 4028 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); 4026 4029 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+877 -263
kernel/sched/core.c
··· 97 97 * 98 98 * Normal scheduling state is serialized by rq->lock. __schedule() takes the 99 99 * local CPU's rq->lock, it optionally removes the task from the runqueue and 100 - * always looks at the local rq data structures to find the most elegible task 100 + * always looks at the local rq data structures to find the most eligible task 101 101 * to run next. 102 102 * 103 103 * Task enqueue is also under rq->lock, possibly taken from another CPU. ··· 320 320 update_rq_clock_task(rq, delta); 321 321 } 322 322 323 - static inline void 324 - rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) 325 - { 326 - csd->flags = 0; 327 - csd->func = func; 328 - csd->info = rq; 329 - } 330 - 331 323 #ifdef CONFIG_SCHED_HRTICK 332 324 /* 333 325 * Use HR-timers to deliver accurate preemption points. ··· 420 428 static void hrtick_rq_init(struct rq *rq) 421 429 { 422 430 #ifdef CONFIG_SMP 423 - rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); 431 + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); 424 432 #endif 425 433 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); 426 434 rq->hrtick_timer.function = hrtick; ··· 510 518 511 519 /* 512 520 * Atomically grab the task, if ->wake_q is !nil already it means 513 - * its already queued (either by us or someone else) and will get the 521 + * it's already queued (either by us or someone else) and will get the 514 522 * wakeup due to that. 515 523 * 516 524 * In order to ensure that a pending wakeup will observe our pending ··· 761 769 return false; 762 770 763 771 /* 764 - * If there are more than one RR tasks, we need the tick to effect the 772 + * If there are more than one RR tasks, we need the tick to affect the 765 773 * actual RR behaviour. 766 774 */ 767 775 if (rq->rt.rr_nr_running) { ··· 1179 1187 * accounting was performed at enqueue time and we can just return 1180 1188 * here. 
1181 1189 * 1182 - * Need to be careful of the following enqeueue/dequeue ordering 1190 + * Need to be careful of the following enqueue/dequeue ordering 1183 1191 * problem too 1184 1192 * 1185 1193 * enqueue(taskA) 1186 1194 * // sched_uclamp_used gets enabled 1187 1195 * enqueue(taskB) 1188 1196 * dequeue(taskA) 1189 - * // Must not decrement bukcet->tasks here 1197 + * // Must not decrement bucket->tasks here 1190 1198 * dequeue(taskB) 1191 1199 * 1192 1200 * where we could end up with stale data in uc_se and ··· 1405 1413 static int uclamp_validate(struct task_struct *p, 1406 1414 const struct sched_attr *attr) 1407 1415 { 1408 - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; 1409 - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; 1416 + int util_min = p->uclamp_req[UCLAMP_MIN].value; 1417 + int util_max = p->uclamp_req[UCLAMP_MAX].value; 1410 1418 1411 - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) 1412 - lower_bound = attr->sched_util_min; 1413 - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) 1414 - upper_bound = attr->sched_util_max; 1419 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { 1420 + util_min = attr->sched_util_min; 1415 1421 1416 - if (lower_bound > upper_bound) 1417 - return -EINVAL; 1418 - if (upper_bound > SCHED_CAPACITY_SCALE) 1422 + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) 1423 + return -EINVAL; 1424 + } 1425 + 1426 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { 1427 + util_max = attr->sched_util_max; 1428 + 1429 + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) 1430 + return -EINVAL; 1431 + } 1432 + 1433 + if (util_min != -1 && util_max != -1 && util_min > util_max) 1419 1434 return -EINVAL; 1420 1435 1421 1436 /* ··· 1437 1438 return 0; 1438 1439 } 1439 1440 1441 + static bool uclamp_reset(const struct sched_attr *attr, 1442 + enum uclamp_id clamp_id, 1443 + struct uclamp_se *uc_se) 1444 + { 1445 + /* Reset on sched class change for a non user-defined clamp value. 
*/ 1446 + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && 1447 + !uc_se->user_defined) 1448 + return true; 1449 + 1450 + /* Reset on sched_util_{min,max} == -1. */ 1451 + if (clamp_id == UCLAMP_MIN && 1452 + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && 1453 + attr->sched_util_min == -1) { 1454 + return true; 1455 + } 1456 + 1457 + if (clamp_id == UCLAMP_MAX && 1458 + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && 1459 + attr->sched_util_max == -1) { 1460 + return true; 1461 + } 1462 + 1463 + return false; 1464 + } 1465 + 1440 1466 static void __setscheduler_uclamp(struct task_struct *p, 1441 1467 const struct sched_attr *attr) 1442 1468 { 1443 1469 enum uclamp_id clamp_id; 1444 1470 1445 - /* 1446 - * On scheduling class change, reset to default clamps for tasks 1447 - * without a task-specific value. 1448 - */ 1449 1471 for_each_clamp_id(clamp_id) { 1450 1472 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; 1473 + unsigned int value; 1451 1474 1452 - /* Keep using defined clamps across class changes */ 1453 - if (uc_se->user_defined) 1475 + if (!uclamp_reset(attr, clamp_id, uc_se)) 1454 1476 continue; 1455 1477 1456 1478 /* ··· 1479 1459 * at runtime. 
1480 1460 */ 1481 1461 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) 1482 - __uclamp_update_util_min_rt_default(p); 1462 + value = sysctl_sched_uclamp_util_min_rt_default; 1483 1463 else 1484 - uclamp_se_set(uc_se, uclamp_none(clamp_id), false); 1464 + value = uclamp_none(clamp_id); 1465 + 1466 + uclamp_se_set(uc_se, value, false); 1485 1467 1486 1468 } 1487 1469 1488 1470 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) 1489 1471 return; 1490 1472 1491 - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { 1473 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && 1474 + attr->sched_util_min != -1) { 1492 1475 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], 1493 1476 attr->sched_util_min, true); 1494 1477 } 1495 1478 1496 - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { 1479 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && 1480 + attr->sched_util_max != -1) { 1497 1481 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], 1498 1482 attr->sched_util_max, true); 1499 1483 } ··· 1720 1696 1721 1697 #ifdef CONFIG_SMP 1722 1698 1699 + static void 1700 + __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); 1701 + 1702 + static int __set_cpus_allowed_ptr(struct task_struct *p, 1703 + const struct cpumask *new_mask, 1704 + u32 flags); 1705 + 1706 + static void migrate_disable_switch(struct rq *rq, struct task_struct *p) 1707 + { 1708 + if (likely(!p->migration_disabled)) 1709 + return; 1710 + 1711 + if (p->cpus_ptr != &p->cpus_mask) 1712 + return; 1713 + 1714 + /* 1715 + * Violates locking rules! see comment in __do_set_cpus_allowed(). 
1716 + */ 1717 + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); 1718 + } 1719 + 1720 + void migrate_disable(void) 1721 + { 1722 + struct task_struct *p = current; 1723 + 1724 + if (p->migration_disabled) { 1725 + p->migration_disabled++; 1726 + return; 1727 + } 1728 + 1729 + preempt_disable(); 1730 + this_rq()->nr_pinned++; 1731 + p->migration_disabled = 1; 1732 + preempt_enable(); 1733 + } 1734 + EXPORT_SYMBOL_GPL(migrate_disable); 1735 + 1736 + void migrate_enable(void) 1737 + { 1738 + struct task_struct *p = current; 1739 + 1740 + if (p->migration_disabled > 1) { 1741 + p->migration_disabled--; 1742 + return; 1743 + } 1744 + 1745 + /* 1746 + * Ensure stop_task runs either before or after this, and that 1747 + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 1748 + */ 1749 + preempt_disable(); 1750 + if (p->cpus_ptr != &p->cpus_mask) 1751 + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); 1752 + /* 1753 + * Mustn't clear migration_disabled() until cpus_ptr points back at the 1754 + * regular cpus_mask, otherwise things that race (eg. 1755 + * select_fallback_rq) get confused. 1756 + */ 1757 + barrier(); 1758 + p->migration_disabled = 0; 1759 + this_rq()->nr_pinned--; 1760 + preempt_enable(); 1761 + } 1762 + EXPORT_SYMBOL_GPL(migrate_enable); 1763 + 1764 + static inline bool rq_has_pinned_tasks(struct rq *rq) 1765 + { 1766 + return rq->nr_pinned; 1767 + } 1768 + 1723 1769 /* 1724 1770 * Per-CPU kthreads are allowed to run on !active && online CPUs, see 1725 1771 * __set_cpus_allowed_ptr() and select_fallback_rq(). 
··· 1799 1705 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1800 1706 return false; 1801 1707 1802 - if (is_per_cpu_kthread(p)) 1708 + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) 1803 1709 return cpu_online(cpu); 1804 1710 1805 1711 return cpu_active(cpu); ··· 1844 1750 } 1845 1751 1846 1752 struct migration_arg { 1847 - struct task_struct *task; 1848 - int dest_cpu; 1753 + struct task_struct *task; 1754 + int dest_cpu; 1755 + struct set_affinity_pending *pending; 1756 + }; 1757 + 1758 + struct set_affinity_pending { 1759 + refcount_t refs; 1760 + struct completion done; 1761 + struct cpu_stop_work stop_work; 1762 + struct migration_arg arg; 1849 1763 }; 1850 1764 1851 1765 /* ··· 1885 1783 */ 1886 1784 static int migration_cpu_stop(void *data) 1887 1785 { 1786 + struct set_affinity_pending *pending; 1888 1787 struct migration_arg *arg = data; 1889 1788 struct task_struct *p = arg->task; 1789 + int dest_cpu = arg->dest_cpu; 1890 1790 struct rq *rq = this_rq(); 1791 + bool complete = false; 1891 1792 struct rq_flags rf; 1892 1793 1893 1794 /* 1894 1795 * The original target CPU might have gone down and we might 1895 1796 * be on another CPU but it doesn't matter. 1896 1797 */ 1897 - local_irq_disable(); 1798 + local_irq_save(rf.flags); 1898 1799 /* 1899 1800 * We need to explicitly wake pending tasks before running 1900 1801 * __migrate_task() such that we will not miss enforcing cpus_ptr ··· 1907 1802 1908 1803 raw_spin_lock(&p->pi_lock); 1909 1804 rq_lock(rq, &rf); 1805 + 1806 + pending = p->migration_pending; 1910 1807 /* 1911 1808 * If task_rq(p) != rq, it cannot be migrated here, because we're 1912 1809 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1913 1810 * we're holding p->pi_lock. 
1914 1811 */ 1915 1812 if (task_rq(p) == rq) { 1916 - if (task_on_rq_queued(p)) 1917 - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); 1918 - else 1919 - p->wake_cpu = arg->dest_cpu; 1920 - } 1921 - rq_unlock(rq, &rf); 1922 - raw_spin_unlock(&p->pi_lock); 1813 + if (is_migration_disabled(p)) 1814 + goto out; 1923 1815 1924 - local_irq_enable(); 1816 + if (pending) { 1817 + p->migration_pending = NULL; 1818 + complete = true; 1819 + } 1820 + 1821 + /* migrate_enable() -- we must not race against SCA */ 1822 + if (dest_cpu < 0) { 1823 + /* 1824 + * When this was migrate_enable() but we no longer 1825 + * have a @pending, a concurrent SCA 'fixed' things 1826 + * and we should be valid again. Nothing to do. 1827 + */ 1828 + if (!pending) { 1829 + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); 1830 + goto out; 1831 + } 1832 + 1833 + dest_cpu = cpumask_any_distribute(&p->cpus_mask); 1834 + } 1835 + 1836 + if (task_on_rq_queued(p)) 1837 + rq = __migrate_task(rq, &rf, p, dest_cpu); 1838 + else 1839 + p->wake_cpu = dest_cpu; 1840 + 1841 + } else if (dest_cpu < 0 || pending) { 1842 + /* 1843 + * This happens when we get migrated between migrate_enable()'s 1844 + * preempt_enable() and scheduling the stopper task. At that 1845 + * point we're a regular task again and not current anymore. 1846 + * 1847 + * A !PREEMPT kernel has a giant hole here, which makes it far 1848 + * more likely. 1849 + */ 1850 + 1851 + /* 1852 + * The task moved before the stopper got to run. We're holding 1853 + * ->pi_lock, so the allowed mask is stable - if it got 1854 + * somewhere allowed, we're done. 1855 + */ 1856 + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { 1857 + p->migration_pending = NULL; 1858 + complete = true; 1859 + goto out; 1860 + } 1861 + 1862 + /* 1863 + * When this was migrate_enable() but we no longer have an 1864 + * @pending, a concurrent SCA 'fixed' things and we should be 1865 + * valid again. Nothing to do. 
1866 + */ 1867 + if (!pending) { 1868 + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); 1869 + goto out; 1870 + } 1871 + 1872 + /* 1873 + * When migrate_enable() hits a rq mis-match we can't reliably 1874 + * determine is_migration_disabled() and so have to chase after 1875 + * it. 1876 + */ 1877 + task_rq_unlock(rq, p, &rf); 1878 + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, 1879 + &pending->arg, &pending->stop_work); 1880 + return 0; 1881 + } 1882 + out: 1883 + task_rq_unlock(rq, p, &rf); 1884 + 1885 + if (complete) 1886 + complete_all(&pending->done); 1887 + 1888 + /* For pending->{arg,stop_work} */ 1889 + pending = arg->pending; 1890 + if (pending && refcount_dec_and_test(&pending->refs)) 1891 + wake_up_var(&pending->refs); 1892 + 1893 + return 0; 1894 + } 1895 + 1896 + int push_cpu_stop(void *arg) 1897 + { 1898 + struct rq *lowest_rq = NULL, *rq = this_rq(); 1899 + struct task_struct *p = arg; 1900 + 1901 + raw_spin_lock_irq(&p->pi_lock); 1902 + raw_spin_lock(&rq->lock); 1903 + 1904 + if (task_rq(p) != rq) 1905 + goto out_unlock; 1906 + 1907 + if (is_migration_disabled(p)) { 1908 + p->migration_flags |= MDF_PUSH; 1909 + goto out_unlock; 1910 + } 1911 + 1912 + p->migration_flags &= ~MDF_PUSH; 1913 + 1914 + if (p->sched_class->find_lock_rq) 1915 + lowest_rq = p->sched_class->find_lock_rq(p, rq); 1916 + 1917 + if (!lowest_rq) 1918 + goto out_unlock; 1919 + 1920 + // XXX validate p is still the highest prio task 1921 + if (task_rq(p) == rq) { 1922 + deactivate_task(rq, p, 0); 1923 + set_task_cpu(p, lowest_rq->cpu); 1924 + activate_task(lowest_rq, p, 0); 1925 + resched_curr(lowest_rq); 1926 + } 1927 + 1928 + double_unlock_balance(rq, lowest_rq); 1929 + 1930 + out_unlock: 1931 + rq->push_busy = false; 1932 + raw_spin_unlock(&rq->lock); 1933 + raw_spin_unlock_irq(&p->pi_lock); 1934 + 1935 + put_task_struct(p); 1925 1936 return 0; 1926 1937 } 1927 1938 ··· 2045 1824 * sched_class::set_cpus_allowed must do the below, but is not required to 2046 
1825 * actually call this function. 2047 1826 */ 2048 - void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1827 + void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) 2049 1828 { 1829 + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { 1830 + p->cpus_ptr = new_mask; 1831 + return; 1832 + } 1833 + 2050 1834 cpumask_copy(&p->cpus_mask, new_mask); 2051 1835 p->nr_cpus_allowed = cpumask_weight(new_mask); 2052 1836 } 2053 1837 2054 - void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1838 + static void 1839 + __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) 2055 1840 { 2056 1841 struct rq *rq = task_rq(p); 2057 1842 bool queued, running; 2058 1843 2059 - lockdep_assert_held(&p->pi_lock); 1844 + /* 1845 + * This here violates the locking rules for affinity, since we're only 1846 + * supposed to change these variables while holding both rq->lock and 1847 + * p->pi_lock. 1848 + * 1849 + * HOWEVER, it magically works, because ttwu() is the only code that 1850 + * accesses these variables under p->pi_lock and only does so after 1851 + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() 1852 + * before finish_task(). 1853 + * 1854 + * XXX do further audits, this smells like something putrid. 
1855 + */ 1856 + if (flags & SCA_MIGRATE_DISABLE) 1857 + SCHED_WARN_ON(!p->on_cpu); 1858 + else 1859 + lockdep_assert_held(&p->pi_lock); 2060 1860 2061 1861 queued = task_on_rq_queued(p); 2062 1862 running = task_current(rq, p); ··· 2093 1851 if (running) 2094 1852 put_prev_task(rq, p); 2095 1853 2096 - p->sched_class->set_cpus_allowed(p, new_mask); 1854 + p->sched_class->set_cpus_allowed(p, new_mask, flags); 2097 1855 2098 1856 if (queued) 2099 1857 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 2100 1858 if (running) 2101 1859 set_next_task(rq, p); 1860 + } 1861 + 1862 + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1863 + { 1864 + __do_set_cpus_allowed(p, new_mask, 0); 1865 + } 1866 + 1867 + /* 1868 + * This function is wildly self concurrent; here be dragons. 1869 + * 1870 + * 1871 + * When given a valid mask, __set_cpus_allowed_ptr() must block until the 1872 + * designated task is enqueued on an allowed CPU. If that task is currently 1873 + * running, we have to kick it out using the CPU stopper. 1874 + * 1875 + * Migrate-Disable comes along and tramples all over our nice sandcastle. 1876 + * Consider: 1877 + * 1878 + * Initial conditions: P0->cpus_mask = [0, 1] 1879 + * 1880 + * P0@CPU0 P1 1881 + * 1882 + * migrate_disable(); 1883 + * <preempted> 1884 + * set_cpus_allowed_ptr(P0, [1]); 1885 + * 1886 + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes 1887 + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). 1888 + * This means we need the following scheme: 1889 + * 1890 + * P0@CPU0 P1 1891 + * 1892 + * migrate_disable(); 1893 + * <preempted> 1894 + * set_cpus_allowed_ptr(P0, [1]); 1895 + * <blocks> 1896 + * <resumes> 1897 + * migrate_enable(); 1898 + * __set_cpus_allowed_ptr(); 1899 + * <wakes local stopper> 1900 + * `--> <woken on migration completion> 1901 + * 1902 + * Now the fun stuff: there may be several P1-like tasks, i.e. 
multiple 1903 + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any 1904 + * task p are serialized by p->pi_lock, which we can leverage: the one that 1905 + * should come into effect at the end of the Migrate-Disable region is the last 1906 + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), 1907 + * but we still need to properly signal those waiting tasks at the appropriate 1908 + * moment. 1909 + * 1910 + * This is implemented using struct set_affinity_pending. The first 1911 + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will 1912 + * setup an instance of that struct and install it on the targeted task_struct. 1913 + * Any and all further callers will reuse that instance. Those then wait for 1914 + * a completion signaled at the tail of the CPU stopper callback (1), triggered 1915 + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). 1916 + * 1917 + * 1918 + * (1) In the cases covered above. There is one more where the completion is 1919 + * signaled within affine_move_task() itself: when a subsequent affinity request 1920 + * cancels the need for an active migration. Consider: 1921 + * 1922 + * Initial conditions: P0->cpus_mask = [0, 1] 1923 + * 1924 + * P0@CPU0 P1 P2 1925 + * 1926 + * migrate_disable(); 1927 + * <preempted> 1928 + * set_cpus_allowed_ptr(P0, [1]); 1929 + * <blocks> 1930 + * set_cpus_allowed_ptr(P0, [0, 1]); 1931 + * <signal completion> 1932 + * <awakes> 1933 + * 1934 + * Note that the above is safe vs a concurrent migrate_enable(), as any 1935 + * pending affinity completion is preceded by an uninstallation of 1936 + * p->migration_pending done with p->pi_lock held. 
1937 + */ 1938 + static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, 1939 + int dest_cpu, unsigned int flags) 1940 + { 1941 + struct set_affinity_pending my_pending = { }, *pending = NULL; 1942 + struct migration_arg arg = { 1943 + .task = p, 1944 + .dest_cpu = dest_cpu, 1945 + }; 1946 + bool complete = false; 1947 + 1948 + /* Can the task run on the task's current CPU? If so, we're done */ 1949 + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { 1950 + struct task_struct *push_task = NULL; 1951 + 1952 + if ((flags & SCA_MIGRATE_ENABLE) && 1953 + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { 1954 + rq->push_busy = true; 1955 + push_task = get_task_struct(p); 1956 + } 1957 + 1958 + pending = p->migration_pending; 1959 + if (pending) { 1960 + refcount_inc(&pending->refs); 1961 + p->migration_pending = NULL; 1962 + complete = true; 1963 + } 1964 + task_rq_unlock(rq, p, rf); 1965 + 1966 + if (push_task) { 1967 + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 1968 + p, &rq->push_work); 1969 + } 1970 + 1971 + if (complete) 1972 + goto do_complete; 1973 + 1974 + return 0; 1975 + } 1976 + 1977 + if (!(flags & SCA_MIGRATE_ENABLE)) { 1978 + /* serialized by p->pi_lock */ 1979 + if (!p->migration_pending) { 1980 + /* Install the request */ 1981 + refcount_set(&my_pending.refs, 1); 1982 + init_completion(&my_pending.done); 1983 + p->migration_pending = &my_pending; 1984 + } else { 1985 + pending = p->migration_pending; 1986 + refcount_inc(&pending->refs); 1987 + } 1988 + } 1989 + pending = p->migration_pending; 1990 + /* 1991 + * - !MIGRATE_ENABLE: 1992 + * we'll have installed a pending if there wasn't one already. 1993 + * 1994 + * - MIGRATE_ENABLE: 1995 + * we're here because the current CPU isn't matching anymore, 1996 + * the only way that can happen is because of a concurrent 1997 + * set_cpus_allowed_ptr() call, which should then still be 1998 + * pending completion. 
1999 + * 2000 + * Either way, we really should have a @pending here. 2001 + */ 2002 + if (WARN_ON_ONCE(!pending)) { 2003 + task_rq_unlock(rq, p, rf); 2004 + return -EINVAL; 2005 + } 2006 + 2007 + if (flags & SCA_MIGRATE_ENABLE) { 2008 + 2009 + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ 2010 + p->migration_flags &= ~MDF_PUSH; 2011 + task_rq_unlock(rq, p, rf); 2012 + 2013 + pending->arg = (struct migration_arg) { 2014 + .task = p, 2015 + .dest_cpu = -1, 2016 + .pending = pending, 2017 + }; 2018 + 2019 + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, 2020 + &pending->arg, &pending->stop_work); 2021 + 2022 + return 0; 2023 + } 2024 + 2025 + if (task_running(rq, p) || p->state == TASK_WAKING) { 2026 + /* 2027 + * Lessen races (and headaches) by delegating 2028 + * is_migration_disabled(p) checks to the stopper, which will 2029 + * run on the same CPU as said p. 2030 + */ 2031 + task_rq_unlock(rq, p, rf); 2032 + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 2033 + 2034 + } else { 2035 + 2036 + if (!is_migration_disabled(p)) { 2037 + if (task_on_rq_queued(p)) 2038 + rq = move_queued_task(rq, rf, p, dest_cpu); 2039 + 2040 + p->migration_pending = NULL; 2041 + complete = true; 2042 + } 2043 + task_rq_unlock(rq, p, rf); 2044 + 2045 + do_complete: 2046 + if (complete) 2047 + complete_all(&pending->done); 2048 + } 2049 + 2050 + wait_for_completion(&pending->done); 2051 + 2052 + if (refcount_dec_and_test(&pending->refs)) 2053 + wake_up_var(&pending->refs); 2054 + 2055 + /* 2056 + * Block the original owner of &pending until all subsequent callers 2057 + * have seen the completion and decremented the refcount 2058 + */ 2059 + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); 2060 + 2061 + return 0; 2102 2062 } 2103 2063 2104 2064 /* ··· 2313 1869 * call is not atomic; no spinlocks may be held. 
2314 1870 */ 2315 1871 static int __set_cpus_allowed_ptr(struct task_struct *p, 2316 - const struct cpumask *new_mask, bool check) 1872 + const struct cpumask *new_mask, 1873 + u32 flags) 2317 1874 { 2318 1875 const struct cpumask *cpu_valid_mask = cpu_active_mask; 2319 1876 unsigned int dest_cpu; ··· 2325 1880 rq = task_rq_lock(p, &rf); 2326 1881 update_rq_clock(rq); 2327 1882 2328 - if (p->flags & PF_KTHREAD) { 1883 + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { 2329 1884 /* 2330 - * Kernel threads are allowed on online && !active CPUs 1885 + * Kernel threads are allowed on online && !active CPUs. 1886 + * 1887 + * Specifically, migration_disabled() tasks must not fail the 1888 + * cpumask_any_and_distribute() pick below, esp. so on 1889 + * SCA_MIGRATE_ENABLE, otherwise we'll not call 1890 + * set_cpus_allowed_common() and actually reset p->cpus_ptr. 2331 1891 */ 2332 1892 cpu_valid_mask = cpu_online_mask; 2333 1893 } ··· 2341 1891 * Must re-check here, to close a race against __kthread_bind(), 2342 1892 * sched_setaffinity() is not guaranteed to observe the flag. 
2343 1893 */ 2344 - if (check && (p->flags & PF_NO_SETAFFINITY)) { 1894 + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { 2345 1895 ret = -EINVAL; 2346 1896 goto out; 2347 1897 } 2348 1898 2349 - if (cpumask_equal(&p->cpus_mask, new_mask)) 2350 - goto out; 1899 + if (!(flags & SCA_MIGRATE_ENABLE)) { 1900 + if (cpumask_equal(&p->cpus_mask, new_mask)) 1901 + goto out; 1902 + 1903 + if (WARN_ON_ONCE(p == current && 1904 + is_migration_disabled(p) && 1905 + !cpumask_test_cpu(task_cpu(p), new_mask))) { 1906 + ret = -EBUSY; 1907 + goto out; 1908 + } 1909 + } 2351 1910 2352 1911 /* 2353 1912 * Picking a ~random cpu helps in cases where we are changing affinity ··· 2369 1910 goto out; 2370 1911 } 2371 1912 2372 - do_set_cpus_allowed(p, new_mask); 1913 + __do_set_cpus_allowed(p, new_mask, flags); 2373 1914 2374 1915 if (p->flags & PF_KTHREAD) { 2375 1916 /* ··· 2381 1922 p->nr_cpus_allowed != 1); 2382 1923 } 2383 1924 2384 - /* Can the task run on the task's current CPU? If so, we're done */ 2385 - if (cpumask_test_cpu(task_cpu(p), new_mask)) 2386 - goto out; 1925 + return affine_move_task(rq, p, &rf, dest_cpu, flags); 2387 1926 2388 - if (task_running(rq, p) || p->state == TASK_WAKING) { 2389 - struct migration_arg arg = { p, dest_cpu }; 2390 - /* Need help from migration thread: drop lock and wait. */ 2391 - task_rq_unlock(rq, p, &rf); 2392 - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 2393 - return 0; 2394 - } else if (task_on_rq_queued(p)) { 2395 - /* 2396 - * OK, since we're going to drop the lock immediately 2397 - * afterwards anyway. 
2398 - */ 2399 - rq = move_queued_task(rq, &rf, p, dest_cpu); 2400 - } 2401 1927 out: 2402 1928 task_rq_unlock(rq, p, &rf); 2403 1929 ··· 2391 1947 2392 1948 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 2393 1949 { 2394 - return __set_cpus_allowed_ptr(p, new_mask, false); 1950 + return __set_cpus_allowed_ptr(p, new_mask, 0); 2395 1951 } 2396 1952 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 2397 1953 ··· 2432 1988 * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 2433 1989 */ 2434 1990 WARN_ON_ONCE(!cpu_online(new_cpu)); 1991 + 1992 + WARN_ON_ONCE(is_migration_disabled(p)); 2435 1993 #endif 2436 1994 2437 1995 trace_sched_migrate_task(p, new_cpu); ··· 2764 2318 } 2765 2319 fallthrough; 2766 2320 case possible: 2321 + /* 2322 + * XXX When called from select_task_rq() we only 2323 + * hold p->pi_lock and again violate locking order. 2324 + * 2325 + * More yuck to audit. 2326 + */ 2767 2327 do_set_cpus_allowed(p, cpu_possible_mask); 2768 2328 state = fail; 2769 2329 break; ··· 2800 2348 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 
2801 2349 */ 2802 2350 static inline 2803 - int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 2351 + int select_task_rq(struct task_struct *p, int cpu, int wake_flags) 2804 2352 { 2805 2353 lockdep_assert_held(&p->pi_lock); 2806 2354 2807 - if (p->nr_cpus_allowed > 1) 2808 - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 2355 + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) 2356 + cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); 2809 2357 else 2810 2358 cpu = cpumask_any(p->cpus_ptr); 2811 2359 ··· 2827 2375 2828 2376 void sched_set_stop_task(int cpu, struct task_struct *stop) 2829 2377 { 2378 + static struct lock_class_key stop_pi_lock; 2830 2379 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2831 2380 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2832 2381 ··· 2843 2390 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 2844 2391 2845 2392 stop->sched_class = &stop_sched_class; 2393 + 2394 + /* 2395 + * The PI code calls rt_mutex_setprio() with ->pi_lock held to 2396 + * adjust the effective priority of a task. As a result, 2397 + * rt_mutex_setprio() can trigger (RT) balancing operations, 2398 + * which can then trigger wakeups of the stop thread to push 2399 + * around the current task. 2400 + * 2401 + * The stop task itself will never be part of the PI-chain, it 2402 + * never blocks, therefore that ->pi_lock recursion is safe. 2403 + * Tell lockdep about this by placing the stop->pi_lock in its 2404 + * own class. 
2405 + */ 2406 + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); 2846 2407 } 2847 2408 2848 2409 cpu_rq(cpu)->stop = stop; ··· 2870 2403 } 2871 2404 } 2872 2405 2873 - #else 2406 + #else /* CONFIG_SMP */ 2874 2407 2875 2408 static inline int __set_cpus_allowed_ptr(struct task_struct *p, 2876 - const struct cpumask *new_mask, bool check) 2409 + const struct cpumask *new_mask, 2410 + u32 flags) 2877 2411 { 2878 2412 return set_cpus_allowed_ptr(p, new_mask); 2879 2413 } 2880 2414 2881 - #endif /* CONFIG_SMP */ 2415 + static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } 2416 + 2417 + static inline bool rq_has_pinned_tasks(struct rq *rq) 2418 + { 2419 + return false; 2420 + } 2421 + 2422 + #endif /* !CONFIG_SMP */ 2882 2423 2883 2424 static void 2884 2425 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ··· 2940 2465 #ifdef CONFIG_SMP 2941 2466 if (p->sched_class->task_woken) { 2942 2467 /* 2943 - * Our task @p is fully woken up and running; so its safe to 2468 + * Our task @p is fully woken up and running; so it's safe to 2944 2469 * drop the rq->lock, hereafter rq is only used for statistics. 2945 2470 */ 2946 2471 rq_unpin_lock(rq, rf); ··· 3427 2952 3428 2953 /* 3429 2954 * If the owning (remote) CPU is still in the middle of schedule() with 3430 - * this task as prev, wait until its done referencing the task. 2955 + * this task as prev, wait until it's done referencing the task. 3431 2956 * 3432 2957 * Pairs with the smp_store_release() in finish_task(). 
3433 2958 * ··· 3436 2961 */ 3437 2962 smp_cond_load_acquire(&p->on_cpu, !VAL); 3438 2963 3439 - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 2964 + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); 3440 2965 if (task_cpu(p) != cpu) { 3441 2966 if (p->in_iowait) { 3442 2967 delayacct_blkio_end(p); ··· 3578 3103 init_numa_balancing(clone_flags, p); 3579 3104 #ifdef CONFIG_SMP 3580 3105 p->wake_entry.u_flags = CSD_TYPE_TTWU; 3106 + p->migration_pending = NULL; 3581 3107 #endif 3582 3108 } 3583 3109 ··· 3825 3349 */ 3826 3350 p->recent_used_cpu = task_cpu(p); 3827 3351 rseq_migrate(p); 3828 - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 3352 + __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); 3829 3353 #endif 3830 3354 rq = __task_rq_lock(p, &rf); 3831 3355 update_rq_clock(rq); ··· 3837 3361 #ifdef CONFIG_SMP 3838 3362 if (p->sched_class->task_woken) { 3839 3363 /* 3840 - * Nothing relies on rq->lock after this, so its fine to 3364 + * Nothing relies on rq->lock after this, so it's fine to 3841 3365 * drop it. 
3842 3366 */ 3843 3367 rq_unpin_lock(rq, &rf); ··· 3966 3490 #endif 3967 3491 } 3968 3492 3493 + #ifdef CONFIG_SMP 3494 + 3495 + static void do_balance_callbacks(struct rq *rq, struct callback_head *head) 3496 + { 3497 + void (*func)(struct rq *rq); 3498 + struct callback_head *next; 3499 + 3500 + lockdep_assert_held(&rq->lock); 3501 + 3502 + while (head) { 3503 + func = (void (*)(struct rq *))head->func; 3504 + next = head->next; 3505 + head->next = NULL; 3506 + head = next; 3507 + 3508 + func(rq); 3509 + } 3510 + } 3511 + 3512 + static inline struct callback_head *splice_balance_callbacks(struct rq *rq) 3513 + { 3514 + struct callback_head *head = rq->balance_callback; 3515 + 3516 + lockdep_assert_held(&rq->lock); 3517 + if (head) { 3518 + rq->balance_callback = NULL; 3519 + rq->balance_flags &= ~BALANCE_WORK; 3520 + } 3521 + 3522 + return head; 3523 + } 3524 + 3525 + static void __balance_callbacks(struct rq *rq) 3526 + { 3527 + do_balance_callbacks(rq, splice_balance_callbacks(rq)); 3528 + } 3529 + 3530 + static inline void balance_callbacks(struct rq *rq, struct callback_head *head) 3531 + { 3532 + unsigned long flags; 3533 + 3534 + if (unlikely(head)) { 3535 + raw_spin_lock_irqsave(&rq->lock, flags); 3536 + do_balance_callbacks(rq, head); 3537 + raw_spin_unlock_irqrestore(&rq->lock, flags); 3538 + } 3539 + } 3540 + 3541 + static void balance_push(struct rq *rq); 3542 + 3543 + static inline void balance_switch(struct rq *rq) 3544 + { 3545 + if (likely(!rq->balance_flags)) 3546 + return; 3547 + 3548 + if (rq->balance_flags & BALANCE_PUSH) { 3549 + balance_push(rq); 3550 + return; 3551 + } 3552 + 3553 + __balance_callbacks(rq); 3554 + } 3555 + 3556 + #else 3557 + 3558 + static inline void __balance_callbacks(struct rq *rq) 3559 + { 3560 + } 3561 + 3562 + static inline struct callback_head *splice_balance_callbacks(struct rq *rq) 3563 + { 3564 + return NULL; 3565 + } 3566 + 3567 + static inline void balance_callbacks(struct rq *rq, struct callback_head *head) 
3568 + { 3569 + } 3570 + 3571 + static inline void balance_switch(struct rq *rq) 3572 + { 3573 + } 3574 + 3575 + #endif 3576 + 3969 3577 static inline void 3970 3578 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) 3971 3579 { ··· 4075 3515 * prev into current: 4076 3516 */ 4077 3517 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 3518 + balance_switch(rq); 4078 3519 raw_spin_unlock_irq(&rq->lock); 4079 3520 } 4080 3521 ··· 4217 3656 return rq; 4218 3657 } 4219 3658 4220 - #ifdef CONFIG_SMP 4221 - 4222 - /* rq->lock is NOT held, but preemption is disabled */ 4223 - static void __balance_callback(struct rq *rq) 4224 - { 4225 - struct callback_head *head, *next; 4226 - void (*func)(struct rq *rq); 4227 - unsigned long flags; 4228 - 4229 - raw_spin_lock_irqsave(&rq->lock, flags); 4230 - head = rq->balance_callback; 4231 - rq->balance_callback = NULL; 4232 - while (head) { 4233 - func = (void (*)(struct rq *))head->func; 4234 - next = head->next; 4235 - head->next = NULL; 4236 - head = next; 4237 - 4238 - func(rq); 4239 - } 4240 - raw_spin_unlock_irqrestore(&rq->lock, flags); 4241 - } 4242 - 4243 - static inline void balance_callback(struct rq *rq) 4244 - { 4245 - if (unlikely(rq->balance_callback)) 4246 - __balance_callback(rq); 4247 - } 4248 - 4249 - #else 4250 - 4251 - static inline void balance_callback(struct rq *rq) 4252 - { 4253 - } 4254 - 4255 - #endif 4256 - 4257 3659 /** 4258 3660 * schedule_tail - first thing a freshly forked thread must call. 4259 3661 * @prev: the thread we just switched away from. ··· 4236 3712 */ 4237 3713 4238 3714 rq = finish_task_switch(prev); 4239 - balance_callback(rq); 4240 3715 preempt_enable(); 4241 3716 4242 3717 if (current->set_child_tid) ··· 4364 3841 } 4365 3842 4366 3843 /* 4367 - * IO-wait accounting, and how its mostly bollocks (on SMP). 3844 + * IO-wait accounting, and how it's mostly bollocks (on SMP). 
4368 3845 * 4369 3846 * The idea behind IO-wait account is to account the idle time that we could 4370 3847 * have spend running if it were not for IO. That is, if we were to improve the ··· 4416 3893 int dest_cpu; 4417 3894 4418 3895 raw_spin_lock_irqsave(&p->pi_lock, flags); 4419 - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 3896 + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); 4420 3897 if (dest_cpu == smp_processor_id()) 4421 3898 goto unlock; 4422 3899 ··· 4860 4337 /* 4861 4338 * Optimization: we know that if all tasks are in the fair class we can 4862 4339 * call that function directly, but only if the @prev task wasn't of a 4863 - * higher scheduling class, because otherwise those loose the 4340 + * higher scheduling class, because otherwise those lose the 4864 4341 * opportunity to pull in more work from other CPUs. 4865 4342 */ 4866 4343 if (likely(prev->sched_class <= &fair_sched_class && ··· 5044 4521 */ 5045 4522 ++*switch_count; 5046 4523 4524 + migrate_disable_switch(rq, prev); 5047 4525 psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 5048 4526 5049 4527 trace_sched_switch(preempt, prev, next); ··· 5053 4529 rq = context_switch(rq, prev, next, &rf); 5054 4530 } else { 5055 4531 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 5056 - rq_unlock_irq(rq, &rf); 5057 - } 5058 4532 5059 - balance_callback(rq); 4533 + rq_unpin_lock(rq, &rf); 4534 + __balance_callbacks(rq); 4535 + raw_spin_unlock_irq(&rq->lock); 4536 + } 5060 4537 } 5061 4538 5062 4539 void __noreturn do_task_dead(void) ··· 5383 4858 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to 5384 4859 * ensure a task is de-boosted (pi_task is set to NULL) before the 5385 4860 * task is allowed to run again (and can exit). This ensures the pointer 5386 - * points to a blocked task -- which guaratees the task is present. 4861 + * points to a blocked task -- which guarantees the task is present. 
5387 4862 */ 5388 4863 p->pi_top_task = pi_task; 5389 4864 ··· 5469 4944 out_unlock: 5470 4945 /* Avoid rq from going away on us: */ 5471 4946 preempt_disable(); 5472 - __task_rq_unlock(rq, &rf); 5473 4947 5474 - balance_callback(rq); 4948 + rq_unpin_lock(rq, &rf); 4949 + __balance_callbacks(rq); 4950 + raw_spin_unlock(&rq->lock); 4951 + 5475 4952 preempt_enable(); 5476 4953 } 5477 4954 #else ··· 5502 4975 /* 5503 4976 * The RT priorities are set via sched_setscheduler(), but we still 5504 4977 * allow the 'normal' nice value to be set - but as expected 5505 - * it wont have any effect on scheduling until the task is 4978 + * it won't have any effect on scheduling until the task is 5506 4979 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 5507 4980 */ 5508 4981 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { ··· 5747 5220 int retval, oldprio, oldpolicy = -1, queued, running; 5748 5221 int new_effective_prio, policy = attr->sched_policy; 5749 5222 const struct sched_class *prev_class; 5223 + struct callback_head *head; 5750 5224 struct rq_flags rf; 5751 5225 int reset_on_fork; 5752 5226 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; ··· 5986 5458 5987 5459 /* Avoid rq from going away on us: */ 5988 5460 preempt_disable(); 5461 + head = splice_balance_callbacks(rq); 5989 5462 task_rq_unlock(rq, p, &rf); 5990 5463 5991 5464 if (pi) { ··· 5995 5466 } 5996 5467 5997 5468 /* Run balance callbacks after we've adjusted the PI chain: */ 5998 - balance_callback(rq); 5469 + balance_callbacks(rq, head); 5999 5470 preempt_enable(); 6000 5471 6001 5472 return 0; ··· 6490 5961 } 6491 5962 #endif 6492 5963 again: 6493 - retval = __set_cpus_allowed_ptr(p, new_mask, true); 5964 + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); 6494 5965 6495 5966 if (!retval) { 6496 5967 cpuset_cpus_allowed(p, cpus_allowed); ··· 6612 6083 return ret; 6613 6084 } 6614 6085 6615 - /** 6616 - * sys_sched_yield - yield the current processor to other threads. 
6617 - * 6618 - * This function yields the current CPU to other tasks. If there are no 6619 - * other threads running on this CPU then this function will return. 6620 - * 6621 - * Return: 0. 6622 - */ 6623 6086 static void do_sched_yield(void) 6624 6087 { 6625 6088 struct rq_flags rf; ··· 6622 6101 schedstat_inc(rq->yld_count); 6623 6102 current->sched_class->yield_task(rq); 6624 6103 6625 - /* 6626 - * Since we are going to call schedule() anyway, there's 6627 - * no need to preempt or enable interrupts: 6628 - */ 6629 6104 preempt_disable(); 6630 - rq_unlock(rq, &rf); 6105 + rq_unlock_irq(rq, &rf); 6631 6106 sched_preempt_enable_no_resched(); 6632 6107 6633 6108 schedule(); 6634 6109 } 6635 6110 6111 + /** 6112 + * sys_sched_yield - yield the current processor to other threads. 6113 + * 6114 + * This function yields the current CPU to other tasks. If there are no 6115 + * other threads running on this CPU then this function will return. 6116 + * 6117 + * Return: 0. 6118 + */ 6636 6119 SYSCALL_DEFINE0(sched_yield) 6637 6120 { 6638 6121 do_sched_yield(); ··· 6691 6166 * 6692 6167 * The scheduler is at all times free to pick the calling task as the most 6693 6168 * eligible task to run, if removing the yield() call from your code breaks 6694 - * it, its already broken. 6169 + * it, it's already broken. 6695 6170 * 6696 6171 * Typical broken usage is: 6697 6172 * ··· 6979 6454 (unsigned long)task_thread_info(p)->flags); 6980 6455 6981 6456 print_worker_info(KERN_INFO, p); 6457 + print_stop_info(KERN_INFO, p); 6982 6458 show_stack(p, NULL, KERN_INFO); 6983 6459 put_task_stack(p); 6984 6460 } ··· 7065 6539 7066 6540 #ifdef CONFIG_SMP 7067 6541 /* 7068 - * Its possible that init_idle() gets called multiple times on a task, 6542 + * It's possible that init_idle() gets called multiple times on a task, 7069 6543 * in that case do_set_cpus_allowed() will not do the right thing. 7070 6544 * 7071 6545 * And since this is boot we can forgo the serialization. 
7072 6546 */ 7073 - set_cpus_allowed_common(idle, cpumask_of(cpu)); 6547 + set_cpus_allowed_common(idle, cpumask_of(cpu), 0); 7074 6548 #endif 7075 6549 /* 7076 6550 * We're having a chicken and egg problem, even though we are ··· 7221 6695 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ 7222 6696 } 7223 6697 7224 - /* 7225 - * Since this CPU is going 'away' for a while, fold any nr_active delta 7226 - * we might have. Assumes we're called after migrate_tasks() so that the 7227 - * nr_active count is stable. We need to take the teardown thread which 7228 - * is calling this into account, so we hand in adjust = 1 to the load 7229 - * calculation. 7230 - * 7231 - * Also see the comment "Global load-average calculations". 7232 - */ 7233 - static void calc_load_migrate(struct rq *rq) 6698 + static int __balance_push_cpu_stop(void *arg) 7234 6699 { 7235 - long delta = calc_load_fold_active(rq, 1); 7236 - if (delta) 7237 - atomic_long_add(delta, &calc_load_tasks); 7238 - } 6700 + struct task_struct *p = arg; 6701 + struct rq *rq = this_rq(); 6702 + struct rq_flags rf; 6703 + int cpu; 7239 6704 7240 - static struct task_struct *__pick_migrate_task(struct rq *rq) 7241 - { 7242 - const struct sched_class *class; 7243 - struct task_struct *next; 6705 + raw_spin_lock_irq(&p->pi_lock); 6706 + rq_lock(rq, &rf); 7244 6707 7245 - for_each_class(class) { 7246 - next = class->pick_next_task(rq); 7247 - if (next) { 7248 - next->sched_class->put_prev_task(rq, next); 7249 - return next; 7250 - } 7251 - } 7252 - 7253 - /* The idle class should always have a runnable task */ 7254 - BUG(); 7255 - } 7256 - 7257 - /* 7258 - * Migrate all tasks from the rq, sleeping tasks will be migrated by 7259 - * try_to_wake_up()->select_task_rq(). 7260 - * 7261 - * Called with rq->lock held even though we'er in stop_machine() and 7262 - * there's no concurrency possible, we hold the required locks anyway 7263 - * because of lock validation efforts. 
7264 - */ 7265 - static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) 7266 - { 7267 - struct rq *rq = dead_rq; 7268 - struct task_struct *next, *stop = rq->stop; 7269 - struct rq_flags orf = *rf; 7270 - int dest_cpu; 7271 - 7272 - /* 7273 - * Fudge the rq selection such that the below task selection loop 7274 - * doesn't get stuck on the currently eligible stop task. 7275 - * 7276 - * We're currently inside stop_machine() and the rq is either stuck 7277 - * in the stop_machine_cpu_stop() loop, or we're executing this code, 7278 - * either way we should never end up calling schedule() until we're 7279 - * done here. 7280 - */ 7281 - rq->stop = NULL; 7282 - 7283 - /* 7284 - * put_prev_task() and pick_next_task() sched 7285 - * class method both need to have an up-to-date 7286 - * value of rq->clock[_task] 7287 - */ 7288 6708 update_rq_clock(rq); 7289 6709 7290 - for (;;) { 7291 - /* 7292 - * There's this thread running, bail when that's the only 7293 - * remaining thread: 7294 - */ 7295 - if (rq->nr_running == 1) 7296 - break; 7297 - 7298 - next = __pick_migrate_task(rq); 7299 - 7300 - /* 7301 - * Rules for changing task_struct::cpus_mask are holding 7302 - * both pi_lock and rq->lock, such that holding either 7303 - * stabilizes the mask. 7304 - * 7305 - * Drop rq->lock is not quite as disastrous as it usually is 7306 - * because !cpu_active at this point, which means load-balance 7307 - * will not interfere. Also, stop-machine. 7308 - */ 7309 - rq_unlock(rq, rf); 7310 - raw_spin_lock(&next->pi_lock); 7311 - rq_relock(rq, rf); 7312 - 7313 - /* 7314 - * Since we're inside stop-machine, _nothing_ should have 7315 - * changed the task, WARN if weird stuff happened, because in 7316 - * that case the above rq->lock drop is a fail too. 7317 - */ 7318 - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { 7319 - raw_spin_unlock(&next->pi_lock); 7320 - continue; 7321 - } 7322 - 7323 - /* Find suitable destination for @next, with force if needed. 
*/ 7324 - dest_cpu = select_fallback_rq(dead_rq->cpu, next); 7325 - rq = __migrate_task(rq, rf, next, dest_cpu); 7326 - if (rq != dead_rq) { 7327 - rq_unlock(rq, rf); 7328 - rq = dead_rq; 7329 - *rf = orf; 7330 - rq_relock(rq, rf); 7331 - } 7332 - raw_spin_unlock(&next->pi_lock); 6710 + if (task_rq(p) == rq && task_on_rq_queued(p)) { 6711 + cpu = select_fallback_rq(rq->cpu, p); 6712 + rq = __migrate_task(rq, &rf, p, cpu); 7333 6713 } 7334 6714 7335 - rq->stop = stop; 6715 + rq_unlock(rq, &rf); 6716 + raw_spin_unlock_irq(&p->pi_lock); 6717 + 6718 + put_task_struct(p); 6719 + 6720 + return 0; 7336 6721 } 6722 + 6723 + static DEFINE_PER_CPU(struct cpu_stop_work, push_work); 6724 + 6725 + /* 6726 + * Ensure we only run per-cpu kthreads once the CPU goes !active. 6727 + */ 6728 + static void balance_push(struct rq *rq) 6729 + { 6730 + struct task_struct *push_task = rq->curr; 6731 + 6732 + lockdep_assert_held(&rq->lock); 6733 + SCHED_WARN_ON(rq->cpu != smp_processor_id()); 6734 + 6735 + /* 6736 + * Both the cpu-hotplug and stop task are in this case and are 6737 + * required to complete the hotplug process. 6738 + */ 6739 + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { 6740 + /* 6741 + * If this is the idle task on the outgoing CPU try to wake 6742 + * up the hotplug control thread which might wait for the 6743 + * last task to vanish. The rcuwait_active() check is 6744 + * accurate here because the waiter is pinned on this CPU 6745 + * and can't obviously be running in parallel. 6746 + * 6747 + * On RT kernels this also has to check whether there are 6748 + * pinned and scheduled out tasks on the runqueue. They 6749 + * need to leave the migrate disabled section first. 
6750 + */ 6751 + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && 6752 + rcuwait_active(&rq->hotplug_wait)) { 6753 + raw_spin_unlock(&rq->lock); 6754 + rcuwait_wake_up(&rq->hotplug_wait); 6755 + raw_spin_lock(&rq->lock); 6756 + } 6757 + return; 6758 + } 6759 + 6760 + get_task_struct(push_task); 6761 + /* 6762 + * Temporarily drop rq->lock such that we can wake-up the stop task. 6763 + * Both preemption and IRQs are still disabled. 6764 + */ 6765 + raw_spin_unlock(&rq->lock); 6766 + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, 6767 + this_cpu_ptr(&push_work)); 6768 + /* 6769 + * At this point need_resched() is true and we'll take the loop in 6770 + * schedule(). The next pick is obviously going to be the stop task 6771 + * which is_per_cpu_kthread() and will push this task away. 6772 + */ 6773 + raw_spin_lock(&rq->lock); 6774 + } 6775 + 6776 + static void balance_push_set(int cpu, bool on) 6777 + { 6778 + struct rq *rq = cpu_rq(cpu); 6779 + struct rq_flags rf; 6780 + 6781 + rq_lock_irqsave(rq, &rf); 6782 + if (on) 6783 + rq->balance_flags |= BALANCE_PUSH; 6784 + else 6785 + rq->balance_flags &= ~BALANCE_PUSH; 6786 + rq_unlock_irqrestore(rq, &rf); 6787 + } 6788 + 6789 + /* 6790 + * Invoked from a CPUs hotplug control thread after the CPU has been marked 6791 + * inactive. All tasks which are not per CPU kernel threads are either 6792 + * pushed off this CPU now via balance_push() or placed on a different CPU 6793 + * during wakeup. Wait until the CPU is quiescent. 
6794 + */ 6795 + static void balance_hotplug_wait(void) 6796 + { 6797 + struct rq *rq = this_rq(); 6798 + 6799 + rcuwait_wait_event(&rq->hotplug_wait, 6800 + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), 6801 + TASK_UNINTERRUPTIBLE); 6802 + } 6803 + 6804 + #else 6805 + 6806 + static inline void balance_push(struct rq *rq) 6807 + { 6808 + } 6809 + 6810 + static inline void balance_push_set(int cpu, bool on) 6811 + { 6812 + } 6813 + 6814 + static inline void balance_hotplug_wait(void) 6815 + { 6816 + } 6817 + 7337 6818 #endif /* CONFIG_HOTPLUG_CPU */ 7338 6819 7339 6820 void set_rq_online(struct rq *rq) ··· 7426 6893 struct rq *rq = cpu_rq(cpu); 7427 6894 struct rq_flags rf; 7428 6895 6896 + balance_push_set(cpu, false); 6897 + 7429 6898 #ifdef CONFIG_SCHED_SMT 7430 6899 /* 7431 6900 * When going up, increment the number of cores with SMT present. ··· 7463 6928 7464 6929 int sched_cpu_deactivate(unsigned int cpu) 7465 6930 { 6931 + struct rq *rq = cpu_rq(cpu); 6932 + struct rq_flags rf; 7466 6933 int ret; 7467 6934 7468 6935 set_cpu_active(cpu, false); ··· 7476 6939 * Do sync before park smpboot threads to take care the rcu boost case. 7477 6940 */ 7478 6941 synchronize_rcu(); 6942 + 6943 + balance_push_set(cpu, true); 6944 + 6945 + rq_lock_irqsave(rq, &rf); 6946 + if (rq->rd) { 6947 + update_rq_clock(rq); 6948 + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6949 + set_rq_offline(rq); 6950 + } 6951 + rq_unlock_irqrestore(rq, &rf); 7479 6952 7480 6953 #ifdef CONFIG_SCHED_SMT 7481 6954 /* ··· 7500 6953 7501 6954 ret = cpuset_cpu_inactive(cpu); 7502 6955 if (ret) { 6956 + balance_push_set(cpu, false); 7503 6957 set_cpu_active(cpu, true); 7504 6958 return ret; 7505 6959 } ··· 7524 6976 } 7525 6977 7526 6978 #ifdef CONFIG_HOTPLUG_CPU 6979 + 6980 + /* 6981 + * Invoked immediately before the stopper thread is invoked to bring the 6982 + * CPU down completely. 
At this point all per CPU kthreads except the 6983 + * hotplug thread (current) and the stopper thread (inactive) have been 6984 + * either parked or have been unbound from the outgoing CPU. Ensure that 6985 + * any of those which might be on the way out are gone. 6986 + * 6987 + * If after this point a bound task is being woken on this CPU then the 6988 + * responsible hotplug callback has failed to do it's job. 6989 + * sched_cpu_dying() will catch it with the appropriate fireworks. 6990 + */ 6991 + int sched_cpu_wait_empty(unsigned int cpu) 6992 + { 6993 + balance_hotplug_wait(); 6994 + return 0; 6995 + } 6996 + 6997 + /* 6998 + * Since this CPU is going 'away' for a while, fold any nr_active delta we 6999 + * might have. Called from the CPU stopper task after ensuring that the 7000 + * stopper is the last running task on the CPU, so nr_active count is 7001 + * stable. We need to take the teardown thread which is calling this into 7002 + * account, so we hand in adjust = 1 to the load calculation. 7003 + * 7004 + * Also see the comment "Global load-average calculations". 
7005 + */ 7006 + static void calc_load_migrate(struct rq *rq) 7007 + { 7008 + long delta = calc_load_fold_active(rq, 1); 7009 + 7010 + if (delta) 7011 + atomic_long_add(delta, &calc_load_tasks); 7012 + } 7013 + 7527 7014 int sched_cpu_dying(unsigned int cpu) 7528 7015 { 7529 7016 struct rq *rq = cpu_rq(cpu); ··· 7568 6985 sched_tick_stop(cpu); 7569 6986 7570 6987 rq_lock_irqsave(rq, &rf); 7571 - if (rq->rd) { 7572 - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7573 - set_rq_offline(rq); 7574 - } 7575 - migrate_tasks(rq, &rf); 7576 - BUG_ON(rq->nr_running != 1); 6988 + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); 7577 6989 rq_unlock_irqrestore(rq, &rf); 7578 6990 7579 6991 calc_load_migrate(rq); ··· 7773 7195 rq->last_blocked_load_update_tick = jiffies; 7774 7196 atomic_set(&rq->nohz_flags, 0); 7775 7197 7776 - rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); 7198 + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); 7199 + #endif 7200 + #ifdef CONFIG_HOTPLUG_CPU 7201 + rcuwait_init(&rq->hotplug_wait); 7777 7202 #endif 7778 7203 #endif /* CONFIG_SMP */ 7779 7204 hrtick_rq_init(rq); ··· 7915 7334 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 7916 7335 } 7917 7336 EXPORT_SYMBOL_GPL(__cant_sleep); 7337 + 7338 + #ifdef CONFIG_SMP 7339 + void __cant_migrate(const char *file, int line) 7340 + { 7341 + static unsigned long prev_jiffy; 7342 + 7343 + if (irqs_disabled()) 7344 + return; 7345 + 7346 + if (is_migration_disabled(current)) 7347 + return; 7348 + 7349 + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) 7350 + return; 7351 + 7352 + if (preempt_count() > 0) 7353 + return; 7354 + 7355 + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7356 + return; 7357 + prev_jiffy = jiffies; 7358 + 7359 + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); 7360 + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", 7361 + in_atomic(), irqs_disabled(), is_migration_disabled(current), 7362 + current->pid, current->comm); 7363 + 7364 
+ debug_show_held_locks(current); 7365 + dump_stack(); 7366 + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 7367 + } 7368 + EXPORT_SYMBOL_GPL(__cant_migrate); 7369 + #endif 7918 7370 #endif 7919 7371 7920 7372 #ifdef CONFIG_MAGIC_SYSRQ ··· 8281 7667 return -EINVAL; 8282 7668 #endif 8283 7669 /* 8284 - * Serialize against wake_up_new_task() such that if its 7670 + * Serialize against wake_up_new_task() such that if it's 8285 7671 * running, we're sure to observe its full state. 8286 7672 */ 8287 7673 raw_spin_lock_irq(&task->pi_lock);
+2 -2
kernel/sched/cpudeadline.c
··· 120 120 const struct sched_dl_entity *dl_se = &p->dl; 121 121 122 122 if (later_mask && 123 - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { 123 + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { 124 124 unsigned long cap, max_cap = 0; 125 125 int cpu, max_cpu = -1; 126 126 ··· 151 151 152 152 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 153 153 154 - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && 154 + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && 155 155 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 156 156 if (later_mask) 157 157 cpumask_set_cpu(best_cpu, later_mask);
+1 -8
kernel/sched/cpufreq_schedutil.c
··· 899 899 cpufreq_governor_init(schedutil_gov); 900 900 901 901 #ifdef CONFIG_ENERGY_MODEL 902 - extern bool sched_energy_update; 903 - extern struct mutex sched_energy_mutex; 904 - 905 902 static void rebuild_sd_workfn(struct work_struct *work) 906 903 { 907 - mutex_lock(&sched_energy_mutex); 908 - sched_energy_update = true; 909 - rebuild_sched_domains(); 910 - sched_energy_update = false; 911 - mutex_unlock(&sched_energy_mutex); 904 + rebuild_sched_domains_energy(); 912 905 } 913 906 static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); 914 907
+38 -14
kernel/sched/cpupri.c
··· 11 11 * This code tracks the priority of each CPU so that global migration 12 12 * decisions are easy to calculate. Each CPU can be in a state as follows: 13 13 * 14 - * (INVALID), IDLE, NORMAL, RT1, ... RT99 14 + * (INVALID), NORMAL, RT1, ... RT99, HIGHER 15 15 * 16 16 * going from the lowest priority to the highest. CPUs in the INVALID state 17 17 * are not eligible for routing. The system maintains this state with ··· 19 19 * in that class). Therefore a typical application without affinity 20 20 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 21 21 * searches). For tasks with affinity restrictions, the algorithm has a 22 - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that 22 + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that 23 23 * yields the worst case search is fairly contrived. 24 24 */ 25 25 #include "sched.h" 26 26 27 - /* Convert between a 140 based task->prio, and our 102 based cpupri */ 27 + /* 28 + * p->rt_priority p->prio newpri cpupri 29 + * 30 + * -1 -1 (CPUPRI_INVALID) 31 + * 32 + * 99 0 (CPUPRI_NORMAL) 33 + * 34 + * 1 98 98 1 35 + * ... 36 + * 49 50 50 49 37 + * 50 49 49 50 38 + * ... 39 + * 99 0 0 99 40 + * 41 + * 100 100 (CPUPRI_HIGHER) 42 + */ 28 43 static int convert_prio(int prio) 29 44 { 30 45 int cpupri; 31 46 32 - if (prio == CPUPRI_INVALID) 33 - cpupri = CPUPRI_INVALID; 34 - else if (prio == MAX_PRIO) 35 - cpupri = CPUPRI_IDLE; 36 - else if (prio >= MAX_RT_PRIO) 37 - cpupri = CPUPRI_NORMAL; 38 - else 39 - cpupri = MAX_RT_PRIO - prio + 1; 47 + switch (prio) { 48 + case CPUPRI_INVALID: 49 + cpupri = CPUPRI_INVALID; /* -1 */ 50 + break; 51 + 52 + case 0 ... 98: 53 + cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 
99 */ 54 + break; 55 + 56 + case MAX_RT_PRIO-1: 57 + cpupri = CPUPRI_NORMAL; /* 0 */ 58 + break; 59 + 60 + case MAX_RT_PRIO: 61 + cpupri = CPUPRI_HIGHER; /* 100 */ 62 + break; 63 + } 40 64 41 65 return cpupri; 42 66 } ··· 97 73 if (skip) 98 74 return 0; 99 75 100 - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) 76 + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) 101 77 return 0; 102 78 103 79 if (lowest_mask) { 104 - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); 80 + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); 105 81 106 82 /* 107 83 * We have to ensure that we have at least one bit ··· 201 177 * cpupri_set - update the CPU priority setting 202 178 * @cp: The cpupri context 203 179 * @cpu: The target CPU 204 - * @newpri: The priority (INVALID-RT99) to assign to this CPU 180 + * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU 205 181 * 206 182 * Note: Assumes cpu_rq(cpu)->lock is locked 207 183 *
+4 -4
kernel/sched/cpupri.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 3 - #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 3 + #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) 4 4 5 5 #define CPUPRI_INVALID -1 6 - #define CPUPRI_IDLE 0 7 - #define CPUPRI_NORMAL 1 8 - /* values 2-101 are RT priorities 0-99 */ 6 + #define CPUPRI_NORMAL 0 7 + /* values 1-99 are for RT1-RT99 priorities */ 8 + #define CPUPRI_HIGHER 100 9 9 10 10 struct cpupri_vec { 11 11 atomic_t count;
+73 -28
kernel/sched/deadline.c
··· 119 119 return __dl_bw_capacity(i); 120 120 } 121 121 } 122 + 123 + static inline bool dl_bw_visited(int cpu, u64 gen) 124 + { 125 + struct root_domain *rd = cpu_rq(cpu)->rd; 126 + 127 + if (rd->visit_gen == gen) 128 + return true; 129 + 130 + rd->visit_gen = gen; 131 + return false; 132 + } 122 133 #else 123 134 static inline struct dl_bw *dl_bw_of(int i) 124 135 { ··· 144 133 static inline unsigned long dl_bw_capacity(int i) 145 134 { 146 135 return SCHED_CAPACITY_SCALE; 136 + } 137 + 138 + static inline bool dl_bw_visited(int cpu, u64 gen) 139 + { 140 + return false; 147 141 } 148 142 #endif 149 143 ··· 581 565 582 566 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) 583 567 { 584 - return dl_task(prev); 568 + return rq->online && dl_task(prev); 585 569 } 586 570 587 571 static DEFINE_PER_CPU(struct callback_head, dl_push_head); ··· 1413 1397 1414 1398 if (dl_rq->earliest_dl.curr == 0 || 1415 1399 dl_time_before(deadline, dl_rq->earliest_dl.curr)) { 1400 + if (dl_rq->earliest_dl.curr == 0) 1401 + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); 1416 1402 dl_rq->earliest_dl.curr = deadline; 1417 1403 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); 1418 1404 } ··· 1432 1414 dl_rq->earliest_dl.curr = 0; 1433 1415 dl_rq->earliest_dl.next = 0; 1434 1416 cpudl_clear(&rq->rd->cpudl, rq->cpu); 1417 + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 1435 1418 } else { 1436 1419 struct rb_node *leftmost = dl_rq->root.rb_leftmost; 1437 1420 struct sched_dl_entity *entry; ··· 1689 1670 static int find_later_rq(struct task_struct *task); 1690 1671 1691 1672 static int 1692 - select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) 1673 + select_task_rq_dl(struct task_struct *p, int cpu, int flags) 1693 1674 { 1694 1675 struct task_struct *curr; 1695 1676 bool select_rq; 1696 1677 struct rq *rq; 1697 1678 1698 - if (sd_flag != SD_BALANCE_WAKE) 1679 + if (!(flags & WF_TTWU)) 1699 1680 goto out; 1700 1681 1701 
1682 rq = cpu_rq(cpu); ··· 1937 1918 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1938 1919 { 1939 1920 if (!task_running(rq, p) && 1940 - cpumask_test_cpu(cpu, p->cpus_ptr)) 1921 + cpumask_test_cpu(cpu, &p->cpus_mask)) 1941 1922 return 1; 1942 1923 return 0; 1943 1924 } ··· 2027 2008 return this_cpu; 2028 2009 } 2029 2010 2030 - best_cpu = cpumask_first_and(later_mask, 2031 - sched_domain_span(sd)); 2011 + best_cpu = cpumask_any_and_distribute(later_mask, 2012 + sched_domain_span(sd)); 2032 2013 /* 2033 2014 * Last chance: if a CPU being in both later_mask 2034 2015 * and current sd span is valid, that becomes our ··· 2050 2031 if (this_cpu != -1) 2051 2032 return this_cpu; 2052 2033 2053 - cpu = cpumask_any(later_mask); 2034 + cpu = cpumask_any_distribute(later_mask); 2054 2035 if (cpu < nr_cpu_ids) 2055 2036 return cpu; 2056 2037 ··· 2087 2068 /* Retry if something changed. */ 2088 2069 if (double_lock_balance(rq, later_rq)) { 2089 2070 if (unlikely(task_rq(task) != rq || 2090 - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || 2071 + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || 2091 2072 task_running(rq, task) || 2092 2073 !dl_task(task) || 2093 2074 !task_on_rq_queued(task))) { ··· 2154 2135 return 0; 2155 2136 2156 2137 retry: 2138 + if (is_migration_disabled(next_task)) 2139 + return 0; 2140 + 2157 2141 if (WARN_ON(next_task == rq->curr)) 2158 2142 return 0; 2159 2143 ··· 2234 2212 static void pull_dl_task(struct rq *this_rq) 2235 2213 { 2236 2214 int this_cpu = this_rq->cpu, cpu; 2237 - struct task_struct *p; 2215 + struct task_struct *p, *push_task; 2238 2216 bool resched = false; 2239 2217 struct rq *src_rq; 2240 2218 u64 dmin = LONG_MAX; ··· 2264 2242 continue; 2265 2243 2266 2244 /* Might drop this_rq->lock */ 2245 + push_task = NULL; 2267 2246 double_lock_balance(this_rq, src_rq); 2268 2247 2269 2248 /* ··· 2296 2273 src_rq->curr->dl.deadline)) 2297 2274 goto skip; 2298 2275 2299 - resched = true; 2300 - 2301 - 
deactivate_task(src_rq, p, 0); 2302 - set_task_cpu(p, this_cpu); 2303 - activate_task(this_rq, p, 0); 2304 - dmin = p->dl.deadline; 2276 + if (is_migration_disabled(p)) { 2277 + push_task = get_push_task(src_rq); 2278 + } else { 2279 + deactivate_task(src_rq, p, 0); 2280 + set_task_cpu(p, this_cpu); 2281 + activate_task(this_rq, p, 0); 2282 + dmin = p->dl.deadline; 2283 + resched = true; 2284 + } 2305 2285 2306 2286 /* Is there any other task even earlier? */ 2307 2287 } 2308 2288 skip: 2309 2289 double_unlock_balance(this_rq, src_rq); 2290 + 2291 + if (push_task) { 2292 + raw_spin_unlock(&this_rq->lock); 2293 + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, 2294 + push_task, &src_rq->push_work); 2295 + raw_spin_lock(&this_rq->lock); 2296 + } 2310 2297 } 2311 2298 2312 2299 if (resched) ··· 2340 2307 } 2341 2308 2342 2309 static void set_cpus_allowed_dl(struct task_struct *p, 2343 - const struct cpumask *new_mask) 2310 + const struct cpumask *new_mask, 2311 + u32 flags) 2344 2312 { 2345 2313 struct root_domain *src_rd; 2346 2314 struct rq *rq; ··· 2370 2336 raw_spin_unlock(&src_dl_b->lock); 2371 2337 } 2372 2338 2373 - set_cpus_allowed_common(p, new_mask); 2339 + set_cpus_allowed_common(p, new_mask, flags); 2374 2340 } 2375 2341 2376 2342 /* Assumes rq->lock is held */ ··· 2543 2509 } 2544 2510 } 2545 2511 2546 - const struct sched_class dl_sched_class 2547 - __section("__dl_sched_class") = { 2512 + DEFINE_SCHED_CLASS(dl) = { 2513 + 2548 2514 .enqueue_task = enqueue_task_dl, 2549 2515 .dequeue_task = dequeue_task_dl, 2550 2516 .yield_task = yield_task_dl, ··· 2563 2529 .rq_online = rq_online_dl, 2564 2530 .rq_offline = rq_offline_dl, 2565 2531 .task_woken = task_woken_dl, 2532 + .find_lock_rq = find_lock_later_rq, 2566 2533 #endif 2567 2534 2568 2535 .task_tick = task_tick_dl, ··· 2576 2541 .update_curr = update_curr_dl, 2577 2542 }; 2578 2543 2544 + /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ 2545 + static u64 dl_generation; 2546 + 
2579 2547 int sched_dl_global_validate(void) 2580 2548 { 2581 2549 u64 runtime = global_rt_runtime(); 2582 2550 u64 period = global_rt_period(); 2583 2551 u64 new_bw = to_ratio(period, runtime); 2552 + u64 gen = ++dl_generation; 2584 2553 struct dl_bw *dl_b; 2585 - int cpu, ret = 0; 2554 + int cpu, cpus, ret = 0; 2586 2555 unsigned long flags; 2587 2556 2588 2557 /* 2589 2558 * Here we want to check the bandwidth not being set to some 2590 2559 * value smaller than the currently allocated bandwidth in 2591 2560 * any of the root_domains. 2592 - * 2593 - * FIXME: Cycling on all the CPUs is overdoing, but simpler than 2594 - * cycling on root_domains... Discussion on different/better 2595 - * solutions is welcome! 2596 2561 */ 2597 2562 for_each_possible_cpu(cpu) { 2598 2563 rcu_read_lock_sched(); 2564 + 2565 + if (dl_bw_visited(cpu, gen)) 2566 + goto next; 2567 + 2599 2568 dl_b = dl_bw_of(cpu); 2569 + cpus = dl_bw_cpus(cpu); 2600 2570 2601 2571 raw_spin_lock_irqsave(&dl_b->lock, flags); 2602 - if (new_bw < dl_b->total_bw) 2572 + if (new_bw * cpus < dl_b->total_bw) 2603 2573 ret = -EBUSY; 2604 2574 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2605 2575 2576 + next: 2606 2577 rcu_read_unlock_sched(); 2607 2578 2608 2579 if (ret) ··· 2634 2593 void sched_dl_do_global(void) 2635 2594 { 2636 2595 u64 new_bw = -1; 2596 + u64 gen = ++dl_generation; 2637 2597 struct dl_bw *dl_b; 2638 2598 int cpu; 2639 2599 unsigned long flags; ··· 2645 2603 if (global_rt_runtime() != RUNTIME_INF) 2646 2604 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 2647 2605 2648 - /* 2649 - * FIXME: As above... 2650 - */ 2651 2606 for_each_possible_cpu(cpu) { 2652 2607 rcu_read_lock_sched(); 2608 + 2609 + if (dl_bw_visited(cpu, gen)) { 2610 + rcu_read_unlock_sched(); 2611 + continue; 2612 + } 2613 + 2653 2614 dl_b = dl_bw_of(cpu); 2654 2615 2655 2616 raw_spin_lock_irqsave(&dl_b->lock, flags);
+83 -38
kernel/sched/fair.c
··· 906 906 if (!schedstat_enabled()) 907 907 return; 908 908 909 + /* 910 + * When the sched_schedstat changes from 0 to 1, some sched se 911 + * maybe already in the runqueue, the se->statistics.wait_start 912 + * will be 0.So it will let the delta wrong. We need to avoid this 913 + * scenario. 914 + */ 915 + if (unlikely(!schedstat_val(se->statistics.wait_start))) 916 + return; 917 + 909 918 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); 910 919 911 920 if (entity_is_task(se)) { ··· 1559 1550 static unsigned long cpu_load(struct rq *rq); 1560 1551 static unsigned long cpu_runnable(struct rq *rq); 1561 1552 static unsigned long cpu_util(int cpu); 1562 - static inline long adjust_numa_imbalance(int imbalance, int nr_running); 1553 + static inline long adjust_numa_imbalance(int imbalance, 1554 + int dst_running, int dst_weight); 1563 1555 1564 1556 static inline enum 1565 1557 numa_type numa_classify(unsigned int imbalance_pct, ··· 1940 1930 src_running = env->src_stats.nr_running - 1; 1941 1931 dst_running = env->dst_stats.nr_running + 1; 1942 1932 imbalance = max(0, dst_running - src_running); 1943 - imbalance = adjust_numa_imbalance(imbalance, dst_running); 1933 + imbalance = adjust_numa_imbalance(imbalance, dst_running, 1934 + env->dst_stats.weight); 1944 1935 1945 1936 /* Use idle CPU if there is no imbalance */ 1946 1937 if (!imbalance) { ··· 4790 4779 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 4791 4780 /* throttled entity or throttle-on-deactivate */ 4792 4781 if (!se->on_rq) 4793 - break; 4782 + goto done; 4794 4783 4795 - if (dequeue) { 4796 - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 4797 - } else { 4798 - update_load_avg(qcfs_rq, se, 0); 4799 - se_update_runnable(se); 4800 - } 4784 + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 4801 4785 4802 4786 qcfs_rq->h_nr_running -= task_delta; 4803 4787 qcfs_rq->idle_h_nr_running -= idle_task_delta; 4804 4788 4805 - if (qcfs_rq->load.weight) 4806 - dequeue = 0; 4789 + if 
(qcfs_rq->load.weight) { 4790 + /* Avoid re-evaluating load for this entity: */ 4791 + se = parent_entity(se); 4792 + break; 4793 + } 4807 4794 } 4808 4795 4809 - if (!se) 4810 - sub_nr_running(rq, task_delta); 4796 + for_each_sched_entity(se) { 4797 + struct cfs_rq *qcfs_rq = cfs_rq_of(se); 4798 + /* throttled entity or throttle-on-deactivate */ 4799 + if (!se->on_rq) 4800 + goto done; 4811 4801 4802 + update_load_avg(qcfs_rq, se, 0); 4803 + se_update_runnable(se); 4804 + 4805 + qcfs_rq->h_nr_running -= task_delta; 4806 + qcfs_rq->idle_h_nr_running -= idle_task_delta; 4807 + } 4808 + 4809 + /* At this point se is NULL and we are at root level*/ 4810 + sub_nr_running(rq, task_delta); 4811 + 4812 + done: 4812 4813 /* 4813 4814 * Note: distribution will already see us throttled via the 4814 4815 * throttled-list. rq->lock protects completion. ··· 5128 5105 return; 5129 5106 5130 5107 distribute_cfs_runtime(cfs_b); 5131 - 5132 - raw_spin_lock_irqsave(&cfs_b->lock, flags); 5133 - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 5134 5108 } 5135 5109 5136 5110 /* ··· 5825 5805 if (sync && cpu_rq(this_cpu)->nr_running == 1) 5826 5806 return this_cpu; 5827 5807 5808 + if (available_idle_cpu(prev_cpu)) 5809 + return prev_cpu; 5810 + 5828 5811 return nr_cpumask_bits; 5829 5812 } 5830 5813 ··· 6086 6063 break; 6087 6064 } 6088 6065 } 6089 - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); 6090 6066 6091 6067 if (idle) 6092 6068 return core; 6069 + 6070 + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); 6093 6071 } 6094 6072 6095 6073 /* ··· 6331 6307 } 6332 6308 6333 6309 /** 6334 - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks 6310 + * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. 
6335 6311 * @cpu: the CPU to get the utilization of 6336 6312 * 6337 6313 * The unit of the return value must be the one of capacity so we can compare ··· 6707 6683 6708 6684 /* 6709 6685 * select_task_rq_fair: Select target runqueue for the waking task in domains 6710 - * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6686 + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, 6711 6687 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6712 6688 * 6713 6689 * Balances load by selecting the idlest CPU in the idlest group, or under ··· 6718 6694 * preempt must be disabled. 6719 6695 */ 6720 6696 static int 6721 - select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) 6697 + select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) 6722 6698 { 6699 + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); 6723 6700 struct sched_domain *tmp, *sd = NULL; 6724 6701 int cpu = smp_processor_id(); 6725 6702 int new_cpu = prev_cpu; 6726 6703 int want_affine = 0; 6727 - int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); 6704 + /* SD_flags and WF_flags share the first nibble */ 6705 + int sd_flag = wake_flags & 0xF; 6728 6706 6729 - if (sd_flag & SD_BALANCE_WAKE) { 6707 + if (wake_flags & WF_TTWU) { 6730 6708 record_wakee(p); 6731 6709 6732 6710 if (sched_energy_enabled()) { ··· 6765 6739 if (unlikely(sd)) { 6766 6740 /* Slow path */ 6767 6741 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); 6768 - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ 6742 + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ 6769 6743 /* Fast path */ 6770 - 6771 6744 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 6772 6745 6773 6746 if (want_affine) ··· 8783 8758 } 8784 8759 8785 8760 /* 8761 + * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. 
8762 + * This is an approximation as the number of running tasks may not be 8763 + * related to the number of busy CPUs due to sched_setaffinity. 8764 + */ 8765 + static inline bool allow_numa_imbalance(int dst_running, int dst_weight) 8766 + { 8767 + return (dst_running < (dst_weight >> 2)); 8768 + } 8769 + 8770 + /* 8786 8771 * find_idlest_group() finds and returns the least busy CPU group within the 8787 8772 * domain. 8788 8773 * ··· 8809 8774 .avg_load = UINT_MAX, 8810 8775 .group_type = group_overloaded, 8811 8776 }; 8812 - 8813 - imbalance = scale_load_down(NICE_0_LOAD) * 8814 - (sd->imbalance_pct-100) / 100; 8815 8777 8816 8778 do { 8817 8779 int local_group; ··· 8863 8831 switch (local_sgs.group_type) { 8864 8832 case group_overloaded: 8865 8833 case group_fully_busy: 8834 + 8835 + /* Calculate allowed imbalance based on load */ 8836 + imbalance = scale_load_down(NICE_0_LOAD) * 8837 + (sd->imbalance_pct-100) / 100; 8838 + 8866 8839 /* 8867 8840 * When comparing groups across NUMA domains, it's possible for 8868 8841 * the local domain to be very lightly loaded relative to the ··· 8924 8887 * a real need of migration, periodic load balance will 8925 8888 * take care of it. 8926 8889 */ 8927 - if (local_sgs.idle_cpus) 8890 + if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) 8928 8891 return NULL; 8929 8892 } 8930 8893 ··· 9026 8989 } 9027 8990 } 9028 8991 9029 - static inline long adjust_numa_imbalance(int imbalance, int nr_running) 8992 + #define NUMA_IMBALANCE_MIN 2 8993 + 8994 + static inline long adjust_numa_imbalance(int imbalance, 8995 + int dst_running, int dst_weight) 9030 8996 { 9031 - unsigned int imbalance_min; 8997 + if (!allow_numa_imbalance(dst_running, dst_weight)) 8998 + return imbalance; 9032 8999 9033 9000 /* 9034 9001 * Allow a small imbalance based on a simple pair of communicating 9035 - * tasks that remain local when the source domain is almost idle. 
9002 + * tasks that remain local when the destination is lightly loaded. 9036 9003 */ 9037 - imbalance_min = 2; 9038 - if (nr_running <= imbalance_min) 9004 + if (imbalance <= NUMA_IMBALANCE_MIN) 9039 9005 return 0; 9040 9006 9041 9007 return imbalance; ··· 9141 9101 } 9142 9102 9143 9103 /* Consider allowing a small imbalance between NUMA groups */ 9144 - if (env->sd->flags & SD_NUMA) 9104 + if (env->sd->flags & SD_NUMA) { 9145 9105 env->imbalance = adjust_numa_imbalance(env->imbalance, 9146 - busiest->sum_nr_running); 9106 + busiest->sum_nr_running, busiest->group_weight); 9107 + } 9147 9108 9148 9109 return; 9149 9110 } ··· 10109 10068 10110 10069 for_each_cpu_and(ilb, nohz.idle_cpus_mask, 10111 10070 housekeeping_cpumask(HK_FLAG_MISC)) { 10071 + 10072 + if (ilb == smp_processor_id()) 10073 + continue; 10074 + 10112 10075 if (idle_cpu(ilb)) 10113 10076 return ilb; 10114 10077 } ··· 10550 10505 #endif /* CONFIG_NO_HZ_COMMON */ 10551 10506 10552 10507 /* 10553 - * idle_balance is called by schedule() if this_cpu is about to become 10508 + * newidle_balance is called by schedule() if this_cpu is about to become 10554 10509 * idle. Attempts to pull tasks from other CPUs. 10555 10510 * 10556 10511 * Returns: ··· 11224 11179 /* 11225 11180 * All the scheduling class methods: 11226 11181 */ 11227 - const struct sched_class fair_sched_class 11228 - __section("__fair_sched_class") = { 11182 + DEFINE_SCHED_CLASS(fair) = { 11183 + 11229 11184 .enqueue_task = enqueue_task_fair, 11230 11185 .dequeue_task = dequeue_task_fair, 11231 11186 .yield_task = yield_task_fair,
+4 -3
kernel/sched/idle.c
··· 364 364 WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); 365 365 WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); 366 366 WARN_ON_ONCE(!duration_ns); 367 + WARN_ON_ONCE(current->mm); 367 368 368 369 rcu_sleep_check(); 369 370 preempt_disable(); ··· 402 401 403 402 #ifdef CONFIG_SMP 404 403 static int 405 - select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) 404 + select_task_rq_idle(struct task_struct *p, int cpu, int flags) 406 405 { 407 406 return task_cpu(p); /* IDLE tasks as never migrated */ 408 407 } ··· 484 483 /* 485 484 * Simple, special scheduling class for the per-CPU idle tasks: 486 485 */ 487 - const struct sched_class idle_sched_class 488 - __section("__idle_sched_class") = { 486 + DEFINE_SCHED_CLASS(idle) = { 487 + 489 488 /* no enqueue/yield_task for idle tasks */ 490 489 491 490 /* dequeue is not valid, we print a debug message there: */
+143 -4
kernel/sched/membarrier.c
··· 7 7 #include "sched.h" 8 8 9 9 /* 10 + * For documentation purposes, here are some membarrier ordering 11 + * scenarios to keep in mind: 12 + * 13 + * A) Userspace thread execution after IPI vs membarrier's memory 14 + * barrier before sending the IPI 15 + * 16 + * Userspace variables: 17 + * 18 + * int x = 0, y = 0; 19 + * 20 + * The memory barrier at the start of membarrier() on CPU0 is necessary in 21 + * order to enforce the guarantee that any writes occurring on CPU0 before 22 + * the membarrier() is executed will be visible to any code executing on 23 + * CPU1 after the IPI-induced memory barrier: 24 + * 25 + * CPU0 CPU1 26 + * 27 + * x = 1 28 + * membarrier(): 29 + * a: smp_mb() 30 + * b: send IPI IPI-induced mb 31 + * c: smp_mb() 32 + * r2 = y 33 + * y = 1 34 + * barrier() 35 + * r1 = x 36 + * 37 + * BUG_ON(r1 == 0 && r2 == 0) 38 + * 39 + * The write to y and load from x by CPU1 are unordered by the hardware, 40 + * so it's possible to have "r1 = x" reordered before "y = 1" at any 41 + * point after (b). If the memory barrier at (a) is omitted, then "x = 1" 42 + * can be reordered after (a) (although not after (c)), so we get r1 == 0 43 + * and r2 == 0. This violates the guarantee that membarrier() is 44 + * supposed by provide. 45 + * 46 + * The timing of the memory barrier at (a) has to ensure that it executes 47 + * before the IPI-induced memory barrier on CPU1. 
48 + * 49 + * B) Userspace thread execution before IPI vs membarrier's memory 50 + * barrier after completing the IPI 51 + * 52 + * Userspace variables: 53 + * 54 + * int x = 0, y = 0; 55 + * 56 + * The memory barrier at the end of membarrier() on CPU0 is necessary in 57 + * order to enforce the guarantee that any writes occurring on CPU1 before 58 + * the membarrier() is executed will be visible to any code executing on 59 + * CPU0 after the membarrier(): 60 + * 61 + * CPU0 CPU1 62 + * 63 + * x = 1 64 + * barrier() 65 + * y = 1 66 + * r2 = y 67 + * membarrier(): 68 + * a: smp_mb() 69 + * b: send IPI IPI-induced mb 70 + * c: smp_mb() 71 + * r1 = x 72 + * BUG_ON(r1 == 0 && r2 == 1) 73 + * 74 + * The writes to x and y are unordered by the hardware, so it's possible to 75 + * have "r2 = 1" even though the write to x doesn't execute until (b). If 76 + * the memory barrier at (c) is omitted then "r1 = x" can be reordered 77 + * before (b) (although not before (a)), so we get "r1 = 0". This violates 78 + * the guarantee that membarrier() is supposed to provide. 79 + * 80 + * The timing of the memory barrier at (c) has to ensure that it executes 81 + * after the IPI-induced memory barrier on CPU1. 82 + * 83 + * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier 84 + * 85 + * CPU0 CPU1 86 + * 87 + * membarrier(): 88 + * a: smp_mb() 89 + * d: switch to kthread (includes mb) 90 + * b: read rq->curr->mm == NULL 91 + * e: switch to user (includes mb) 92 + * c: smp_mb() 93 + * 94 + * Using the scenario from (A), we can show that (a) needs to be paired 95 + * with (e). Using the scenario from (B), we can show that (c) needs to 96 + * be paired with (d). 97 + * 98 + * D) exit_mm vs membarrier 99 + * 100 + * Two thread groups are created, A and B. Thread group B is created by 101 + * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. 102 + * Let's assume we have a single thread within each thread group (Thread A 103 + * and Thread B). 
Thread A runs on CPU0, Thread B runs on CPU1. 104 + * 105 + * CPU0 CPU1 106 + * 107 + * membarrier(): 108 + * a: smp_mb() 109 + * exit_mm(): 110 + * d: smp_mb() 111 + * e: current->mm = NULL 112 + * b: read rq->curr->mm == NULL 113 + * c: smp_mb() 114 + * 115 + * Using scenario (B), we can show that (c) needs to be paired with (d). 116 + * 117 + * E) kthread_{use,unuse}_mm vs membarrier 118 + * 119 + * CPU0 CPU1 120 + * 121 + * membarrier(): 122 + * a: smp_mb() 123 + * kthread_unuse_mm() 124 + * d: smp_mb() 125 + * e: current->mm = NULL 126 + * b: read rq->curr->mm == NULL 127 + * kthread_use_mm() 128 + * f: current->mm = mm 129 + * g: smp_mb() 130 + * c: smp_mb() 131 + * 132 + * Using the scenario from (A), we can show that (a) needs to be paired 133 + * with (g). Using the scenario from (B), we can show that (c) needs to 134 + * be paired with (d). 135 + */ 136 + 137 + /* 10 138 * Bitmask made from a "or" of all commands within enum membarrier_cmd, 11 139 * except MEMBARRIER_CMD_QUERY. 12 140 */ ··· 229 101 this_cpu_write(runqueues.membarrier_state, 0); 230 102 } 231 103 104 + void membarrier_update_current_mm(struct mm_struct *next_mm) 105 + { 106 + struct rq *rq = this_rq(); 107 + int membarrier_state = 0; 108 + 109 + if (next_mm) 110 + membarrier_state = atomic_read(&next_mm->membarrier_state); 111 + if (READ_ONCE(rq->membarrier_state) == membarrier_state) 112 + return; 113 + WRITE_ONCE(rq->membarrier_state, membarrier_state); 114 + } 115 + 232 116 static int membarrier_global_expedited(void) 233 117 { 234 118 int cpu; ··· 279 139 continue; 280 140 281 141 /* 282 - * Skip the CPU if it runs a kernel thread. The scheduler 283 - * leaves the prior task mm in place as an optimization when 284 - * scheduling a kthread. 142 + * Skip the CPU if it runs a kernel thread which is not using 143 + * a task mm. 
285 144 */ 286 145 p = rcu_dereference(cpu_rq(cpu)->curr); 287 - if (p->flags & PF_KTHREAD) 146 + if (!p->mm) 288 147 continue; 289 148 290 149 __cpumask_set_cpu(cpu, tmpmask);
+70 -29
kernel/sched/rt.c
··· 89 89 __set_bit(MAX_RT_PRIO, array->bitmap); 90 90 91 91 #if defined CONFIG_SMP 92 - rt_rq->highest_prio.curr = MAX_RT_PRIO; 93 - rt_rq->highest_prio.next = MAX_RT_PRIO; 92 + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 93 + rt_rq->highest_prio.next = MAX_RT_PRIO-1; 94 94 rt_rq->rt_nr_migratory = 0; 95 95 rt_rq->overloaded = 0; 96 96 plist_head_init(&rt_rq->pushable_tasks); ··· 161 161 { 162 162 struct rq *rq = cpu_rq(cpu); 163 163 164 - rt_rq->highest_prio.curr = MAX_RT_PRIO; 164 + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 165 165 rt_rq->rt_nr_boosted = 0; 166 166 rt_rq->rq = rq; 167 167 rt_rq->tg = tg; ··· 265 265 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 266 266 { 267 267 /* Try to pull RT tasks here if we lower this rq's prio */ 268 - return rq->rt.highest_prio.curr > prev->prio; 268 + return rq->online && rq->rt.highest_prio.curr > prev->prio; 269 269 } 270 270 271 271 static inline int rt_overloaded(struct rq *rq) ··· 393 393 p = plist_first_entry(&rq->rt.pushable_tasks, 394 394 struct task_struct, pushable_tasks); 395 395 rq->rt.highest_prio.next = p->prio; 396 - } else 397 - rq->rt.highest_prio.next = MAX_RT_PRIO; 396 + } else { 397 + rq->rt.highest_prio.next = MAX_RT_PRIO-1; 398 + } 398 399 } 399 400 400 401 #else ··· 1148 1147 sched_find_first_bit(array->bitmap); 1149 1148 } 1150 1149 1151 - } else 1152 - rt_rq->highest_prio.curr = MAX_RT_PRIO; 1150 + } else { 1151 + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 1152 + } 1153 1153 1154 1154 dec_rt_prio_smp(rt_rq, prio, prev_prio); 1155 1155 } ··· 1430 1428 static int find_lowest_rq(struct task_struct *task); 1431 1429 1432 1430 static int 1433 - select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 1431 + select_task_rq_rt(struct task_struct *p, int cpu, int flags) 1434 1432 { 1435 1433 struct task_struct *curr; 1436 1434 struct rq *rq; 1437 1435 bool test; 1438 1436 1439 1437 /* For anything but wake ups, just return the task_cpu */ 1440 - if (sd_flag 
!= SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1438 + if (!(flags & (WF_TTWU | WF_FORK))) 1441 1439 goto out; 1442 1440 1443 1441 rq = cpu_rq(cpu); ··· 1660 1658 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1661 1659 { 1662 1660 if (!task_running(rq, p) && 1663 - cpumask_test_cpu(cpu, p->cpus_ptr)) 1661 + cpumask_test_cpu(cpu, &p->cpus_mask)) 1664 1662 return 1; 1665 1663 1666 1664 return 0; ··· 1754 1752 return this_cpu; 1755 1753 } 1756 1754 1757 - best_cpu = cpumask_first_and(lowest_mask, 1758 - sched_domain_span(sd)); 1755 + best_cpu = cpumask_any_and_distribute(lowest_mask, 1756 + sched_domain_span(sd)); 1759 1757 if (best_cpu < nr_cpu_ids) { 1760 1758 rcu_read_unlock(); 1761 1759 return best_cpu; ··· 1772 1770 if (this_cpu != -1) 1773 1771 return this_cpu; 1774 1772 1775 - cpu = cpumask_any(lowest_mask); 1773 + cpu = cpumask_any_distribute(lowest_mask); 1776 1774 if (cpu < nr_cpu_ids) 1777 1775 return cpu; 1778 1776 ··· 1813 1811 * Also make sure that it wasn't scheduled on its rq. 1814 1812 */ 1815 1813 if (unlikely(task_rq(task) != rq || 1816 - !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || 1814 + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 1817 1815 task_running(rq, task) || 1818 1816 !rt_task(task) || 1819 1817 !task_on_rq_queued(task))) { ··· 1861 1859 * running task can migrate over to a CPU that is running a task 1862 1860 * of lesser priority. 
1863 1861 */ 1864 - static int push_rt_task(struct rq *rq) 1862 + static int push_rt_task(struct rq *rq, bool pull) 1865 1863 { 1866 1864 struct task_struct *next_task; 1867 1865 struct rq *lowest_rq; ··· 1875 1873 return 0; 1876 1874 1877 1875 retry: 1876 + if (is_migration_disabled(next_task)) { 1877 + struct task_struct *push_task = NULL; 1878 + int cpu; 1879 + 1880 + if (!pull || rq->push_busy) 1881 + return 0; 1882 + 1883 + cpu = find_lowest_rq(rq->curr); 1884 + if (cpu == -1 || cpu == rq->cpu) 1885 + return 0; 1886 + 1887 + /* 1888 + * Given we found a CPU with lower priority than @next_task, 1889 + * therefore it should be running. However we cannot migrate it 1890 + * to this other CPU, instead attempt to push the current 1891 + * running task on this CPU away. 1892 + */ 1893 + push_task = get_push_task(rq); 1894 + if (push_task) { 1895 + raw_spin_unlock(&rq->lock); 1896 + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 1897 + push_task, &rq->push_work); 1898 + raw_spin_lock(&rq->lock); 1899 + } 1900 + 1901 + return 0; 1902 + } 1903 + 1878 1904 if (WARN_ON(next_task == rq->curr)) 1879 1905 return 0; 1880 1906 ··· 1957 1927 deactivate_task(rq, next_task, 0); 1958 1928 set_task_cpu(next_task, lowest_rq->cpu); 1959 1929 activate_task(lowest_rq, next_task, 0); 1930 + resched_curr(lowest_rq); 1960 1931 ret = 1; 1961 1932 1962 - resched_curr(lowest_rq); 1963 - 1964 1933 double_unlock_balance(rq, lowest_rq); 1965 - 1966 1934 out: 1967 1935 put_task_struct(next_task); 1968 1936 ··· 1970 1942 static void push_rt_tasks(struct rq *rq) 1971 1943 { 1972 1944 /* push_rt_task will return true if it moved an RT */ 1973 - while (push_rt_task(rq)) 1945 + while (push_rt_task(rq, false)) 1974 1946 ; 1975 1947 } 1976 1948 ··· 2123 2095 */ 2124 2096 if (has_pushable_tasks(rq)) { 2125 2097 raw_spin_lock(&rq->lock); 2126 - push_rt_tasks(rq); 2098 + while (push_rt_task(rq, true)) 2099 + ; 2127 2100 raw_spin_unlock(&rq->lock); 2128 2101 } 2129 2102 ··· 2149 2120 { 2150 2121 int this_cpu 
= this_rq->cpu, cpu; 2151 2122 bool resched = false; 2152 - struct task_struct *p; 2123 + struct task_struct *p, *push_task; 2153 2124 struct rq *src_rq; 2154 2125 int rt_overload_count = rt_overloaded(this_rq); 2155 2126 ··· 2196 2167 * double_lock_balance, and another CPU could 2197 2168 * alter this_rq 2198 2169 */ 2170 + push_task = NULL; 2199 2171 double_lock_balance(this_rq, src_rq); 2200 2172 2201 2173 /* ··· 2224 2194 if (p->prio < src_rq->curr->prio) 2225 2195 goto skip; 2226 2196 2227 - resched = true; 2228 - 2229 - deactivate_task(src_rq, p, 0); 2230 - set_task_cpu(p, this_cpu); 2231 - activate_task(this_rq, p, 0); 2197 + if (is_migration_disabled(p)) { 2198 + push_task = get_push_task(src_rq); 2199 + } else { 2200 + deactivate_task(src_rq, p, 0); 2201 + set_task_cpu(p, this_cpu); 2202 + activate_task(this_rq, p, 0); 2203 + resched = true; 2204 + } 2232 2205 /* 2233 2206 * We continue with the search, just in 2234 2207 * case there's an even higher prio task ··· 2241 2208 } 2242 2209 skip: 2243 2210 double_unlock_balance(this_rq, src_rq); 2211 + 2212 + if (push_task) { 2213 + raw_spin_unlock(&this_rq->lock); 2214 + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, 2215 + push_task, &src_rq->push_work); 2216 + raw_spin_lock(&this_rq->lock); 2217 + } 2244 2218 } 2245 2219 2246 2220 if (resched) ··· 2469 2429 return 0; 2470 2430 } 2471 2431 2472 - const struct sched_class rt_sched_class 2473 - __section("__rt_sched_class") = { 2432 + DEFINE_SCHED_CLASS(rt) = { 2433 + 2474 2434 .enqueue_task = enqueue_task_rt, 2475 2435 .dequeue_task = dequeue_task_rt, 2476 2436 .yield_task = yield_task_rt, ··· 2489 2449 .rq_offline = rq_offline_rt, 2490 2450 .task_woken = task_woken_rt, 2491 2451 .switched_from = switched_from_rt, 2452 + .find_lock_rq = find_lock_lowest_rq, 2492 2453 #endif 2493 2454 2494 2455 .task_tick = task_tick_rt,
+113 -37
kernel/sched/sched.h
··· 67 67 #include <linux/tsacct_kern.h> 68 68 69 69 #include <asm/tlb.h> 70 - #include <asm-generic/vmlinux.lds.h> 71 70 72 71 #ifdef CONFIG_PARAVIRT 73 72 # include <asm/paravirt.h> ··· 256 257 257 258 void __dl_clear_params(struct task_struct *p); 258 259 259 - /* 260 - * To keep the bandwidth of -deadline tasks and groups under control 261 - * we need some place where: 262 - * - store the maximum -deadline bandwidth of the system (the group); 263 - * - cache the fraction of that bandwidth that is currently allocated. 264 - * 265 - * This is all done in the data structure below. It is similar to the 266 - * one used for RT-throttling (rt_bandwidth), with the main difference 267 - * that, since here we are only interested in admission control, we 268 - * do not decrease any runtime while the group "executes", neither we 269 - * need a timer to replenish it. 270 - * 271 - * With respect to SMP, the bandwidth is given on a per-CPU basis, 272 - * meaning that: 273 - * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; 274 - * - dl_total_bw array contains, in the i-eth element, the currently 275 - * allocated bandwidth on the i-eth CPU. 276 - * Moreover, groups consume bandwidth on each CPU, while tasks only 277 - * consume bandwidth on the CPU they're running on. 278 - * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw 279 - * that will be shown the next time the proc or cgroup controls will 280 - * be red. It on its turn can be changed by writing on its own 281 - * control. 
282 - */ 283 260 struct dl_bandwidth { 284 261 raw_spinlock_t dl_runtime_lock; 285 262 u64 dl_runtime; ··· 267 292 return sysctl_sched_rt_runtime >= 0; 268 293 } 269 294 295 + /* 296 + * To keep the bandwidth of -deadline tasks under control 297 + * we need some place where: 298 + * - store the maximum -deadline bandwidth of each cpu; 299 + * - cache the fraction of bandwidth that is currently allocated in 300 + * each root domain; 301 + * 302 + * This is all done in the data structure below. It is similar to the 303 + * one used for RT-throttling (rt_bandwidth), with the main difference 304 + * that, since here we are only interested in admission control, we 305 + * do not decrease any runtime while the group "executes", neither we 306 + * need a timer to replenish it. 307 + * 308 + * With respect to SMP, bandwidth is given on a per root domain basis, 309 + * meaning that: 310 + * - bw (< 100%) is the deadline bandwidth of each CPU; 311 + * - total_bw is the currently allocated bandwidth in each root domain; 312 + */ 270 313 struct dl_bw { 271 314 raw_spinlock_t lock; 272 315 u64 bw; ··· 794 801 struct dl_bw dl_bw; 795 802 struct cpudl cpudl; 796 803 804 + /* 805 + * Indicate whether a root_domain's dl_bw has been checked or 806 + * updated. It's monotonously increasing value. 807 + * 808 + * Also, some corner cases, like 'wrap around' is dangerous, but given 809 + * that u64 is 'big enough'. So that shouldn't be a concern. 810 + */ 811 + u64 visit_gen; 812 + 797 813 #ifdef HAVE_RT_PUSH_IPI 798 814 /* 799 815 * For IPI pull requests, loop across the rto_mask. 
··· 975 973 unsigned long cpu_capacity_orig; 976 974 977 975 struct callback_head *balance_callback; 976 + unsigned char balance_flags; 978 977 979 978 unsigned char nohz_idle_balance; 980 979 unsigned char idle_balance; ··· 1006 1003 1007 1004 /* This is used to determine avg_idle's max value */ 1008 1005 u64 max_idle_balance_cost; 1006 + 1007 + #ifdef CONFIG_HOTPLUG_CPU 1008 + struct rcuwait hotplug_wait; 1009 + #endif 1009 1010 #endif /* CONFIG_SMP */ 1010 1011 1011 1012 #ifdef CONFIG_IRQ_TIME_ACCOUNTING ··· 1055 1048 /* Must be inspected within a rcu lock section */ 1056 1049 struct cpuidle_state *idle_state; 1057 1050 #endif 1051 + 1052 + #ifdef CONFIG_SMP 1053 + unsigned int nr_pinned; 1054 + #endif 1055 + unsigned int push_busy; 1056 + struct cpu_stop_work push_work; 1058 1057 }; 1059 1058 1060 1059 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 1088 1075 #endif 1089 1076 } 1090 1077 1078 + #define MDF_PUSH 0x01 1079 + 1080 + static inline bool is_migration_disabled(struct task_struct *p) 1081 + { 1082 + #ifdef CONFIG_SMP 1083 + return p->migration_disabled; 1084 + #else 1085 + return false; 1086 + #endif 1087 + } 1091 1088 1092 1089 #ifdef CONFIG_SCHED_SMT 1093 1090 extern void __update_idle_core(struct rq *rq); ··· 1243 1220 #ifdef CONFIG_SCHED_DEBUG 1244 1221 rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 1245 1222 rf->clock_update_flags = 0; 1223 + #endif 1224 + #ifdef CONFIG_SMP 1225 + SCHED_WARN_ON(rq->balance_callback); 1246 1226 #endif 1247 1227 } 1248 1228 ··· 1408 1382 1409 1383 #ifdef CONFIG_SMP 1410 1384 1385 + #define BALANCE_WORK 0x01 1386 + #define BALANCE_PUSH 0x02 1387 + 1411 1388 static inline void 1412 1389 queue_balance_callback(struct rq *rq, 1413 1390 struct callback_head *head, ··· 1418 1389 { 1419 1390 lockdep_assert_held(&rq->lock); 1420 1391 1421 - if (unlikely(head->next)) 1392 + if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) 1422 1393 return; 1423 1394 1424 1395 head->func = (void (*)(struct callback_head *))func; 
1425 1396 head->next = rq->balance_callback; 1426 1397 rq->balance_callback = head; 1398 + rq->balance_flags |= BALANCE_WORK; 1427 1399 } 1428 1400 1429 1401 #define rcu_dereference_check_sched_domain(p) \ ··· 1744 1714 return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; 1745 1715 } 1746 1716 1747 - /* 1748 - * wake flags 1749 - */ 1750 - #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ 1751 - #define WF_FORK 0x02 /* Child wakeup after fork */ 1752 - #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ 1753 - #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ 1717 + /* Wake flags. The first three directly map to some SD flag value */ 1718 + #define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ 1719 + #define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ 1720 + #define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ 1721 + 1722 + #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ 1723 + #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ 1724 + #define WF_ON_CPU 0x40 /* Wakee is on_cpu */ 1725 + 1726 + #ifdef CONFIG_SMP 1727 + static_assert(WF_EXEC == SD_BALANCE_EXEC); 1728 + static_assert(WF_FORK == SD_BALANCE_FORK); 1729 + static_assert(WF_TTWU == SD_BALANCE_WAKE); 1730 + #endif 1754 1731 1755 1732 /* 1756 1733 * To aid in avoiding the subversion of "niceness" due to uneven distribution ··· 1833 1796 1834 1797 #ifdef CONFIG_SMP 1835 1798 int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 1836 - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1799 + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); 1837 1800 void (*migrate_task_rq)(struct task_struct *p, int new_cpu); 1838 1801 1839 1802 void (*task_woken)(struct rq *this_rq, struct task_struct *task); 1840 1803 1841 1804 void (*set_cpus_allowed)(struct task_struct *p, 1842 - const struct cpumask *newmask); 1805 + const struct cpumask *newmask, 1806 + u32 flags); 
1843 1807 1844 1808 void (*rq_online)(struct rq *rq); 1845 1809 void (*rq_offline)(struct rq *rq); 1810 + 1811 + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); 1846 1812 #endif 1847 1813 1848 1814 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); ··· 1873 1833 #ifdef CONFIG_FAIR_GROUP_SCHED 1874 1834 void (*task_change_group)(struct task_struct *p, int type); 1875 1835 #endif 1876 - } __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ 1836 + }; 1877 1837 1878 1838 static inline void put_prev_task(struct rq *rq, struct task_struct *prev) 1879 1839 { ··· 1886 1846 WARN_ON_ONCE(rq->curr != next); 1887 1847 next->sched_class->set_next_task(rq, next, false); 1888 1848 } 1849 + 1850 + 1851 + /* 1852 + * Helper to define a sched_class instance; each one is placed in a separate 1853 + * section which is ordered by the linker script: 1854 + * 1855 + * include/asm-generic/vmlinux.lds.h 1856 + * 1857 + * Also enforce alignment on the instance, not the type, to guarantee layout. 
1858 + */ 1859 + #define DEFINE_SCHED_CLASS(name) \ 1860 + const struct sched_class name##_sched_class \ 1861 + __aligned(__alignof__(struct sched_class)) \ 1862 + __section("__" #name "_sched_class") 1889 1863 1890 1864 /* Defined in include/asm-generic/vmlinux.lds.h */ 1891 1865 extern struct sched_class __begin_sched_classes[]; ··· 1943 1889 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 1944 1890 extern struct task_struct *pick_next_task_idle(struct rq *rq); 1945 1891 1892 + #define SCA_CHECK 0x01 1893 + #define SCA_MIGRATE_DISABLE 0x02 1894 + #define SCA_MIGRATE_ENABLE 0x04 1895 + 1946 1896 #ifdef CONFIG_SMP 1947 1897 1948 1898 extern void update_group_capacity(struct sched_domain *sd, int cpu); 1949 1899 1950 1900 extern void trigger_load_balance(struct rq *rq); 1951 1901 1952 - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); 1902 + extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); 1903 + 1904 + static inline struct task_struct *get_push_task(struct rq *rq) 1905 + { 1906 + struct task_struct *p = rq->curr; 1907 + 1908 + lockdep_assert_held(&rq->lock); 1909 + 1910 + if (rq->push_busy) 1911 + return NULL; 1912 + 1913 + if (p->nr_cpus_allowed == 1) 1914 + return NULL; 1915 + 1916 + rq->push_busy = true; 1917 + return get_task_struct(p); 1918 + } 1919 + 1920 + extern int push_cpu_stop(void *arg); 1953 1921 1954 1922 #endif 1955 1923
+2 -3
kernel/sched/stop_task.c
··· 11 11 12 12 #ifdef CONFIG_SMP 13 13 static int 14 - select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) 14 + select_task_rq_stop(struct task_struct *p, int cpu, int flags) 15 15 { 16 16 return task_cpu(p); /* stop tasks as never migrate */ 17 17 } ··· 109 109 /* 110 110 * Simple, special scheduling class for the per-CPU stop tasks: 111 111 */ 112 - const struct sched_class stop_sched_class 113 - __section("__stop_sched_class") = { 112 + DEFINE_SCHED_CLASS(stop) = { 114 113 115 114 .enqueue_task = enqueue_task_stop, 116 115 .dequeue_task = dequeue_task_stop,
+54 -7
kernel/sched/topology.c
··· 211 211 DEFINE_MUTEX(sched_energy_mutex); 212 212 bool sched_energy_update; 213 213 214 + void rebuild_sched_domains_energy(void) 215 + { 216 + mutex_lock(&sched_energy_mutex); 217 + sched_energy_update = true; 218 + rebuild_sched_domains(); 219 + sched_energy_update = false; 220 + mutex_unlock(&sched_energy_mutex); 221 + } 222 + 214 223 #ifdef CONFIG_PROC_SYSCTL 215 224 int sched_energy_aware_handler(struct ctl_table *table, int write, 216 225 void *buffer, size_t *lenp, loff_t *ppos) ··· 232 223 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 233 224 if (!ret && write) { 234 225 state = static_branch_unlikely(&sched_energy_present); 235 - if (state != sysctl_sched_energy_aware) { 236 - mutex_lock(&sched_energy_mutex); 237 - sched_energy_update = 1; 238 - rebuild_sched_domains(); 239 - sched_energy_update = 0; 240 - mutex_unlock(&sched_energy_mutex); 241 - } 226 + if (state != sysctl_sched_energy_aware) 227 + rebuild_sched_domains_energy(); 242 228 } 243 229 244 230 return ret; ··· 328 324 * 3. no SMT is detected. 329 325 * 4. the EM complexity is low enough to keep scheduling overheads low; 330 326 * 5. schedutil is driving the frequency of all CPUs of the rd; 327 + * 6. 
frequency invariance support is present; 331 328 * 332 329 * The complexity of the Energy Model is defined as: 333 330 * ··· 374 369 if (sched_smt_active()) { 375 370 pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", 376 371 cpumask_pr_args(cpu_map)); 372 + goto free; 373 + } 374 + 375 + if (!arch_scale_freq_invariant()) { 376 + if (sched_debug()) { 377 + pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", 378 + cpumask_pr_args(cpu_map)); 379 + } 377 380 goto free; 378 381 } 379 382 ··· 529 516 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); 530 517 #endif 531 518 519 + rd->visit_gen = 0; 532 520 init_dl_bw(&rd->dl_bw); 533 521 if (cpudl_init(&rd->cpudl) != 0) 534 522 goto free_rto_mask; ··· 688 674 { 689 675 struct rq *rq = cpu_rq(cpu); 690 676 struct sched_domain *tmp; 677 + int numa_distance = 0; 691 678 692 679 /* Remove the sched domains which do not contribute to scheduling. */ 693 680 for (tmp = sd; tmp; ) { ··· 719 704 if (sd) 720 705 sd->child = NULL; 721 706 } 707 + 708 + for (tmp = sd; tmp; tmp = tmp->parent) 709 + numa_distance += !!(tmp->flags & SD_NUMA); 710 + 711 + /* 712 + * FIXME: Diameter >=3 is misrepresented. 713 + * 714 + * Smallest diameter=3 topology is: 715 + * 716 + * node 0 1 2 3 717 + * 0: 10 20 30 40 718 + * 1: 20 10 20 30 719 + * 2: 30 20 10 20 720 + * 3: 40 30 20 10 721 + * 722 + * 0 --- 1 --- 2 --- 3 723 + * 724 + * NUMA-3 0-3 N/A N/A 0-3 725 + * groups: {0-2},{1-3} {1-3},{0-2} 726 + * 727 + * NUMA-2 0-2 0-3 0-3 1-3 728 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} 729 + * 730 + * NUMA-1 0-1 0-2 1-3 2-3 731 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} 732 + * 733 + * NUMA-0 0 1 2 3 734 + * 735 + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the 736 + * group span isn't a subset of the domain span. 737 + */ 738 + WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); 722 739 723 740 sched_domain_debug(sd, cpu); 724 741
+26 -26
kernel/smp.c
··· 27 27 #include "smpboot.h" 28 28 #include "sched/smp.h" 29 29 30 - #define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) 30 + #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) 31 31 32 32 struct call_function_data { 33 33 call_single_data_t __percpu *csd; ··· 130 130 131 131 csd_type = CSD_TYPE(csd); 132 132 if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) 133 - return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ 133 + return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ 134 134 return -1; 135 135 } 136 136 ··· 146 146 bool firsttime; 147 147 u64 ts2, ts_delta; 148 148 call_single_data_t *cpu_cur_csd; 149 - unsigned int flags = READ_ONCE(csd->flags); 149 + unsigned int flags = READ_ONCE(csd->node.u_flags); 150 150 151 151 if (!(flags & CSD_FLAG_LOCK)) { 152 152 if (!unlikely(*bug_id)) ··· 224 224 225 225 static __always_inline void csd_lock_wait(call_single_data_t *csd) 226 226 { 227 - smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); 227 + smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); 228 228 } 229 229 #endif 230 230 231 231 static __always_inline void csd_lock(call_single_data_t *csd) 232 232 { 233 233 csd_lock_wait(csd); 234 - csd->flags |= CSD_FLAG_LOCK; 234 + csd->node.u_flags |= CSD_FLAG_LOCK; 235 235 236 236 /* 237 237 * prevent CPU from reordering the above assignment ··· 243 243 244 244 static __always_inline void csd_unlock(call_single_data_t *csd) 245 245 { 246 - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 246 + WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); 247 247 248 248 /* 249 249 * ensure we're all done before releasing data: 250 250 */ 251 - smp_store_release(&csd->flags, 0); 251 + smp_store_release(&csd->node.u_flags, 0); 252 252 } 253 253 254 254 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); ··· 300 300 return -ENXIO; 301 301 } 302 302 303 - __smp_call_single_queue(cpu, &csd->llist); 303 + __smp_call_single_queue(cpu, 
&csd->node.llist); 304 304 305 305 return 0; 306 306 } ··· 353 353 * We don't have to use the _safe() variant here 354 354 * because we are not invoking the IPI handlers yet. 355 355 */ 356 - llist_for_each_entry(csd, entry, llist) { 356 + llist_for_each_entry(csd, entry, node.llist) { 357 357 switch (CSD_TYPE(csd)) { 358 358 case CSD_TYPE_ASYNC: 359 359 case CSD_TYPE_SYNC: ··· 378 378 * First; run all SYNC callbacks, people are waiting for us. 379 379 */ 380 380 prev = NULL; 381 - llist_for_each_entry_safe(csd, csd_next, entry, llist) { 381 + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { 382 382 /* Do we wait until *after* callback? */ 383 383 if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { 384 384 smp_call_func_t func = csd->func; 385 385 void *info = csd->info; 386 386 387 387 if (prev) { 388 - prev->next = &csd_next->llist; 388 + prev->next = &csd_next->node.llist; 389 389 } else { 390 - entry = &csd_next->llist; 390 + entry = &csd_next->node.llist; 391 391 } 392 392 393 393 csd_lock_record(csd); ··· 395 395 csd_unlock(csd); 396 396 csd_lock_record(NULL); 397 397 } else { 398 - prev = &csd->llist; 398 + prev = &csd->node.llist; 399 399 } 400 400 } 401 401 ··· 406 406 * Second; run all !SYNC callbacks. 
407 407 */ 408 408 prev = NULL; 409 - llist_for_each_entry_safe(csd, csd_next, entry, llist) { 409 + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { 410 410 int type = CSD_TYPE(csd); 411 411 412 412 if (type != CSD_TYPE_TTWU) { 413 413 if (prev) { 414 - prev->next = &csd_next->llist; 414 + prev->next = &csd_next->node.llist; 415 415 } else { 416 - entry = &csd_next->llist; 416 + entry = &csd_next->node.llist; 417 417 } 418 418 419 419 if (type == CSD_TYPE_ASYNC) { ··· 429 429 } 430 430 431 431 } else { 432 - prev = &csd->llist; 432 + prev = &csd->node.llist; 433 433 } 434 434 } 435 435 ··· 465 465 { 466 466 call_single_data_t *csd; 467 467 call_single_data_t csd_stack = { 468 - .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, 468 + .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, 469 469 }; 470 470 int this_cpu; 471 471 int err; ··· 502 502 csd->func = func; 503 503 csd->info = info; 504 504 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG 505 - csd->src = smp_processor_id(); 506 - csd->dst = cpu; 505 + csd->node.src = smp_processor_id(); 506 + csd->node.dst = cpu; 507 507 #endif 508 508 509 509 err = generic_exec_single(cpu, csd); ··· 544 544 545 545 preempt_disable(); 546 546 547 - if (csd->flags & CSD_FLAG_LOCK) { 547 + if (csd->node.u_flags & CSD_FLAG_LOCK) { 548 548 err = -EBUSY; 549 549 goto out; 550 550 } 551 551 552 - csd->flags = CSD_FLAG_LOCK; 552 + csd->node.u_flags = CSD_FLAG_LOCK; 553 553 smp_wmb(); 554 554 555 555 err = generic_exec_single(cpu, csd); ··· 667 667 668 668 csd_lock(csd); 669 669 if (wait) 670 - csd->flags |= CSD_TYPE_SYNC; 670 + csd->node.u_flags |= CSD_TYPE_SYNC; 671 671 csd->func = func; 672 672 csd->info = info; 673 673 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG 674 - csd->src = smp_processor_id(); 675 - csd->dst = cpu; 674 + csd->node.src = smp_processor_id(); 675 + csd->node.dst = cpu; 676 676 #endif 677 - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) 677 + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) 678 
678 __cpumask_set_cpu(cpu, cfd->cpumask_ipi); 679 679 } 680 680
+24 -3
kernel/stop_machine.c
··· 42 42 struct list_head works; /* list of pending works */ 43 43 44 44 struct cpu_stop_work stop_work; /* for stop_cpus */ 45 + unsigned long caller; 46 + cpu_stop_fn_t fn; 45 47 }; 46 48 47 49 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 48 50 static bool stop_machine_initialized = false; 51 + 52 + void print_stop_info(const char *log_lvl, struct task_struct *task) 53 + { 54 + /* 55 + * If @task is a stopper task, it cannot migrate and task_cpu() is 56 + * stable. 57 + */ 58 + struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); 59 + 60 + if (task != stopper->thread) 61 + return; 62 + 63 + printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); 64 + } 49 65 50 66 /* static data for stop_cpus */ 51 67 static DEFINE_MUTEX(stop_cpus_mutex); ··· 139 123 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) 140 124 { 141 125 struct cpu_stop_done done; 142 - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 126 + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; 143 127 144 128 cpu_stop_init_done(&done, 1); 145 129 if (!cpu_stop_queue_work(cpu, &work)) ··· 347 331 work1 = work2 = (struct cpu_stop_work){ 348 332 .fn = multi_cpu_stop, 349 333 .arg = &msdata, 350 - .done = &done 334 + .done = &done, 335 + .caller = _RET_IP_, 351 336 }; 352 337 353 338 cpu_stop_init_done(&done, 2); ··· 384 367 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, 385 368 struct cpu_stop_work *work_buf) 386 369 { 387 - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 370 + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; 388 371 return cpu_stop_queue_work(cpu, work_buf); 389 372 } 390 373 ··· 504 487 int ret; 505 488 506 489 /* cpu stop callbacks must not sleep, make in_atomic() == T */ 490 + stopper->caller = work->caller; 491 + stopper->fn = fn; 507 492 preempt_count_inc(); 508 493 ret = fn(arg); 509 
494 if (done) { ··· 514 495 cpu_stop_signal_done(done); 515 496 } 516 497 preempt_count_dec(); 498 + stopper->fn = NULL; 499 + stopper->caller = 0; 517 500 WARN_ONCE(preempt_count(), 518 501 "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); 519 502 goto repeat;
+2 -4
kernel/time/tick-sched.c
··· 293 293 /* Empty, the tick restart happens on tick_nohz_irq_exit() */ 294 294 } 295 295 296 - static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 297 - .func = nohz_full_kick_func, 298 - .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), 299 - }; 296 + static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = 297 + IRQ_WORK_INIT_HARD(nohz_full_kick_func); 300 298 301 299 /* 302 300 * Kick this CPU if it's full dynticks in order to force it to
+1 -1
kernel/trace/bpf_trace.c
··· 1096 1096 return -EINVAL; 1097 1097 1098 1098 work = this_cpu_ptr(&send_signal_work); 1099 - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) 1099 + if (irq_work_is_busy(&work->irq_work)) 1100 1100 return -EBUSY; 1101 1101 1102 1102 /* Add the current task, which is the target of sending signal,
+4
kernel/workqueue.c
··· 4908 4908 pool->flags |= POOL_DISASSOCIATED; 4909 4909 4910 4910 raw_spin_unlock_irq(&pool->lock); 4911 + 4912 + for_each_pool_worker(worker, pool) 4913 + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); 4914 + 4911 4915 mutex_unlock(&wq_pool_attach_mutex); 4912 4916 4913 4917 /*
+18
lib/cpumask.c
··· 267 267 return next; 268 268 } 269 269 EXPORT_SYMBOL(cpumask_any_and_distribute); 270 + 271 + int cpumask_any_distribute(const struct cpumask *srcp) 272 + { 273 + int next, prev; 274 + 275 + /* NOTE: our first selection will skip 0. */ 276 + prev = __this_cpu_read(distribute_cpu_mask_prev); 277 + 278 + next = cpumask_next(prev, srcp); 279 + if (next >= nr_cpu_ids) 280 + next = cpumask_first(srcp); 281 + 282 + if (next < nr_cpu_ids) 283 + __this_cpu_write(distribute_cpu_mask_prev, next); 284 + 285 + return next; 286 + } 287 + EXPORT_SYMBOL(cpumask_any_distribute);
+2
lib/dump_stack.c
··· 12 12 #include <linux/atomic.h> 13 13 #include <linux/kexec.h> 14 14 #include <linux/utsname.h> 15 + #include <linux/stop_machine.h> 15 16 16 17 static char dump_stack_arch_desc_str[128]; 17 18 ··· 58 57 log_lvl, dump_stack_arch_desc_str); 59 58 60 59 print_worker_info(log_lvl, current); 60 + print_stop_info(log_lvl, current); 61 61 } 62 62 63 63 /**
+5
lib/smp_processor_id.c
··· 26 26 if (current->nr_cpus_allowed == 1) 27 27 goto out; 28 28 29 + #ifdef CONFIG_SMP 30 + if (current->migration_disabled) 31 + goto out; 32 + #endif 33 + 29 34 /* 30 35 * It is valid to assume CPU-locality during early bootup: 31 36 */
+1 -2
net/core/dev.c
··· 11179 11179 INIT_LIST_HEAD(&sd->poll_list); 11180 11180 sd->output_queue_tailp = &sd->output_queue; 11181 11181 #ifdef CONFIG_RPS 11182 - sd->csd.func = rps_trigger_softirq; 11183 - sd->csd.info = sd; 11182 + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); 11184 11183 sd->cpu = i; 11185 11184 #endif 11186 11185