Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'sched-core-2025-03-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Core & fair scheduler changes:

- Cancel the slice protection of the idle entity (Zihan Zhou)
- Reduce the default slice to avoid tasks getting an extra tick (Zihan Zhou)
- Force propagating min_slice of cfs_rq when {en,de}queue tasks (Tianchen Ding)
- Refactor can_migrate_task() to eliminate looping (I Hsin Cheng)
- Add unlikely branch hints to several system calls (Colin Ian King)
- Optimize current_clr_polling() on certain architectures (Yujun Dong)

Deadline scheduler: (Juri Lelli)
- Remove redundant dl_clear_root_domain call
- Move dl_rebuild_rd_accounting to cpuset.h

Uclamp:
- Use the uclamp_is_used() helper instead of open-coding it (Xuewen Yan)
- Optimize sched_uclamp_used static key enabling (Xuewen Yan)

Scheduler topology support: (Juri Lelli)
- Ignore special tasks when rebuilding domains
- Add wrappers for sched_domains_mutex
- Generalize unique visiting of root domains
- Rebuild root domain accounting after every update
- Remove partition_and_rebuild_sched_domains
- Stop exposing partition_sched_domains_locked

RSEQ: (Michael Jeanson)
- Update kernel fields in lockstep with CONFIG_DEBUG_RSEQ=y
- Fix segfault on registration when rseq_cs is non-zero
- selftests: Add rseq syscall errors test
- selftests: Ensure the rseq ABI TLS is actually 1024 bytes

Membarriers:
- Fix redundant load of membarrier_state (Nysal Jan K.A.)

Scheduler debugging:
- Introduce and use preempt_model_str() (Sebastian Andrzej Siewior)
- Make CONFIG_SCHED_DEBUG unconditional (Ingo Molnar)

Fixes and cleanups:
- Always save/restore x86 TSC sched_clock() on suspend/resume (Guilherme G. Piccoli)
- Misc fixes and cleanups (Thorsten Blum, Juri Lelli, Sebastian Andrzej Siewior)"

* tag 'sched-core-2025-03-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits)
cpuidle, sched: Use smp_mb__after_atomic() in current_clr_polling()
sched/debug: Remove CONFIG_SCHED_DEBUG
sched/debug: Remove CONFIG_SCHED_DEBUG from self-test config files
sched/debug, Documentation: Remove (most) CONFIG_SCHED_DEBUG references from documentation
sched/debug: Make CONFIG_SCHED_DEBUG functionality unconditional
sched/debug: Make 'const_debug' tunables unconditional __read_mostly
sched/debug: Change SCHED_WARN_ON() to WARN_ON_ONCE()
rseq/selftests: Fix namespace collision with rseq UAPI header
include/{topology,cpuset}: Move dl_rebuild_rd_accounting to cpuset.h
sched/topology: Stop exposing partition_sched_domains_locked
cgroup/cpuset: Remove partition_and_rebuild_sched_domains
sched/topology: Remove redundant dl_clear_root_domain call
sched/deadline: Rebuild root domain accounting after every update
sched/deadline: Generalize unique visiting of root domains
sched/topology: Wrappers for sched_domains_mutex
sched/deadline: Ignore special tasks when rebuilding domains
tracing: Use preempt_model_str()
xtensa: Rely on generic printing of preemption model
x86: Rely on generic printing of preemption model
s390: Rely on generic printing of preemption model
...

+590 -442
+1 -1
Documentation/scheduler/sched-debug.rst
··· 2 2 Scheduler debugfs 3 3 ================= 4 4 5 - Booting a kernel with CONFIG_SCHED_DEBUG=y will give access to 5 + Booting a kernel with debugfs enabled will give access to 6 6 scheduler specific debug files under /sys/kernel/debug/sched. Some of 7 7 those files are described below. 8 8
+1 -1
Documentation/scheduler/sched-design-CFS.rst
··· 96 96 CFS uses nanosecond granularity accounting and does not rely on any jiffies or 97 97 other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the 98 98 way the previous scheduler had, and has no heuristics whatsoever. There is 99 - only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): 99 + only one central tunable: 100 100 101 101 /sys/kernel/debug/sched/base_slice_ns 102 102
+2 -3
Documentation/scheduler/sched-domains.rst
··· 73 73 for a given topology level by creating a sched_domain_topology_level array and 74 74 calling set_sched_topology() with this array as the parameter. 75 75 76 - The sched-domains debugging infrastructure can be enabled by enabling 77 - CONFIG_SCHED_DEBUG and adding 'sched_verbose' to your cmdline. If you 78 - forgot to tweak your cmdline, you can also flip the 76 + The sched-domains debugging infrastructure can be enabled by 'sched_verbose' 77 + to your cmdline. If you forgot to tweak your cmdline, you can also flip the 79 78 /sys/kernel/debug/sched/verbose knob. This enables an error checking parse of 80 79 the sched domains which should catch most possible errors (described above). It 81 80 also prints out the domain structure in a visual format.
+1 -2
Documentation/scheduler/sched-ext.rst
··· 107 107 nr_rejected : 0 108 108 enable_seq : 1 109 109 110 - If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can 111 - be determined as follows: 110 + Whether a given task is on sched_ext can be determined as follows: 112 111 113 112 .. code-block:: none 114 113
+1 -1
Documentation/scheduler/sched-stats.rst
··· 88 88 CONFIG_SMP is not defined, *no* domains are utilized and these lines 89 89 will not appear in the output. <name> is an extension to the domain field 90 90 that prints the name of the corresponding sched domain. It can appear in 91 - schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.) 91 + schedstat version 17 and above. 92 92 93 93 domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 94 94
+1 -1
Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst
··· 112 112 jiffy o detalles como HZ. De este modo, el gestor de tareas CFS no tiene 113 113 noción de "ventanas de tiempo" de la forma en que tenía el gestor de 114 114 tareas previo, y tampoco tiene heurísticos. Únicamente hay un parámetro 115 - central ajustable (se ha de cambiar en CONFIG_SCHED_DEBUG): 115 + central ajustable: 116 116 117 117 /sys/kernel/debug/sched/base_slice_ns 118 118
+2 -9
arch/arm/kernel/traps.c
··· 258 258 barrier(); 259 259 } 260 260 261 - #ifdef CONFIG_PREEMPT 262 - #define S_PREEMPT " PREEMPT" 263 - #elif defined(CONFIG_PREEMPT_RT) 264 - #define S_PREEMPT " PREEMPT_RT" 265 - #else 266 - #define S_PREEMPT "" 267 - #endif 268 261 #ifdef CONFIG_SMP 269 262 #define S_SMP " SMP" 270 263 #else ··· 275 282 static int die_counter; 276 283 int ret; 277 284 278 - pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP S_ISA "\n", 279 - str, err, ++die_counter); 285 + pr_emerg("Internal error: %s: %x [#%d]" S_SMP S_ISA "\n", 286 + str, err, ++die_counter); 280 287 281 288 /* trap and error numbers are mostly meaningless on ARM */ 282 289 ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV);
+1 -9
arch/arm64/kernel/traps.c
··· 172 172 printk("%sCode: %s\n", lvl, str); 173 173 } 174 174 175 - #ifdef CONFIG_PREEMPT 176 - #define S_PREEMPT " PREEMPT" 177 - #elif defined(CONFIG_PREEMPT_RT) 178 - #define S_PREEMPT " PREEMPT_RT" 179 - #else 180 - #define S_PREEMPT "" 181 - #endif 182 - 183 175 #define S_SMP " SMP" 184 176 185 177 static int __die(const char *str, long err, struct pt_regs *regs) ··· 179 187 static int die_counter; 180 188 int ret; 181 189 182 - pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", 190 + pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", 183 191 str, err, ++die_counter); 184 192 185 193 /* trap and error numbers are mostly meaningless on ARM */
+1 -2
arch/powerpc/kernel/traps.c
··· 263 263 { 264 264 printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); 265 265 266 - printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", 266 + printk("%s PAGE_SIZE=%luK%s %s%s%s%s %s\n", 267 267 IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", 268 268 PAGE_SIZE / 1024, get_mmu_str(), 269 - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", 270 269 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", 271 270 IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", 272 271 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+1 -6
arch/s390/kernel/dumpstack.c
··· 198 198 console_verbose(); 199 199 spin_lock_irq(&die_lock); 200 200 bust_spinlocks(1); 201 - printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, 201 + printk("%s: %04x ilc:%d [#%d]", str, regs->int_code & 0xffff, 202 202 regs->int_code >> 17, ++die_counter); 203 - #ifdef CONFIG_PREEMPT 204 - pr_cont("PREEMPT "); 205 - #elif defined(CONFIG_PREEMPT_RT) 206 - pr_cont("PREEMPT_RT "); 207 - #endif 208 203 pr_cont("SMP "); 209 204 if (debug_pagealloc_enabled()) 210 205 pr_cont("DEBUG_PAGEALLOC");
+2 -7
arch/x86/kernel/dumpstack.c
··· 395 395 396 396 static void __die_header(const char *str, struct pt_regs *regs, long err) 397 397 { 398 - const char *pr = ""; 399 - 400 398 /* Save the regs of the first oops for the executive summary later. */ 401 399 if (!die_counter) 402 400 exec_summary_regs = *regs; 403 401 404 - if (IS_ENABLED(CONFIG_PREEMPTION)) 405 - pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; 406 - 407 402 printk(KERN_DEFAULT 408 - "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, 409 - ++die_counter, pr, 403 + "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, 404 + ++die_counter, 410 405 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", 411 406 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", 412 407 IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+2 -2
arch/x86/kernel/tsc.c
··· 959 959 960 960 void tsc_save_sched_clock_state(void) 961 961 { 962 - if (!sched_clock_stable()) 962 + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) 963 963 return; 964 964 965 965 cyc2ns_suspend = sched_clock(); ··· 979 979 unsigned long flags; 980 980 int cpu; 981 981 982 - if (!sched_clock_stable()) 982 + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) 983 983 return; 984 984 985 985 local_irq_save(flags);
+1 -5
arch/xtensa/kernel/traps.c
··· 629 629 void __noreturn die(const char * str, struct pt_regs * regs, long err) 630 630 { 631 631 static int die_counter; 632 - const char *pr = ""; 633 - 634 - if (IS_ENABLED(CONFIG_PREEMPTION)) 635 - pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; 636 632 637 633 console_verbose(); 638 634 spin_lock_irq(&die_lock); 639 635 640 - pr_info("%s: sig: %ld [#%d]%s\n", str, err, ++die_counter, pr); 636 + pr_info("%s: sig: %ld [#%d]\n", str, err, ++die_counter); 641 637 show_regs(regs); 642 638 if (!user_mode(regs)) 643 639 show_stack(NULL, (unsigned long *)regs->areg[1], KERN_INFO);
-7
fs/proc/base.c
··· 1489 1489 #endif 1490 1490 1491 1491 1492 - #ifdef CONFIG_SCHED_DEBUG 1493 1492 /* 1494 1493 * Print out various scheduling related per-task fields: 1495 1494 */ ··· 1537 1538 .llseek = seq_lseek, 1538 1539 .release = single_release, 1539 1540 }; 1540 - 1541 - #endif 1542 1541 1543 1542 #ifdef CONFIG_SCHED_AUTOGROUP 1544 1543 /* ··· 3328 3331 ONE("status", S_IRUGO, proc_pid_status), 3329 3332 ONE("personality", S_IRUSR, proc_pid_personality), 3330 3333 ONE("limits", S_IRUGO, proc_pid_limits), 3331 - #ifdef CONFIG_SCHED_DEBUG 3332 3334 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3333 - #endif 3334 3335 #ifdef CONFIG_SCHED_AUTOGROUP 3335 3336 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), 3336 3337 #endif ··· 3677 3682 ONE("status", S_IRUGO, proc_pid_status), 3678 3683 ONE("personality", S_IRUSR, proc_pid_personality), 3679 3684 ONE("limits", S_IRUGO, proc_pid_limits), 3680 - #ifdef CONFIG_SCHED_DEBUG 3681 3685 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3682 - #endif 3683 3686 NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, 3684 3687 &proc_tid_comm_inode_operations, 3685 3688 &proc_pid_set_comm_operations, {}),
+11
include/linux/cpuset.h
··· 125 125 126 126 extern bool current_cpuset_is_being_rebound(void); 127 127 128 + extern void dl_rebuild_rd_accounting(void); 128 129 extern void rebuild_sched_domains(void); 129 130 130 131 extern void cpuset_print_current_mems_allowed(void); 132 + extern void cpuset_reset_sched_domains(void); 131 133 132 134 /* 133 135 * read_mems_allowed_begin is required when making decisions involving ··· 261 259 return false; 262 260 } 263 261 262 + static inline void dl_rebuild_rd_accounting(void) 263 + { 264 + } 265 + 264 266 static inline void rebuild_sched_domains(void) 267 + { 268 + partition_sched_domains(1, NULL, NULL); 269 + } 270 + 271 + static inline void cpuset_reset_sched_domains(void) 265 272 { 266 273 partition_sched_domains(1, NULL, NULL); 267 274 }
-2
include/linux/energy_model.h
··· 240 240 struct em_perf_state *ps; 241 241 int i; 242 242 243 - #ifdef CONFIG_SCHED_DEBUG 244 243 WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); 245 - #endif 246 244 247 245 if (!sum_util) 248 246 return 0;
+2
include/linux/preempt.h
··· 515 515 return IS_ENABLED(CONFIG_PREEMPT_RT); 516 516 } 517 517 518 + extern const char *preempt_model_str(void); 519 + 518 520 /* 519 521 * Does the preemption model allow non-cooperative preemption? 520 522 *
+5
include/linux/sched.h
··· 382 382 #ifdef CONFIG_SMP 383 383 extern struct root_domain def_root_domain; 384 384 extern struct mutex sched_domains_mutex; 385 + extern void sched_domains_mutex_lock(void); 386 + extern void sched_domains_mutex_unlock(void); 387 + #else 388 + static inline void sched_domains_mutex_lock(void) { } 389 + static inline void sched_domains_mutex_unlock(void) { } 385 390 #endif 386 391 387 392 struct sched_param {
+4
include/linux/sched/deadline.h
··· 34 34 struct root_domain; 35 35 extern void dl_add_task_root_domain(struct task_struct *p); 36 36 extern void dl_clear_root_domain(struct root_domain *rd); 37 + extern void dl_clear_root_domain_cpu(int cpu); 37 38 38 39 #endif /* CONFIG_SMP */ 40 + 41 + extern u64 dl_cookie; 42 + extern bool dl_bw_visited(int cpu, u64 cookie); 39 43 40 44 #endif /* _LINUX_SCHED_DEADLINE_H */
-2
include/linux/sched/debug.h
··· 35 35 36 36 extern void sched_show_task(struct task_struct *p); 37 37 38 - #ifdef CONFIG_SCHED_DEBUG 39 38 struct seq_file; 40 39 extern void proc_sched_show_task(struct task_struct *p, 41 40 struct pid_namespace *ns, struct seq_file *m); 42 41 extern void proc_sched_set_task(struct task_struct *p); 43 - #endif 44 42 45 43 /* Attach to any functions which should be ignored in wchan output. */ 46 44 #define __sched __section(".sched.text")
+16 -7
include/linux/sched/idle.h
··· 79 79 return unlikely(tif_need_resched()); 80 80 } 81 81 82 + static __always_inline void current_clr_polling(void) 83 + { 84 + __current_clr_polling(); 85 + 86 + /* 87 + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. 88 + * Once the bit is cleared, we'll get IPIs with every new 89 + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also 90 + * fold. 91 + */ 92 + smp_mb__after_atomic(); /* paired with resched_curr() */ 93 + 94 + preempt_fold_need_resched(); 95 + } 96 + 82 97 #else 83 98 static inline void __current_set_polling(void) { } 84 99 static inline void __current_clr_polling(void) { } ··· 106 91 { 107 92 return unlikely(tif_need_resched()); 108 93 } 109 - #endif 110 94 111 95 static __always_inline void current_clr_polling(void) 112 96 { 113 97 __current_clr_polling(); 114 98 115 - /* 116 - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. 117 - * Once the bit is cleared, we'll get IPIs with every new 118 - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also 119 - * fold. 120 - */ 121 99 smp_mb(); /* paired with resched_curr() */ 122 100 123 101 preempt_fold_need_resched(); 124 102 } 103 + #endif 125 104 126 105 #endif /* _LINUX_SCHED_IDLE_H */
+7
include/linux/sched/mm.h
··· 531 531 532 532 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) 533 533 { 534 + /* 535 + * The atomic_read() below prevents CSE. The following should 536 + * help the compiler generate more efficient code on architectures 537 + * where sync_core_before_usermode() is a no-op. 538 + */ 539 + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) 540 + return; 534 541 if (current->mm != mm) 535 542 return; 536 543 if (likely(!(atomic_read(&mm->membarrier_state) &
-14
include/linux/sched/topology.h
··· 25 25 }; 26 26 #undef SD_FLAG 27 27 28 - #ifdef CONFIG_SCHED_DEBUG 29 - 30 28 struct sd_flag_debug { 31 29 unsigned int meta_flags; 32 30 char *name; 33 31 }; 34 32 extern const struct sd_flag_debug sd_flag_debug[]; 35 - 36 - #endif 37 33 38 34 #ifdef CONFIG_SCHED_SMT 39 35 static inline int cpu_smt_flags(void) ··· 162 166 return to_cpumask(sd->span); 163 167 } 164 168 165 - extern void partition_sched_domains_locked(int ndoms_new, 166 - cpumask_var_t doms_new[], 167 - struct sched_domain_attr *dattr_new); 168 - 169 169 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 170 170 struct sched_domain_attr *dattr_new); 171 171 ··· 201 209 #else /* CONFIG_SMP */ 202 210 203 211 struct sched_domain_attr; 204 - 205 - static inline void 206 - partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], 207 - struct sched_domain_attr *dattr_new) 208 - { 209 - } 210 212 211 213 static inline void 212 214 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-2
include/trace/events/sched.h
··· 193 193 { 194 194 unsigned int state; 195 195 196 - #ifdef CONFIG_SCHED_DEBUG 197 196 BUG_ON(p != current); 198 - #endif /* CONFIG_SCHED_DEBUG */ 199 197 200 198 /* 201 199 * Preemption ignores task state, therefore preempted tasks are always
+17 -17
kernel/cgroup/cpuset.c
··· 953 953 css_task_iter_end(&it); 954 954 } 955 955 956 - static void dl_rebuild_rd_accounting(void) 956 + void dl_rebuild_rd_accounting(void) 957 957 { 958 958 struct cpuset *cs = NULL; 959 959 struct cgroup_subsys_state *pos_css; 960 + int cpu; 961 + u64 cookie = ++dl_cookie; 960 962 961 963 lockdep_assert_held(&cpuset_mutex); 962 964 lockdep_assert_cpus_held(); ··· 966 964 967 965 rcu_read_lock(); 968 966 969 - /* 970 - * Clear default root domain DL accounting, it will be computed again 971 - * if a task belongs to it. 972 - */ 973 - dl_clear_root_domain(&def_root_domain); 967 + for_each_possible_cpu(cpu) { 968 + if (dl_bw_visited(cpu, cookie)) 969 + continue; 970 + 971 + dl_clear_root_domain_cpu(cpu); 972 + } 974 973 975 974 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 976 975 ··· 990 987 css_put(&cs->css); 991 988 } 992 989 rcu_read_unlock(); 993 - } 994 - 995 - static void 996 - partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 997 - struct sched_domain_attr *dattr_new) 998 - { 999 - mutex_lock(&sched_domains_mutex); 1000 - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); 1001 - dl_rebuild_rd_accounting(); 1002 - mutex_unlock(&sched_domains_mutex); 1003 990 } 1004 991 1005 992 /* ··· 1053 1060 ndoms = generate_sched_domains(&doms, &attr); 1054 1061 1055 1062 /* Have scheduler rebuild the domains */ 1056 - partition_and_rebuild_sched_domains(ndoms, doms, attr); 1063 + partition_sched_domains(ndoms, doms, attr); 1057 1064 } 1058 1065 #else /* !CONFIG_SMP */ 1059 1066 void rebuild_sched_domains_locked(void) ··· 1073 1080 cpus_read_lock(); 1074 1081 rebuild_sched_domains_cpuslocked(); 1075 1082 cpus_read_unlock(); 1083 + } 1084 + 1085 + void cpuset_reset_sched_domains(void) 1086 + { 1087 + mutex_lock(&cpuset_mutex); 1088 + partition_sched_domains(1, NULL, NULL); 1089 + mutex_unlock(&cpuset_mutex); 1076 1090 } 1077 1091 1078 1092 /**
+89 -53
kernel/rseq.c
··· 78 78 return -EFAULT; 79 79 } 80 80 81 - static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, 82 - u32 node_id, u32 mm_cid) 83 - { 84 - rseq_kernel_fields(t)->cpu_id_start = cpu_id; 85 - rseq_kernel_fields(t)->cpu_id = cpu_id; 86 - rseq_kernel_fields(t)->node_id = node_id; 87 - rseq_kernel_fields(t)->mm_cid = mm_cid; 88 - } 81 + /* 82 + * Update an rseq field and its in-kernel copy in lock-step to keep a coherent 83 + * state. 84 + */ 85 + #define rseq_unsafe_put_user(t, value, field, error_label) \ 86 + do { \ 87 + unsafe_put_user(value, &t->rseq->field, error_label); \ 88 + rseq_kernel_fields(t)->field = value; \ 89 + } while (0) 90 + 89 91 #else 90 92 static int rseq_validate_ro_fields(struct task_struct *t) 91 93 { 92 94 return 0; 93 95 } 94 96 95 - static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, 96 - u32 node_id, u32 mm_cid) 97 - { 98 - } 97 + #define rseq_unsafe_put_user(t, value, field, error_label) \ 98 + unsafe_put_user(value, &t->rseq->field, error_label) 99 99 #endif 100 100 101 101 /* ··· 173 173 WARN_ON_ONCE((int) mm_cid < 0); 174 174 if (!user_write_access_begin(rseq, t->rseq_len)) 175 175 goto efault; 176 - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); 177 - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); 178 - unsafe_put_user(node_id, &rseq->node_id, efault_end); 179 - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); 176 + 177 + rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); 178 + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 179 + rseq_unsafe_put_user(t, node_id, node_id, efault_end); 180 + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 181 + 180 182 /* 181 183 * Additional feature fields added after ORIG_RSEQ_SIZE 182 184 * need to be conditionally updated only if 183 185 * t->rseq_len != ORIG_RSEQ_SIZE. 
184 186 */ 185 187 user_write_access_end(); 186 - rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); 187 188 trace_rseq_update(t); 188 189 return 0; 189 190 ··· 196 195 197 196 static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) 198 197 { 198 + struct rseq __user *rseq = t->rseq; 199 199 u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, 200 200 mm_cid = 0; 201 201 ··· 204 202 * Validate read-only rseq fields. 205 203 */ 206 204 if (rseq_validate_ro_fields(t)) 207 - return -EFAULT; 208 - /* 209 - * Reset cpu_id_start to its initial state (0). 210 - */ 211 - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) 212 - return -EFAULT; 213 - /* 214 - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming 215 - * in after unregistration can figure out that rseq needs to be 216 - * registered again. 217 - */ 218 - if (put_user(cpu_id, &t->rseq->cpu_id)) 219 - return -EFAULT; 220 - /* 221 - * Reset node_id to its initial state (0). 222 - */ 223 - if (put_user(node_id, &t->rseq->node_id)) 224 - return -EFAULT; 225 - /* 226 - * Reset mm_cid to its initial state (0). 227 - */ 228 - if (put_user(mm_cid, &t->rseq->mm_cid)) 229 - return -EFAULT; 205 + goto efault; 230 206 231 - rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); 207 + if (!user_write_access_begin(rseq, t->rseq_len)) 208 + goto efault; 209 + 210 + /* 211 + * Reset all fields to their initial state. 212 + * 213 + * All fields have an initial state of 0 except cpu_id which is set to 214 + * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after 215 + * unregistration can figure out that rseq needs to be registered 216 + * again. 
217 + */ 218 + rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); 219 + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 220 + rseq_unsafe_put_user(t, node_id, node_id, efault_end); 221 + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 232 222 233 223 /* 234 224 * Additional feature fields added after ORIG_RSEQ_SIZE 235 225 * need to be conditionally reset only if 236 226 * t->rseq_len != ORIG_RSEQ_SIZE. 237 227 */ 228 + user_write_access_end(); 229 + return 0; 230 + 231 + efault_end: 232 + user_write_access_end(); 233 + efault: 234 + return -EFAULT; 235 + } 236 + 237 + /* 238 + * Get the user-space pointer value stored in the 'rseq_cs' field. 239 + */ 240 + static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) 241 + { 242 + if (!rseq_cs) 243 + return -EFAULT; 244 + 245 + #ifdef CONFIG_64BIT 246 + if (get_user(*rseq_cs, &rseq->rseq_cs)) 247 + return -EFAULT; 248 + #else 249 + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) 250 + return -EFAULT; 251 + #endif 252 + 238 253 return 0; 239 254 } 240 255 256 + /* 257 + * If the rseq_cs field of 'struct rseq' contains a valid pointer to 258 + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. 259 + */ 241 260 static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) 242 261 { 243 262 struct rseq_cs __user *urseq_cs; ··· 267 244 u32 sig; 268 245 int ret; 269 246 270 - #ifdef CONFIG_64BIT 271 - if (get_user(ptr, &t->rseq->rseq_cs)) 272 - return -EFAULT; 273 - #else 274 - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) 275 - return -EFAULT; 276 - #endif 247 + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); 248 + if (ret) 249 + return ret; 250 + 251 + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ 277 252 if (!ptr) { 278 253 memset(rseq_cs, 0, sizeof(*rseq_cs)); 279 254 return 0; 280 255 } 256 + /* Check that the pointer value fits in the user-space process space. 
*/ 281 257 if (ptr >= TASK_SIZE) 282 258 return -EINVAL; 283 259 urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; ··· 352 330 return !!event_mask; 353 331 } 354 332 355 - static int clear_rseq_cs(struct task_struct *t) 333 + static int clear_rseq_cs(struct rseq __user *rseq) 356 334 { 357 335 /* 358 336 * The rseq_cs field is set to NULL on preemption or signal ··· 363 341 * Set rseq_cs to NULL. 364 342 */ 365 343 #ifdef CONFIG_64BIT 366 - return put_user(0UL, &t->rseq->rseq_cs); 344 + return put_user(0UL, &rseq->rseq_cs); 367 345 #else 368 - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) 346 + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) 369 347 return -EFAULT; 370 348 return 0; 371 349 #endif ··· 397 375 * Clear the rseq_cs pointer and return. 398 376 */ 399 377 if (!in_rseq_cs(ip, &rseq_cs)) 400 - return clear_rseq_cs(t); 378 + return clear_rseq_cs(t->rseq); 401 379 ret = rseq_need_restart(t, rseq_cs.flags); 402 380 if (ret <= 0) 403 381 return ret; 404 - ret = clear_rseq_cs(t); 382 + ret = clear_rseq_cs(t->rseq); 405 383 if (ret) 406 384 return ret; 407 385 trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, ··· 475 453 int, flags, u32, sig) 476 454 { 477 455 int ret; 456 + u64 rseq_cs; 478 457 479 458 if (flags & RSEQ_FLAG_UNREGISTER) { 480 459 if (flags & ~RSEQ_FLAG_UNREGISTER) ··· 530 507 return -EINVAL; 531 508 if (!access_ok(rseq, rseq_len)) 532 509 return -EFAULT; 510 + 511 + /* 512 + * If the rseq_cs pointer is non-NULL on registration, clear it to 513 + * avoid a potential segfault on return to user-space. The proper thing 514 + * to do would have been to fail the registration but this would break 515 + * older libcs that reuse the rseq area for new threads without 516 + * clearing the fields. 
517 + */ 518 + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) 519 + return -EFAULT; 520 + if (rseq_cs && clear_rseq_cs(rseq)) 521 + return -EFAULT; 522 + 533 523 #ifdef CONFIG_DEBUG_RSEQ 534 524 /* 535 525 * Initialize the in-kernel rseq fields copy for validation of
+1 -3
kernel/sched/build_utility.c
··· 68 68 # include "cpufreq_schedutil.c" 69 69 #endif 70 70 71 - #ifdef CONFIG_SCHED_DEBUG 72 - # include "debug.c" 73 - #endif 71 + #include "debug.c" 74 72 75 73 #ifdef CONFIG_SCHEDSTATS 76 74 # include "stats.c"
+73 -39
kernel/sched/core.c
··· 91 91 #include "autogroup.h" 92 92 #include "pelt.h" 93 93 #include "smp.h" 94 - #include "stats.h" 95 94 96 95 #include "../workqueue_internal.h" 97 96 #include "../../io_uring/io-wq.h" ··· 118 119 119 120 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 120 121 121 - #ifdef CONFIG_SCHED_DEBUG 122 122 /* 123 123 * Debugging: various feature bits 124 124 * ··· 127 129 */ 128 130 #define SCHED_FEAT(name, enabled) \ 129 131 (1UL << __SCHED_FEAT_##name) * enabled | 130 - const_debug unsigned int sysctl_sched_features = 132 + __read_mostly unsigned int sysctl_sched_features = 131 133 #include "features.h" 132 134 0; 133 135 #undef SCHED_FEAT ··· 141 143 */ 142 144 __read_mostly int sysctl_resched_latency_warn_ms = 100; 143 145 __read_mostly int sysctl_resched_latency_warn_once = 1; 144 - #endif /* CONFIG_SCHED_DEBUG */ 145 146 146 147 /* 147 148 * Number of tasks to iterate in a single balance run. 148 149 * Limited because this is done with IRQs disabled. 149 150 */ 150 - const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; 151 + __read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; 151 152 152 153 __read_mostly int scheduler_running; 153 154 ··· 797 800 if (rq->clock_update_flags & RQCF_ACT_SKIP) 798 801 return; 799 802 800 - #ifdef CONFIG_SCHED_DEBUG 801 803 if (sched_feat(WARN_DOUBLE_CLOCK)) 802 - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); 804 + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); 803 805 rq->clock_update_flags |= RQCF_UPDATED; 804 - #endif 806 + 805 807 clock = sched_clock_cpu(cpu_of(rq)); 806 808 scx_rq_clock_update(rq, clock); 807 809 ··· 1716 1720 1717 1721 bucket = &uc_rq->bucket[uc_se->bucket_id]; 1718 1722 1719 - SCHED_WARN_ON(!bucket->tasks); 1723 + WARN_ON_ONCE(!bucket->tasks); 1720 1724 if (likely(bucket->tasks)) 1721 1725 bucket->tasks--; 1722 1726 ··· 1736 1740 * Defensive programming: this should never happen. If it happens, 1737 1741 * e.g. 
due to future modification, warn and fix up the expected value. 1738 1742 */ 1739 - SCHED_WARN_ON(bucket->value > rq_clamp); 1743 + WARN_ON_ONCE(bucket->value > rq_clamp); 1740 1744 if (bucket->value >= rq_clamp) { 1741 1745 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); 1742 1746 uclamp_rq_set(rq, clamp_id, bkt_clamp); ··· 1753 1757 * The condition is constructed such that a NOP is generated when 1754 1758 * sched_uclamp_used is disabled. 1755 1759 */ 1756 - if (!static_branch_unlikely(&sched_uclamp_used)) 1760 + if (!uclamp_is_used()) 1757 1761 return; 1758 1762 1759 1763 if (unlikely(!p->sched_class->uclamp_enabled)) ··· 1780 1784 * The condition is constructed such that a NOP is generated when 1781 1785 * sched_uclamp_used is disabled. 1782 1786 */ 1783 - if (!static_branch_unlikely(&sched_uclamp_used)) 1787 + if (!uclamp_is_used()) 1784 1788 return; 1785 1789 1786 1790 if (unlikely(!p->sched_class->uclamp_enabled)) ··· 1938 1942 } 1939 1943 1940 1944 if (update_root_tg) { 1941 - static_branch_enable(&sched_uclamp_used); 1945 + sched_uclamp_enable(); 1942 1946 uclamp_update_root_tg(); 1943 1947 } 1944 1948 1945 1949 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { 1946 - static_branch_enable(&sched_uclamp_used); 1950 + sched_uclamp_enable(); 1947 1951 uclamp_sync_util_min_rt_default(); 1948 1952 } 1949 1953 ··· 2118 2122 2119 2123 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 2120 2124 { 2121 - SCHED_WARN_ON(flags & DEQUEUE_SLEEP); 2125 + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); 2122 2126 2123 2127 WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); 2124 2128 ASSERT_EXCLUSIVE_WRITER(p->on_rq); ··· 2723 2727 * XXX do further audits, this smells like something putrid. 
2724 2728 */ 2725 2729 if (ctx->flags & SCA_MIGRATE_DISABLE) 2726 - SCHED_WARN_ON(!p->on_cpu); 2730 + WARN_ON_ONCE(!p->on_cpu); 2727 2731 else 2728 2732 lockdep_assert_held(&p->pi_lock); 2729 2733 ··· 3288 3292 3289 3293 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 3290 3294 { 3291 - #ifdef CONFIG_SCHED_DEBUG 3292 3295 unsigned int state = READ_ONCE(p->__state); 3293 3296 3294 3297 /* ··· 3325 3330 WARN_ON_ONCE(!cpu_online(new_cpu)); 3326 3331 3327 3332 WARN_ON_ONCE(is_migration_disabled(p)); 3328 - #endif 3329 3333 3330 3334 trace_sched_migrate_task(p, new_cpu); 3331 3335 ··· 4185 4191 * - we're serialized against set_special_state() by virtue of 4186 4192 * it disabling IRQs (this allows not taking ->pi_lock). 4187 4193 */ 4188 - SCHED_WARN_ON(p->se.sched_delayed); 4194 + WARN_ON_ONCE(p->se.sched_delayed); 4189 4195 if (!ttwu_state_match(p, state, &success)) 4190 4196 goto out; 4191 4197 ··· 4479 4485 INIT_LIST_HEAD(&p->se.group_node); 4480 4486 4481 4487 /* A delayed task cannot be in clone(). */ 4482 - SCHED_WARN_ON(p->se.sched_delayed); 4488 + WARN_ON_ONCE(p->se.sched_delayed); 4483 4489 4484 4490 #ifdef CONFIG_FAIR_GROUP_SCHED 4485 4491 p->se.cfs_rq = NULL; ··· 5567 5573 return ns; 5568 5574 } 5569 5575 5570 - #ifdef CONFIG_SCHED_DEBUG 5571 5576 static u64 cpu_resched_latency(struct rq *rq) 5572 5577 { 5573 5578 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); ··· 5611 5618 return 1; 5612 5619 } 5613 5620 __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); 5614 - #else 5615 - static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } 5616 - #endif /* CONFIG_SCHED_DEBUG */ 5617 5621 5618 5622 /* 5619 5623 * This function gets called by the timer code, with HZ frequency. ··· 5731 5741 * we are always sure that there is no proxy (only a 5732 5742 * single task is running). 
5733 5743 */ 5734 - SCHED_WARN_ON(rq->curr != rq->donor); 5744 + WARN_ON_ONCE(rq->curr != rq->donor); 5735 5745 update_rq_clock(rq); 5736 5746 5737 5747 if (!is_idle_task(curr)) { ··· 5951 5961 preempt_count_set(PREEMPT_DISABLED); 5952 5962 } 5953 5963 rcu_sleep_check(); 5954 - SCHED_WARN_ON(ct_state() == CT_STATE_USER); 5964 + WARN_ON_ONCE(ct_state() == CT_STATE_USER); 5955 5965 5956 5966 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 5957 5967 ··· 6704 6714 picked: 6705 6715 clear_tsk_need_resched(prev); 6706 6716 clear_preempt_need_resched(); 6707 - #ifdef CONFIG_SCHED_DEBUG 6708 6717 rq->last_seen_need_resched_ns = 0; 6709 - #endif 6710 6718 6711 6719 if (likely(prev != next)) { 6712 6720 rq->nr_switches++; ··· 6795 6807 * deadlock if the callback attempts to acquire a lock which is 6796 6808 * already acquired. 6797 6809 */ 6798 - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); 6810 + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); 6799 6811 6800 6812 /* 6801 6813 * If we are going to sleep and we have plugged IO queued, ··· 7078 7090 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, 7079 7091 void *key) 7080 7092 { 7081 - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); 7093 + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); 7082 7094 return try_to_wake_up(curr->private, mode, wake_flags); 7083 7095 } 7084 7096 EXPORT_SYMBOL(default_wake_function); ··· 7632 7644 7633 7645 #else /* !CONFIG_PREEMPT_DYNAMIC: */ 7634 7646 7647 + #define preempt_dynamic_mode -1 7648 + 7635 7649 static inline void preempt_dynamic_init(void) { } 7636 7650 7637 7651 #endif /* CONFIG_PREEMPT_DYNAMIC */ 7652 + 7653 + const char *preempt_modes[] = { 7654 + "none", "voluntary", "full", "lazy", NULL, 7655 + }; 7656 + 7657 + const char *preempt_model_str(void) 7658 + { 7659 + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && 7660 + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || 7661 + 
IS_ENABLED(CONFIG_PREEMPT_LAZY)); 7662 + static char buf[128]; 7663 + 7664 + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { 7665 + struct seq_buf s; 7666 + 7667 + seq_buf_init(&s, buf, sizeof(buf)); 7668 + seq_buf_puts(&s, "PREEMPT"); 7669 + 7670 + if (IS_ENABLED(CONFIG_PREEMPT_RT)) 7671 + seq_buf_printf(&s, "%sRT%s", 7672 + brace ? "_{" : "_", 7673 + brace ? "," : ""); 7674 + 7675 + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { 7676 + seq_buf_printf(&s, "(%s)%s", 7677 + preempt_dynamic_mode > 0 ? 7678 + preempt_modes[preempt_dynamic_mode] : "undef", 7679 + brace ? "}" : ""); 7680 + return seq_buf_str(&s); 7681 + } 7682 + 7683 + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { 7684 + seq_buf_printf(&s, "LAZY%s", 7685 + brace ? "}" : ""); 7686 + return seq_buf_str(&s); 7687 + } 7688 + 7689 + return seq_buf_str(&s); 7690 + } 7691 + 7692 + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) 7693 + return "VOLUNTARY"; 7694 + 7695 + return "NONE"; 7696 + } 7638 7697 7639 7698 int io_schedule_prepare(void) 7640 7699 { ··· 7797 7762 sched_show_task(p); 7798 7763 } 7799 7764 7800 - #ifdef CONFIG_SCHED_DEBUG 7801 7765 if (!state_filter) 7802 7766 sysrq_sched_debug_show(); 7803 - #endif 7767 + 7804 7768 rcu_read_unlock(); 7805 7769 /* 7806 7770 * Only show locks if all tasks are dumped: ··· 8214 8180 * operation in the resume sequence, just build a single sched 8215 8181 * domain, ignoring cpusets. 8216 8182 */ 8217 - partition_sched_domains(1, NULL, NULL); 8183 + cpuset_reset_sched_domains(); 8218 8184 if (--num_cpus_frozen) 8219 8185 return; 8220 8186 /* ··· 8233 8199 cpuset_update_active_cpus(); 8234 8200 } else { 8235 8201 num_cpus_frozen++; 8236 - partition_sched_domains(1, NULL, NULL); 8202 + cpuset_reset_sched_domains(); 8237 8203 } 8238 8204 } 8239 8205 ··· 8455 8421 * CPU masks are stable and all blatant races in the below code cannot 8456 8422 * happen. 
8457 8423 */ 8458 - mutex_lock(&sched_domains_mutex); 8424 + sched_domains_mutex_lock(); 8459 8425 sched_init_domains(cpu_active_mask); 8460 - mutex_unlock(&sched_domains_mutex); 8426 + sched_domains_mutex_unlock(); 8461 8427 8462 8428 /* Move init over to a non-isolated CPU */ 8463 8429 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) ··· 9219 9185 unsigned int clamps; 9220 9186 9221 9187 lockdep_assert_held(&uclamp_mutex); 9222 - SCHED_WARN_ON(!rcu_read_lock_held()); 9188 + WARN_ON_ONCE(!rcu_read_lock_held()); 9223 9189 9224 9190 css_for_each_descendant_pre(css, top_css) { 9225 9191 uc_parent = css_tg(css)->parent ··· 9311 9277 if (req.ret) 9312 9278 return req.ret; 9313 9279 9314 - static_branch_enable(&sched_uclamp_used); 9280 + sched_uclamp_enable(); 9315 9281 9316 9282 guard(mutex)(&uclamp_mutex); 9317 9283 guard(rcu)(); ··· 10554 10520 struct mm_struct *mm; 10555 10521 int weight, cpu; 10556 10522 10557 - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); 10523 + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); 10558 10524 10559 10525 work->next = work; /* Prevent double-add */ 10560 10526 if (t->flags & PF_EXITING)
+1 -1
kernel/sched/core_sched.c
··· 65 65 * a cookie until after we've removed it, we must have core scheduling 66 66 * enabled here. 67 67 */ 68 - SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); 68 + WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq)); 69 69 70 70 if (sched_core_enqueued(p)) 71 71 sched_core_dequeue(rq, p, DEQUEUE_SAVE);
+28 -23
kernel/sched/deadline.c
··· 166 166 } 167 167 } 168 168 169 - static inline bool dl_bw_visited(int cpu, u64 gen) 169 + bool dl_bw_visited(int cpu, u64 cookie) 170 170 { 171 171 struct root_domain *rd = cpu_rq(cpu)->rd; 172 172 173 - if (rd->visit_gen == gen) 173 + if (rd->visit_cookie == cookie) 174 174 return true; 175 175 176 - rd->visit_gen = gen; 176 + rd->visit_cookie = cookie; 177 177 return false; 178 178 } 179 179 ··· 207 207 return SCHED_CAPACITY_SCALE; 208 208 } 209 209 210 - static inline bool dl_bw_visited(int cpu, u64 gen) 210 + bool dl_bw_visited(int cpu, u64 cookie) 211 211 { 212 212 return false; 213 213 } ··· 249 249 250 250 lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); 251 251 dl_rq->running_bw += dl_bw; 252 - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ 253 - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); 252 + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ 253 + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); 254 254 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 255 255 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); 256 256 } ··· 262 262 263 263 lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); 264 264 dl_rq->running_bw -= dl_bw; 265 - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ 265 + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ 266 266 if (dl_rq->running_bw > old) 267 267 dl_rq->running_bw = 0; 268 268 /* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ ··· 276 276 277 277 lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); 278 278 dl_rq->this_bw += dl_bw; 279 - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ 279 + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ 280 280 } 281 281 282 282 static inline ··· 286 286 287 287 lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); 288 288 dl_rq->this_bw -= dl_bw; 289 - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ 289 + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ 290 290 if (dl_rq->this_bw > old) 291 291 dl_rq->this_bw = 0; 292 - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); 292 + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); 293 293 } 294 294 295 295 static inline ··· 2956 2956 struct dl_bw *dl_b; 2957 2957 2958 2958 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2959 - if (!dl_task(p)) { 2959 + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { 2960 2960 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2961 2961 return; 2962 2962 } ··· 2981 2981 rd->dl_bw.total_bw = 0; 2982 2982 2983 2983 /* 2984 - * dl_server bandwidth is only restored when CPUs are attached to root 2985 - * domains (after domains are created or CPUs moved back to the 2986 - * default root doamin). 2984 + * dl_servers are not tasks. Since dl_add_task_root_domain ignores 2985 + * them, we need to account for them here explicitly. 
2987 2986 */ 2988 2987 for_each_cpu(i, rd->span) { 2989 2988 struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; 2990 2989 2991 2990 if (dl_server(dl_se) && cpu_active(i)) 2992 - rd->dl_bw.total_bw += dl_se->dl_bw; 2991 + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); 2993 2992 } 2993 + } 2994 + 2995 + void dl_clear_root_domain_cpu(int cpu) 2996 + { 2997 + dl_clear_root_domain(cpu_rq(cpu)->rd); 2994 2998 } 2995 2999 2996 3000 #endif /* CONFIG_SMP */ ··· 3175 3171 #endif 3176 3172 }; 3177 3173 3178 - /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ 3179 - static u64 dl_generation; 3174 + /* 3175 + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and 3176 + * sched_domains_mutex. 3177 + */ 3178 + u64 dl_cookie; 3180 3179 3181 3180 int sched_dl_global_validate(void) 3182 3181 { 3183 3182 u64 runtime = global_rt_runtime(); 3184 3183 u64 period = global_rt_period(); 3185 3184 u64 new_bw = to_ratio(period, runtime); 3186 - u64 gen = ++dl_generation; 3185 + u64 cookie = ++dl_cookie; 3187 3186 struct dl_bw *dl_b; 3188 3187 int cpu, cpus, ret = 0; 3189 3188 unsigned long flags; ··· 3199 3192 for_each_online_cpu(cpu) { 3200 3193 rcu_read_lock_sched(); 3201 3194 3202 - if (dl_bw_visited(cpu, gen)) 3195 + if (dl_bw_visited(cpu, cookie)) 3203 3196 goto next; 3204 3197 3205 3198 dl_b = dl_bw_of(cpu); ··· 3236 3229 void sched_dl_do_global(void) 3237 3230 { 3238 3231 u64 new_bw = -1; 3239 - u64 gen = ++dl_generation; 3232 + u64 cookie = ++dl_cookie; 3240 3233 struct dl_bw *dl_b; 3241 3234 int cpu; 3242 3235 unsigned long flags; ··· 3247 3240 for_each_possible_cpu(cpu) { 3248 3241 rcu_read_lock_sched(); 3249 3242 3250 - if (dl_bw_visited(cpu, gen)) { 3243 + if (dl_bw_visited(cpu, cookie)) { 3251 3244 rcu_read_unlock_sched(); 3252 3245 continue; 3253 3246 } ··· 3574 3567 } 3575 3568 #endif 3576 3569 3577 - #ifdef CONFIG_SCHED_DEBUG 3578 3570 void print_dl_stats(struct seq_file *m, int cpu) 3579 3571 { 3580 3572 
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); 3581 3573 } 3582 - #endif /* CONFIG_SCHED_DEBUG */
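The `dl_bw_visited()` rename above (generation → cookie) is a good excuse to spell out the idiom: several CPUs share one root domain, and a sweep over CPUs must process each domain exactly once. Instead of clearing a per-domain flag after every sweep, each sweep bumps a global cookie and a domain counts as visited once it has been stamped with the current value. A self-contained sketch, with illustrative names rather than the kernel's structures:

```c
#include <stdbool.h>

/* Stand-in for a root domain: just the cookie plus a work counter. */
struct domain {
	unsigned long long visit_cookie;
	int visits;
};

static unsigned long long cookie;	/* kernel: dl_cookie, under a mutex */

/* Same shape as dl_bw_visited(): stamp-and-report in one step. */
static bool visited(struct domain *d, unsigned long long c)
{
	if (d->visit_cookie == c)
		return true;
	d->visit_cookie = c;
	return false;
}

/* One sweep: CPUs map onto domains, each domain is handled once. */
static void sweep(struct domain **cpu_rd, int ncpus)
{
	unsigned long long c = ++cookie;
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++) {
		if (visited(cpu_rd[cpu], c))
			continue;
		cpu_rd[cpu]->visits++;	/* per-domain work goes here */
	}
}
```

No reset pass is ever needed: stale cookies from earlier sweeps simply compare unequal to the new value, which is why the comment near `visit_cookie` can wave away wraparound for a u64.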
+10 -8
kernel/sched/debug.c
··· 244 244 245 245 static int sched_dynamic_show(struct seq_file *m, void *v) 246 246 { 247 - static const char * preempt_modes[] = { 248 - "none", "voluntary", "full", "lazy", 249 - }; 250 - int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); 251 247 int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; 248 + int j; 249 + 250 + /* Count entries in NULL terminated preempt_modes */ 251 + for (j = 0; preempt_modes[j]; j++) 252 + ; 253 + j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); 252 254 253 255 for (; i < j; i++) { 254 256 if (preempt_dynamic_mode == i) ··· 294 292 bool orig; 295 293 296 294 cpus_read_lock(); 297 - mutex_lock(&sched_domains_mutex); 295 + sched_domains_mutex_lock(); 298 296 299 297 orig = sched_debug_verbose; 300 298 result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); ··· 306 304 sd_dentry = NULL; 307 305 } 308 306 309 - mutex_unlock(&sched_domains_mutex); 307 + sched_domains_mutex_unlock(); 310 308 cpus_read_unlock(); 311 309 312 310 return result; ··· 517 515 debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); 518 516 debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); 519 517 520 - mutex_lock(&sched_domains_mutex); 518 + sched_domains_mutex_lock(); 521 519 update_sched_domain_debugfs(); 522 - mutex_unlock(&sched_domains_mutex); 520 + sched_domains_mutex_unlock(); 523 521 #endif 524 522 525 523 #ifdef CONFIG_NUMA_BALANCING
+1 -1
kernel/sched/ext.c
··· 2472 2472 { 2473 2473 int cpu = cpu_of(rq); 2474 2474 2475 - SCHED_WARN_ON(task_cpu(p) == cpu); 2475 + WARN_ON_ONCE(task_cpu(p) == cpu); 2476 2476 2477 2477 /* 2478 2478 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+75 -56
kernel/sched/fair.c
··· 74 74 /* 75 75 * Minimal preemption granularity for CPU-bound tasks: 76 76 * 77 - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) 77 + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) 78 78 */ 79 - unsigned int sysctl_sched_base_slice = 750000ULL; 80 - static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; 79 + unsigned int sysctl_sched_base_slice = 700000ULL; 80 + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; 81 81 82 - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 82 + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; 83 83 84 84 static int __init setup_sched_thermal_decay_shift(char *str) 85 85 { ··· 399 399 400 400 static inline void assert_list_leaf_cfs_rq(struct rq *rq) 401 401 { 402 - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); 402 + WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); 403 403 } 404 404 405 405 /* Iterate through all leaf cfs_rq's on a runqueue */ ··· 696 696 { 697 697 s64 vlag, limit; 698 698 699 - SCHED_WARN_ON(!se->on_rq); 699 + WARN_ON_ONCE(!se->on_rq); 700 700 701 701 vlag = avg_vruntime(cfs_rq) - se->vruntime; 702 702 limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ··· 884 884 } 885 885 886 886 /* 887 + * HACK, stash a copy of deadline at the point of pick in vlag, 888 + * which isn't used until dequeue. 
889 + */ 890 + static inline void set_protect_slice(struct sched_entity *se) 891 + { 892 + se->vlag = se->deadline; 893 + } 894 + 895 + static inline bool protect_slice(struct sched_entity *se) 896 + { 897 + return se->vlag == se->deadline; 898 + } 899 + 900 + static inline void cancel_protect_slice(struct sched_entity *se) 901 + { 902 + if (protect_slice(se)) 903 + se->vlag = se->deadline + 1; 904 + } 905 + 906 + /* 887 907 * Earliest Eligible Virtual Deadline First 888 908 * 889 909 * In order to provide latency guarantees for different request sizes ··· 939 919 if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) 940 920 curr = NULL; 941 921 942 - /* 943 - * Once selected, run a task until it either becomes non-eligible or 944 - * until it gets a new slice. See the HACK in set_next_entity(). 945 - */ 946 - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) 922 + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) 947 923 return curr; 948 924 949 925 /* Pick the leftmost entity if it's eligible */ ··· 983 967 return best; 984 968 } 985 969 986 - #ifdef CONFIG_SCHED_DEBUG 987 970 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 988 971 { 989 972 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); ··· 1008 993 1009 994 return 0; 1010 995 } 1011 - #endif 1012 996 #endif 1013 997 1014 998 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); ··· 3315 3301 bool vma_pids_skipped; 3316 3302 bool vma_pids_forced = false; 3317 3303 3318 - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); 3304 + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 3319 3305 3320 3306 work->next = work; 3321 3307 /* ··· 4034 4020 * Make sure that rounding and/or propagation of PELT values never 4035 4021 * break this. 
4036 4022 */ 4037 - SCHED_WARN_ON(sa->load_avg || 4023 + WARN_ON_ONCE(sa->load_avg || 4038 4024 sa->util_avg || 4039 4025 sa->runnable_avg); 4040 4026 ··· 5458 5444 clear_buddies(cfs_rq, se); 5459 5445 5460 5446 if (flags & DEQUEUE_DELAYED) { 5461 - SCHED_WARN_ON(!se->sched_delayed); 5447 + WARN_ON_ONCE(!se->sched_delayed); 5462 5448 } else { 5463 5449 bool delay = sleep; 5464 5450 /* ··· 5468 5454 if (flags & DEQUEUE_SPECIAL) 5469 5455 delay = false; 5470 5456 5471 - SCHED_WARN_ON(delay && se->sched_delayed); 5457 + WARN_ON_ONCE(delay && se->sched_delayed); 5472 5458 5473 5459 if (sched_feat(DELAY_DEQUEUE) && delay && 5474 5460 !entity_eligible(cfs_rq, se)) { ··· 5544 5530 update_stats_wait_end_fair(cfs_rq, se); 5545 5531 __dequeue_entity(cfs_rq, se); 5546 5532 update_load_avg(cfs_rq, se, UPDATE_TG); 5547 - /* 5548 - * HACK, stash a copy of deadline at the point of pick in vlag, 5549 - * which isn't used until dequeue. 5550 - */ 5551 - se->vlag = se->deadline; 5533 + 5534 + set_protect_slice(se); 5552 5535 } 5553 5536 5554 5537 update_stats_curr_start(cfs_rq, se); 5555 - SCHED_WARN_ON(cfs_rq->curr); 5538 + WARN_ON_ONCE(cfs_rq->curr); 5556 5539 cfs_rq->curr = se; 5557 5540 5558 5541 /* ··· 5590 5579 if (sched_feat(PICK_BUDDY) && 5591 5580 cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 5592 5581 /* ->next will never be delayed */ 5593 - SCHED_WARN_ON(cfs_rq->next->sched_delayed); 5582 + WARN_ON_ONCE(cfs_rq->next->sched_delayed); 5594 5583 return cfs_rq->next; 5595 5584 } 5596 5585 ··· 5626 5615 /* in !on_rq case, update occurred at dequeue */ 5627 5616 update_load_avg(cfs_rq, prev, 0); 5628 5617 } 5629 - SCHED_WARN_ON(cfs_rq->curr != prev); 5618 + WARN_ON_ONCE(cfs_rq->curr != prev); 5630 5619 cfs_rq->curr = NULL; 5631 5620 } 5632 5621 ··· 5849 5838 5850 5839 cfs_rq->throttled_clock_self = 0; 5851 5840 5852 - if (SCHED_WARN_ON((s64)delta < 0)) 5841 + if (WARN_ON_ONCE((s64)delta < 0)) 5853 5842 delta = 0; 5854 5843 5855 5844 cfs_rq->throttled_clock_self_time 
+= delta; ··· 5869 5858 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5870 5859 list_del_leaf_cfs_rq(cfs_rq); 5871 5860 5872 - SCHED_WARN_ON(cfs_rq->throttled_clock_self); 5861 + WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5873 5862 if (cfs_rq->nr_queued) 5874 5863 cfs_rq->throttled_clock_self = rq_clock(rq); 5875 5864 } ··· 5978 5967 * throttled-list. rq->lock protects completion. 5979 5968 */ 5980 5969 cfs_rq->throttled = 1; 5981 - SCHED_WARN_ON(cfs_rq->throttled_clock); 5970 + WARN_ON_ONCE(cfs_rq->throttled_clock); 5982 5971 if (cfs_rq->nr_queued) 5983 5972 cfs_rq->throttled_clock = rq_clock(rq); 5984 5973 return true; ··· 6134 6123 } 6135 6124 6136 6125 /* Already enqueued */ 6137 - if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) 6126 + if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) 6138 6127 return; 6139 6128 6140 6129 first = list_empty(&rq->cfsb_csd_list); ··· 6153 6142 { 6154 6143 lockdep_assert_rq_held(rq_of(cfs_rq)); 6155 6144 6156 - if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || 6145 + if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) || 6157 6146 cfs_rq->runtime_remaining <= 0)) 6158 6147 return; 6159 6148 ··· 6189 6178 goto next; 6190 6179 6191 6180 /* By the above checks, this should never be true */ 6192 - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); 6181 + WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); 6193 6182 6194 6183 raw_spin_lock(&cfs_b->lock); 6195 6184 runtime = -cfs_rq->runtime_remaining + 1; ··· 6210 6199 * We currently only expect to be unthrottling 6211 6200 * a single cfs_rq locally. 
6212 6201 */ 6213 - SCHED_WARN_ON(!list_empty(&local_unthrottle)); 6202 + WARN_ON_ONCE(!list_empty(&local_unthrottle)); 6214 6203 list_add_tail(&cfs_rq->throttled_csd_list, 6215 6204 &local_unthrottle); 6216 6205 } ··· 6235 6224 6236 6225 rq_unlock_irqrestore(rq, &rf); 6237 6226 } 6238 - SCHED_WARN_ON(!list_empty(&local_unthrottle)); 6227 + WARN_ON_ONCE(!list_empty(&local_unthrottle)); 6239 6228 6240 6229 rcu_read_unlock(); 6241 6230 ··· 6787 6776 { 6788 6777 struct sched_entity *se = &p->se; 6789 6778 6790 - SCHED_WARN_ON(task_rq(p) != rq); 6779 + WARN_ON_ONCE(task_rq(p) != rq); 6791 6780 6792 6781 if (rq->cfs.h_nr_queued > 1) { 6793 6782 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; ··· 6898 6887 * Because a delayed entity is one that is still on 6899 6888 * the runqueue competing until elegibility. 6900 6889 */ 6901 - SCHED_WARN_ON(!se->sched_delayed); 6902 - SCHED_WARN_ON(!se->on_rq); 6890 + WARN_ON_ONCE(!se->sched_delayed); 6891 + WARN_ON_ONCE(!se->on_rq); 6903 6892 6904 6893 if (sched_feat(DELAY_ZERO)) { 6905 6894 update_entity_lag(cfs_rq, se); ··· 7002 6991 update_cfs_group(se); 7003 6992 7004 6993 se->slice = slice; 6994 + if (se != cfs_rq->curr) 6995 + min_vruntime_cb_propagate(&se->run_node, NULL); 7005 6996 slice = cfs_rq_min_slice(cfs_rq); 7006 6997 7007 6998 cfs_rq->h_nr_runnable += h_nr_runnable; ··· 7133 7120 update_cfs_group(se); 7134 7121 7135 7122 se->slice = slice; 7123 + if (se != cfs_rq->curr) 7124 + min_vruntime_cb_propagate(&se->run_node, NULL); 7136 7125 slice = cfs_rq_min_slice(cfs_rq); 7137 7126 7138 7127 cfs_rq->h_nr_runnable -= h_nr_runnable; ··· 7159 7144 rq->next_balance = jiffies; 7160 7145 7161 7146 if (p && task_delayed) { 7162 - SCHED_WARN_ON(!task_sleep); 7163 - SCHED_WARN_ON(p->on_rq != 1); 7147 + WARN_ON_ONCE(!task_sleep); 7148 + WARN_ON_ONCE(p->on_rq != 1); 7164 7149 7165 7150 /* Fix-up what dequeue_task_fair() skipped */ 7166 7151 hrtick_update(rq); ··· 8738 8723 static void set_next_buddy(struct sched_entity 
*se) 8739 8724 { 8740 8725 for_each_sched_entity(se) { 8741 - if (SCHED_WARN_ON(!se->on_rq)) 8726 + if (WARN_ON_ONCE(!se->on_rq)) 8742 8727 return; 8743 8728 if (se_is_idle(se)) 8744 8729 return; ··· 8798 8783 * Preempt an idle entity in favor of a non-idle entity (and don't preempt 8799 8784 * in the inverse case). 8800 8785 */ 8801 - if (cse_is_idle && !pse_is_idle) 8786 + if (cse_is_idle && !pse_is_idle) { 8787 + /* 8788 + * When non-idle entity preempt an idle entity, 8789 + * don't give idle entity slice protection. 8790 + */ 8791 + cancel_protect_slice(se); 8802 8792 goto preempt; 8793 + } 8794 + 8803 8795 if (cse_is_idle != pse_is_idle) 8804 8796 return; 8805 8797 ··· 8825 8803 * Note that even if @p does not turn out to be the most eligible 8826 8804 * task at this moment, current's slice protection will be lost. 8827 8805 */ 8828 - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) 8829 - se->vlag = se->deadline + 1; 8806 + if (do_preempt_short(cfs_rq, pse, se)) 8807 + cancel_protect_slice(se); 8830 8808 8831 8809 /* 8832 8810 * If @p has become the most eligible task, force preemption. 
··· 9439 9417 return 0; 9440 9418 9441 9419 /* Prevent to re-select dst_cpu via env's CPUs: */ 9442 - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 9443 - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { 9444 - env->flags |= LBF_DST_PINNED; 9445 - env->new_dst_cpu = cpu; 9446 - break; 9447 - } 9420 + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); 9421 + 9422 + if (cpu < nr_cpu_ids) { 9423 + env->flags |= LBF_DST_PINNED; 9424 + env->new_dst_cpu = cpu; 9448 9425 } 9449 9426 9450 9427 return 0; ··· 12482 12461 12483 12462 void nohz_balance_exit_idle(struct rq *rq) 12484 12463 { 12485 - SCHED_WARN_ON(rq != this_rq()); 12464 + WARN_ON_ONCE(rq != this_rq()); 12486 12465 12487 12466 if (likely(!rq->nohz_tick_stopped)) 12488 12467 return; ··· 12518 12497 { 12519 12498 struct rq *rq = cpu_rq(cpu); 12520 12499 12521 - SCHED_WARN_ON(cpu != smp_processor_id()); 12500 + WARN_ON_ONCE(cpu != smp_processor_id()); 12522 12501 12523 12502 /* If this CPU is going down, then nothing needs to be done: */ 12524 12503 if (!cpu_active(cpu)) ··· 12601 12580 int balance_cpu; 12602 12581 struct rq *rq; 12603 12582 12604 - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); 12583 + WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); 12605 12584 12606 12585 /* 12607 12586 * We assume there will be no idle load after this update and clear ··· 13041 13020 struct cfs_rq *cfs_rqb; 13042 13021 s64 delta; 13043 13022 13044 - SCHED_WARN_ON(task_rq(b)->core != rq->core); 13023 + WARN_ON_ONCE(task_rq(b)->core != rq->core); 13045 13024 13046 13025 #ifdef CONFIG_FAIR_GROUP_SCHED 13047 13026 /* ··· 13244 13223 13245 13224 static void switched_to_fair(struct rq *rq, struct task_struct *p) 13246 13225 { 13247 - SCHED_WARN_ON(p->se.sched_delayed); 13226 + WARN_ON_ONCE(p->se.sched_delayed); 13248 13227 13249 13228 attach_task_cfs_rq(p); 13250 13229 ··· 13279 13258 if (!first) 13280 13259 return; 13281 13260 13282 - SCHED_WARN_ON(se->sched_delayed); 13261 + 
WARN_ON_ONCE(se->sched_delayed); 13283 13262 13284 13263 if (hrtick_enabled_fair(rq)) 13285 13264 hrtick_start_fair(rq, p); ··· 13666 13645 #endif 13667 13646 }; 13668 13647 13669 - #ifdef CONFIG_SCHED_DEBUG 13670 13648 void print_cfs_stats(struct seq_file *m, int cpu) 13671 13649 { 13672 13650 struct cfs_rq *cfs_rq, *pos; ··· 13699 13679 rcu_read_unlock(); 13700 13680 } 13701 13681 #endif /* CONFIG_NUMA_BALANCING */ 13702 - #endif /* CONFIG_SCHED_DEBUG */ 13703 13682 13704 13683 __init void init_sched_fair_class(void) 13705 13684 {
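The slice-protection helpers introduced in the fair.c hunks rely on a neat trick: `vlag` is unused while an entity is current, so the pick path stashes a copy of the deadline there, protection holds exactly while the two fields still match, and bumping the stash by one cancels it with no extra state. A standalone sketch of the same three helpers (the struct is a stand-in, not the kernel's `sched_entity`):

```c
#include <stdbool.h>

struct entity {
	unsigned long long vlag;	/* lag; reused as a deadline stash */
	unsigned long long deadline;
};

/* At pick: stash the deadline, arming slice protection. */
static void set_protect_slice(struct entity *se)
{
	se->vlag = se->deadline;
}

/* Protection holds while the stash still equals the deadline. */
static bool protect_slice(struct entity *se)
{
	return se->vlag == se->deadline;
}

/* Cancel by perturbing the stash; idempotent by construction. */
static void cancel_protect_slice(struct entity *se)
{
	if (protect_slice(se))
		se->vlag = se->deadline + 1;
}
```

This is why the diff can cancel protection from two different sites (short-deadline preemption and the new idle-entity case) without tracking who armed it: any mismatch, however produced, reads as "not protected".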
+4 -5
kernel/sched/rt.c
··· 169 169 170 170 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 171 171 { 172 - #ifdef CONFIG_SCHED_DEBUG 173 172 WARN_ON_ONCE(!rt_entity_is_task(rt_se)); 174 - #endif 173 + 175 174 return container_of(rt_se, struct task_struct, rt); 176 175 } 177 176 ··· 1712 1713 BUG_ON(idx >= MAX_RT_PRIO); 1713 1714 1714 1715 queue = array->queue + idx; 1715 - if (SCHED_WARN_ON(list_empty(queue))) 1716 + if (WARN_ON_ONCE(list_empty(queue))) 1716 1717 return NULL; 1717 1718 next = list_entry(queue->next, struct sched_rt_entity, run_list); 1718 1719 ··· 2909 2910 int ret; 2910 2911 2911 2912 mutex_lock(&mutex); 2913 + sched_domains_mutex_lock(); 2912 2914 old_period = sysctl_sched_rt_period; 2913 2915 old_runtime = sysctl_sched_rt_runtime; 2914 2916 ··· 2936 2936 sysctl_sched_rt_period = old_period; 2937 2937 sysctl_sched_rt_runtime = old_runtime; 2938 2938 } 2939 + sched_domains_mutex_unlock(); 2939 2940 mutex_unlock(&mutex); 2940 2941 2941 2942 return ret; ··· 2968 2967 } 2969 2968 #endif /* CONFIG_SYSCTL */ 2970 2969 2971 - #ifdef CONFIG_SCHED_DEBUG 2972 2970 void print_rt_stats(struct seq_file *m, int cpu) 2973 2971 { 2974 2972 rt_rq_iter_t iter; ··· 2978 2978 print_rt_rq(m, cpu, rt_rq); 2979 2979 rcu_read_unlock(); 2980 2980 } 2981 - #endif /* CONFIG_SCHED_DEBUG */
+43 -85
kernel/sched/sched.h
··· 91 91 #include "cpupri.h" 92 92 #include "cpudeadline.h" 93 93 94 - #ifdef CONFIG_SCHED_DEBUG 95 - # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) 96 - #else 97 - # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) 98 - #endif 99 - 100 94 /* task_struct::on_rq states: */ 101 95 #define TASK_ON_RQ_QUEUED 1 102 96 #define TASK_ON_RQ_MIGRATING 2 ··· 992 998 * Also, some corner cases, like 'wrap around' is dangerous, but given 993 999 * that u64 is 'big enough'. So that shouldn't be a concern. 994 1000 */ 995 - u64 visit_gen; 1001 + u64 visit_cookie; 996 1002 997 1003 #ifdef HAVE_RT_PUSH_IPI 998 1004 /* ··· 1174 1180 1175 1181 atomic_t nr_iowait; 1176 1182 1177 - #ifdef CONFIG_SCHED_DEBUG 1178 1183 u64 last_seen_need_resched_ns; 1179 1184 int ticks_without_resched; 1180 - #endif 1181 1185 1182 1186 #ifdef CONFIG_MEMBARRIER 1183 1187 int membarrier_state; ··· 1563 1571 1564 1572 static inline struct task_struct *task_of(struct sched_entity *se) 1565 1573 { 1566 - SCHED_WARN_ON(!entity_is_task(se)); 1574 + WARN_ON_ONCE(!entity_is_task(se)); 1567 1575 return container_of(se, struct task_struct, se); 1568 1576 } 1569 1577 ··· 1644 1652 * The only reason for not seeing a clock update since the 1645 1653 * last rq_pin_lock() is if we're currently skipping updates. 
1646 1654 */ 1647 - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); 1655 + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); 1648 1656 } 1649 1657 1650 1658 static inline u64 rq_clock(struct rq *rq) ··· 1691 1699 static inline void rq_clock_start_loop_update(struct rq *rq) 1692 1700 { 1693 1701 lockdep_assert_rq_held(rq); 1694 - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); 1702 + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); 1695 1703 rq->clock_update_flags |= RQCF_ACT_SKIP; 1696 1704 } 1697 1705 ··· 1704 1712 struct rq_flags { 1705 1713 unsigned long flags; 1706 1714 struct pin_cookie cookie; 1707 - #ifdef CONFIG_SCHED_DEBUG 1708 1715 /* 1709 1716 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the 1710 1717 * current pin context is stashed here in case it needs to be 1711 1718 * restored in rq_repin_lock(). 1712 1719 */ 1713 1720 unsigned int clock_update_flags; 1714 - #endif 1715 1721 }; 1716 1722 1717 1723 extern struct balance_callback balance_push_callback; ··· 1760 1770 { 1761 1771 rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); 1762 1772 1763 - #ifdef CONFIG_SCHED_DEBUG 1764 1773 rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 1765 1774 rf->clock_update_flags = 0; 1766 - # ifdef CONFIG_SMP 1767 - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); 1768 - # endif 1775 + #ifdef CONFIG_SMP 1776 + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); 1769 1777 #endif 1770 1778 } 1771 1779 1772 1780 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) 1773 1781 { 1774 - #ifdef CONFIG_SCHED_DEBUG 1775 1782 if (rq->clock_update_flags > RQCF_ACT_SKIP) 1776 1783 rf->clock_update_flags = RQCF_UPDATED; 1777 - #endif 1784 + 1778 1785 scx_rq_clock_invalidate(rq); 1779 1786 lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); 1780 1787 } ··· 1780 1793 { 1781 1794 lockdep_repin_lock(__rq_lockp(rq), rf->cookie); 1782 1795 1783 - #ifdef 
@@ -1783 +1796 @@
-#ifdef CONFIG_SCHED_DEBUG
 	/*
 	 * Restore the value we stashed in @rf for this pin context.
 	 */
 	rq->clock_update_flags |= rf->clock_update_flags;
-#endif
 }
 
 extern
@@ -2057 +2072 @@
 	unsigned long	next_update;
 	int		imbalance;	/* XXX unrelated to capacity but shared group state */
 
-#ifdef CONFIG_SCHED_DEBUG
 	int		id;
-#endif
 
 	unsigned long	cpumask[];	/* Balance mask */
 };
@@ -2097 +2114 @@
 
 extern int group_balance_cpu(struct sched_group *sg);
 
-#ifdef CONFIG_SCHED_DEBUG
 extern void update_sched_domain_debugfs(void);
 extern void dirty_sched_domain_sysctl(int cpu);
-#else
-static inline void update_sched_domain_debugfs(void) { }
-static inline void dirty_sched_domain_sysctl(int cpu) { }
-#endif
 
 extern int sched_update_scaling(void);
@@ -2178 +2200 @@
 }
 
 /*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ * Tunables:
  */
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug	__read_mostly
-#else
-# define const_debug	const
-#endif
 
 #define SCHED_FEAT(name, enabled)	\
 	__SCHED_FEAT_##name ,
@@ -2191 +2218 @@
 
 #undef SCHED_FEAT
 
-#ifdef CONFIG_SCHED_DEBUG
-
 /*
  * To support run-time toggling of sched features, all the translation units
  * (but core.c) reference the sysctl_sched_features defined in core.c.
  */
-extern const_debug unsigned int sysctl_sched_features;
+extern __read_mostly unsigned int sysctl_sched_features;
 
 #ifdef CONFIG_JUMP_LABEL
 
@@ -2216 +2245 @@
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 
 #endif /* !CONFIG_JUMP_LABEL */
-
-#else /* !SCHED_DEBUG: */
-
-/*
- * Each translation unit has its own copy of sysctl_sched_features to allow
- * constants propagation at compile time and compiler optimization based on
- * features default.
- */
-#define SCHED_FEAT(name, enabled)	\
-	(1UL << __SCHED_FEAT_##name) * enabled |
-static const_debug __maybe_unused unsigned int sysctl_sched_features =
-#include "features.h"
-	0;
-#undef SCHED_FEAT
-
-#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-
-#endif /* !SCHED_DEBUG */
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;
@@ -2638 +2685 @@
 
 static inline struct cpuidle_state *idle_get_state(struct rq *rq)
 {
-	SCHED_WARN_ON(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	return rq->idle_state;
 }
@@ -2796 +2843 @@
 # define SCHED_NR_MIGRATE_BREAK 32
 #endif
 
-extern const_debug unsigned int sysctl_sched_nr_migrate;
-extern const_debug unsigned int sysctl_sched_migration_cost;
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
 
 extern unsigned int sysctl_sched_base_slice;
 
-#ifdef CONFIG_SCHED_DEBUG
 extern int sysctl_resched_latency_warn_ms;
 extern int sysctl_resched_latency_warn_once;
 
@@ -2811 +2859 @@
 extern unsigned int sysctl_numa_balancing_scan_period_max;
 extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_hot_threshold;
-#endif
 
 #ifdef CONFIG_SCHED_HRTICK
 
@@ -2883 +2932 @@
 }
 #endif
 
-#ifdef CONFIG_SCHED_DEBUG
 /*
  * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
  * acquire rq lock instead of rq_lock(). So at the end of these two functions
@@ -2897 +2947 @@
 	rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 #endif
 }
-#else
-static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
-#endif
 
 #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...)	\
 	__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__)	\
@@ -3109 +3162 @@
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
-#ifdef CONFIG_SCHED_DEBUG
 extern bool sched_debug_verbose;
 
 extern void print_cfs_stats(struct seq_file *m, int cpu);
@@ -3119 +3173 @@
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
 extern void resched_latency_warn(int cpu, u64 latency);
-# ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_NUMA_BALANCING
 extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
 extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
		 unsigned long tpf, unsigned long gsf, unsigned long gpf);
-# endif /* CONFIG_NUMA_BALANCING */
-#else /* !CONFIG_SCHED_DEBUG: */
-static inline void resched_latency_warn(int cpu, u64 latency) { }
-#endif /* !CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -3337 +3394 @@
 
 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
 
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+	return static_branch_likely(&sched_uclamp_used);
+}
+
+/*
+ * Enabling a static branch takes cpus_read_lock(). Since the static key is
+ * never disabled once enabled, check uclamp_is_used() first so that callers
+ * do not take cpus_read_lock() on every invocation.
+ */
+static inline void sched_uclamp_enable(void)
+{
+	if (!uclamp_is_used())
+		static_branch_enable(&sched_uclamp_used);
+}
+
 static inline unsigned long uclamp_rq_get(struct rq *rq,
					  enum uclamp_id clamp_id)
 {
@@ -3385 +3417 @@
 	unsigned long rq_util;
 	unsigned long max_util;
 
-	if (!static_branch_likely(&sched_uclamp_used))
+	if (!uclamp_is_used())
		return false;
 
 	rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
 	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
 
 	return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
-}
-
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
-	return static_branch_likely(&sched_uclamp_used);
 }
 
 #define for_each_clamp_id(clamp_id)	\
@@ -3440 +3485 @@
 {
 	return false;
 }
+
+static inline void sched_uclamp_enable(void) {}
 
 static inline unsigned long
 uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
@@ -3576 +3619 @@
 extern int sched_dynamic_mode(const char *str);
 extern void sched_dynamic_update(int mode);
 #endif
+extern const char *preempt_modes[];
 
 #ifdef CONFIG_SCHED_MM_CID
+1 -1
kernel/sched/stats.h
@@ -144 +144 @@
 
 	if (p->se.sched_delayed) {
 		/* CPU migration of "sleeping" task */
-		SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
+		WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
 		if (p->in_iowait)
+6 -6
kernel/sched/syscalls.c
@@ -368 +368 @@
	 * blocking operation which obviously cannot be done while holding
	 * scheduler locks.
	 */
-	static_branch_enable(&sched_uclamp_used);
+	sched_uclamp_enable();
 
	return 0;
 }
@@ -875 +875 @@
 {
	struct sched_param lparam;
 
-	if (!param || pid < 0)
+	if (unlikely(!param || pid < 0))
		return -EINVAL;
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
		return -EFAULT;
@@ -984 +984 @@
	struct sched_attr attr;
	int retval;
 
-	if (!uattr || pid < 0 || flags)
+	if (unlikely(!uattr || pid < 0 || flags))
		return -EINVAL;
 
	retval = sched_copy_attr(uattr, &attr);
@@ -1049 +1049 @@
	struct task_struct *p;
	int retval;
 
-	if (!param || pid < 0)
+	if (unlikely(!param || pid < 0))
		return -EINVAL;
 
	scoped_guard (rcu) {
@@ -1085 +1085 @@
	struct task_struct *p;
	int retval;
 
-	if (!uattr || pid < 0 || usize > PAGE_SIZE ||
-	    usize < SCHED_ATTR_SIZE_VER0 || flags)
+	if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
+		     usize < SCHED_ATTR_SIZE_VER0 || flags))
		return -EINVAL;
 
	scoped_guard (rcu) {
+14 -31
kernel/sched/topology.c
@@ -6 +6 @@
 #include <linux/bsearch.h>
 
 DEFINE_MUTEX(sched_domains_mutex);
+void sched_domains_mutex_lock(void)
+{
+	mutex_lock(&sched_domains_mutex);
+}
+void sched_domains_mutex_unlock(void)
+{
+	mutex_unlock(&sched_domains_mutex);
+}
 
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
-
-#ifdef CONFIG_SCHED_DEBUG
 
 static int __init sched_debug_setup(char *str)
 {
@@ -157 +151 @@
		break;
	}
 }
-#else /* !CONFIG_SCHED_DEBUG */
-
-# define sched_debug_verbose 0
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
-	return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
 
 /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
 #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
@@ -557 +560 @@
	rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
 #endif
 
-	rd->visit_gen = 0;
+	rd->visit_cookie = 0;
	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;
@@ -2272 +2275 @@
	if (!sgc)
		return -ENOMEM;
 
-#ifdef CONFIG_SCHED_DEBUG
	sgc->id = j;
-#endif
 
	*per_cpu_ptr(sdd->sgc, j) = sgc;
 }
@@ -2675 +2680 @@
 *
 * Call with hotplug lock and sched_domains_mutex held
 */
-void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
 {
	bool __maybe_unused has_eas = false;
@@ -2707 +2712 @@
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
-				struct root_domain *rd;
-
-				/*
-				 * This domain won't be destroyed and as such
-				 * its dl_bw->total_bw needs to be cleared.
-				 * Tasks contribution will be then recomputed
-				 * in function dl_update_tasks_root_domain(),
-				 * dl_servers contribution in function
-				 * dl_restore_server_root_domain().
-				 */
-				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
-				dl_clear_root_domain(rd);
+			    dattrs_equal(dattr_cur, i, dattr_new, j))
				goto match1;
-			}
		}
		/* No match - a current sched domain not in new doms_new[] */
		detach_destroy_domains(doms_cur[i]);
@@ -2765 +2783 @@
	ndoms_cur = ndoms_new;
 
	update_sched_domain_debugfs();
+	dl_rebuild_rd_accounting();
 }
 
 /*
@@ -2774 +2791 @@
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
 {
-	mutex_lock(&sched_domains_mutex);
+	sched_domains_mutex_lock();
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
-	mutex_unlock(&sched_domains_mutex);
+	sched_domains_mutex_unlock();
 }
+1 -6
kernel/trace/trace.c
@@ -4100 +4100 @@
		   entries,
		   total,
		   buf->cpu,
-		   preempt_model_none()      ? "server" :
-		   preempt_model_voluntary() ? "desktop" :
-		   preempt_model_full()      ? "preempt" :
-		   preempt_model_lazy()      ? "lazy"    :
-		   preempt_model_rt()        ? "preempt_rt" :
-		   "unknown",
+		   preempt_model_str(),
		   /* These are reserved for later use */
		   0, 0, 0, 0);
 #ifdef CONFIG_SMP
-9
lib/Kconfig.debug
@@ -1321 +1321 @@
 
 menu "Scheduler Debugging"
 
-config SCHED_DEBUG
-	bool "Collect scheduler debugging info"
-	depends on DEBUG_KERNEL && DEBUG_FS
-	default y
-	help
-	  If you say Y here, the /sys/kernel/debug/sched file will be provided
-	  that can help debug the scheduler. The runtime overhead of this
-	  option is minimal.
-
 config SCHED_INFO
	bool
	default n
+2 -2
lib/dump_stack.c
@@ -54 +54 @@
  */
 void dump_stack_print_info(const char *log_lvl)
 {
-	printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n",
+	printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s %s " BUILD_ID_FMT "\n",
	       log_lvl, raw_smp_processor_id(),
	       __kuid_val(current_real_cred()->euid),
	       current->pid, current->comm,
@@ -62 +62 @@
	       print_tainted(),
	       init_utsname()->release,
	       (int)strcspn(init_utsname()->version, " "),
-	       init_utsname()->version, BUILD_ID_VAL);
+	       init_utsname()->version, preempt_model_str(), BUILD_ID_VAL);
 
	if (get_taint())
		printk("%s%s\n", log_lvl, print_tainted_verbose());
+1
tools/testing/selftests/rseq/.gitignore
@@ -9 +9 @@
 param_test_mm_cid
 param_test_mm_cid_benchmark
 param_test_mm_cid_compare_twice
+syscall_errors_test
+7 -2
tools/testing/selftests/rseq/Makefile
@@ -16 +16 @@
 
 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
		param_test_benchmark param_test_compare_twice param_test_mm_cid \
-		param_test_mm_cid_benchmark param_test_mm_cid_compare_twice
+		param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \
+		syscall_errors_test
 
 TEST_GEN_PROGS_EXTENDED = librseq.so
 
-TEST_PROGS = run_param_test.sh
+TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh
 
 TEST_FILES := settings
 
@@ -55 +54 @@
 $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
					rseq.h rseq-*.h
	$(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \
+					rseq.h rseq-*.h
+	$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
+19 -8
tools/testing/selftests/rseq/rseq.c
@@ -71 +71 @@
 /* Original struct rseq allocation size is 32 bytes. */
 #define ORIG_RSEQ_ALLOC_SIZE	32
 
+/*
+ * Use a union to ensure we allocate a TLS area of 1024 bytes to accommodate an
+ * rseq registration that is larger than the current rseq ABI.
+ */
+union rseq_tls {
+	struct rseq_abi abi;
+	char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
+};
+
 static
-__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = {
-	.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
+__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
+	.abi = {
+		.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
+	},
 };
 
 static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
@@ -98 +87 @@
	return syscall(__NR_getcpu, cpu, node, NULL);
 }
 
-int rseq_available(void)
+bool rseq_available(void)
 {
	int rc;
 
@@ -107 +96 @@
		abort();
	switch (errno) {
	case ENOSYS:
-		return 0;
+		return false;
	case EINVAL:
-		return 1;
+		return true;
	default:
		abort();
	}
@@ -160 +149 @@
		/* Treat libc's ownership as a successful registration. */
		return 0;
	}
-	rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
+	rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
	if (rc) {
		/*
		 * After at least one thread has registered successfully
@@ -194 +183 @@
		/* Treat libc's ownership as a successful unregistration. */
		return 0;
	}
-	rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
+	rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
	if (rc)
		return -1;
	return 0;
@@ -260 +249 @@
	rseq_ownership = 1;
 
	/* Calculate the offset of the rseq area from the thread pointer. */
-	rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
+	rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer();
 
	/* rseq flags are deprecated, always set to 0. */
	rseq_flags = 0;
+5
tools/testing/selftests/rseq/rseq.h
@@ -160 +160 @@
 int32_t rseq_fallback_current_node(void);
 
 /*
+ * Returns true if rseq is supported.
+ */
+bool rseq_available(void);
+
+/*
  * Values returned can be either the current CPU number, -1 (rseq is
  * uninitialized), or -2 (rseq initialization has failed).
  */
+5
tools/testing/selftests/rseq/run_syscall_errors_test.sh
@@ -0 +1 @@
+#!/bin/bash
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2024 Michael Jeanson <mjeanson@efficios.com>
+
+GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./syscall_errors_test
+124
tools/testing/selftests/rseq/syscall_errors_test.c
@@ -0 +1 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2024 Michael Jeanson <mjeanson@efficios.com>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#include <syscall.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "rseq.h"
+
+static int sys_rseq(void *rseq_abi, uint32_t rseq_len,
+		    int flags, uint32_t sig)
+{
+	return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
+}
+
+/*
+ * Check the value of errno on some expected failures of the rseq syscall.
+ */
+
+int main(void)
+{
+	struct rseq_abi *global_rseq = rseq_get_abi();
+	int ret;
+	int errno_copy;
+
+	if (!rseq_available()) {
+		fprintf(stderr, "rseq syscall unavailable");
+		goto error;
+	}
+
+	/* The current thread is NOT registered. */
+
+	/* EINVAL */
+	errno = 0;
+	ret = sys_rseq(global_rseq, 32, -1, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Registration with invalid flag fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EINVAL)
+		goto error;
+
+	errno = 0;
+	ret = sys_rseq((char *) global_rseq + 1, 32, 0, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Registration with unaligned rseq_abi fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EINVAL)
+		goto error;
+
+	errno = 0;
+	ret = sys_rseq(global_rseq, 31, 0, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Registration with invalid size fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EINVAL)
+		goto error;
+
+
+#if defined(__LP64__) && (!defined(__s390__) && !defined(__s390x__))
+	/*
+	 * We haven't found a reliable way to find an invalid address when
+	 * running a 32bit userspace on a 64bit kernel, so only run this test
+	 * on 64bit builds for the moment.
+	 *
+	 * Also exclude architectures that select
+	 * CONFIG_ALTERNATE_USER_ADDRESS_SPACE where the kernel and userspace
+	 * have their own address space and this failure can't happen.
+	 */
+
+	/* EFAULT */
+	errno = 0;
+	ret = sys_rseq((void *) -4096UL, 32, 0, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Registration with invalid address fails with errno set to EFAULT (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EFAULT)
+		goto error;
+#endif
+
+	errno = 0;
+	ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Registration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret != 0 && errno != 0)
+		goto error;
+
+	/* The current thread is registered. */
+
+	/* EBUSY */
+	errno = 0;
+	ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Double registration fails with errno set to EBUSY (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EBUSY)
+		goto error;
+
+	/* EPERM */
+	errno = 0;
+	ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG + 1);
+	errno_copy = errno;
+	fprintf(stderr, "Unregistration with wrong RSEQ_SIG fails with errno set to EPERM (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EPERM)
+		goto error;
+
+	errno = 0;
+	ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Unregistration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret != 0)
+		goto error;
+
+	errno = 0;
+	ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
+	errno_copy = errno;
+	fprintf(stderr, "Double unregistration fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy));
+	if (ret == 0 || errno_copy != EINVAL)
+		goto error;
+
+	return 0;
+error:
+	return -1;
+}
+1 -1
tools/testing/selftests/sched/config
@@ -1 +1 @@
-CONFIG_SCHED_DEBUG=y
+# empty
-1
tools/testing/selftests/sched_ext/config
@@ -1 +1 @@
-CONFIG_SCHED_DEBUG=y
 CONFIG_SCHED_CLASS_EXT=y
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_SCHED=y
-1
tools/testing/selftests/wireguard/qemu/debug.config
@@ -27 +27 @@
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_WQ_WATCHDOG=y
-CONFIG_SCHED_DEBUG=y
 CONFIG_SCHED_INFO=y
 CONFIG_SCHEDSTATS=y
 CONFIG_SCHED_STACK_END_CHECK=y