Merge tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+4

Documentation/accounting/psi.rst

··· 105 105 after which monitors are most likely not needed and psi averages can be used 106 106 instead. 107 107 108 + Unprivileged users can also create monitors, with the only limitation that the 109 + window size must be a multiple of 2s, in order to prevent excessive resource 110 + usage. 111 + 108 112 When activated, psi monitor stays active for at least the duration of one 109 113 tracking window to avoid repeated activations/deactivations when system is 110 114 bouncing in and out of the stall state.

+1 -2

drivers/vhost/vhost.c

··· 361 361 kcov_remote_start_common(worker->kcov_handle); 362 362 work->fn(work); 363 363 kcov_remote_stop(); 364 - if (need_resched()) 365 - schedule(); 364 + cond_resched(); 366 365 } 367 366 } 368 367

+1

include/linux/livepatch.h

··· 13 13 #include <linux/ftrace.h> 14 14 #include <linux/completion.h> 15 15 #include <linux/list.h> 16 + #include <linux/livepatch_sched.h> 16 17 17 18 #if IS_ENABLED(CONFIG_LIVEPATCH) 18 19

+29

include/linux/livepatch_sched.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + #ifndef _LINUX_LIVEPATCH_SCHED_H_ 3 + #define _LINUX_LIVEPATCH_SCHED_H_ 4 + 5 + #include <linux/jump_label.h> 6 + #include <linux/static_call_types.h> 7 + 8 + #ifdef CONFIG_LIVEPATCH 9 + 10 + void __klp_sched_try_switch(void); 11 + 12 + #if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 13 + 14 + DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key); 15 + 16 + static __always_inline void klp_sched_try_switch(void) 17 + { 18 + if (static_branch_unlikely(&klp_sched_try_switch_key)) 19 + __klp_sched_try_switch(); 20 + } 21 + 22 + #endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 23 + 24 + #else /* !CONFIG_LIVEPATCH */ 25 + static inline void klp_sched_try_switch(void) {} 26 + static inline void __klp_sched_try_switch(void) {} 27 + #endif /* CONFIG_LIVEPATCH */ 28 + 29 + #endif /* _LINUX_LIVEPATCH_SCHED_H_ */

+74 -8

include/linux/mm_types.h

··· 573 573 struct vm_userfaultfd_ctx vm_userfaultfd_ctx; 574 574 } __randomize_layout; 575 575 576 + #ifdef CONFIG_SCHED_MM_CID 577 + struct mm_cid { 578 + u64 time; 579 + int cid; 580 + }; 581 + #endif 582 + 576 583 struct kioctx_table; 577 584 struct mm_struct { 578 585 struct { ··· 630 623 atomic_t mm_count; 631 624 #ifdef CONFIG_SCHED_MM_CID 632 625 /** 633 - * @cid_lock: Protect cid bitmap updates vs lookups. 626 + * @pcpu_cid: Per-cpu current cid. 634 627 * 635 - * Prevent situations where updates to the cid bitmap happen 636 - * concurrently with lookups. Those can lead to situations 637 - * where a lookup cannot find a free bit simply because it was 638 - * unlucky enough to load, non-atomically, bitmap words as they 639 - * were being concurrently updated by the updaters. 628 + * Keep track of the currently allocated mm_cid for each cpu. 629 + * The per-cpu mm_cid values are serialized by their respective 630 + * runqueue locks. 640 631 */ 641 - raw_spinlock_t cid_lock; 632 + struct mm_cid __percpu *pcpu_cid; 633 + /* 634 + * @mm_cid_next_scan: Next mm_cid scan (in jiffies). 635 + * 636 + * When the next mm_cid scan is due (in jiffies). 637 + */ 638 + unsigned long mm_cid_next_scan; 642 639 #endif 643 640 #ifdef CONFIG_MMU 644 641 atomic_long_t pgtables_bytes; /* size of all page tables */ ··· 910 899 } 911 900 912 901 #ifdef CONFIG_SCHED_MM_CID 902 + 903 + enum mm_cid_state { 904 + MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. 
*/ 905 + MM_CID_LAZY_PUT = (1U << 31), 906 + }; 907 + 908 + static inline bool mm_cid_is_unset(int cid) 909 + { 910 + return cid == MM_CID_UNSET; 911 + } 912 + 913 + static inline bool mm_cid_is_lazy_put(int cid) 914 + { 915 + return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); 916 + } 917 + 918 + static inline bool mm_cid_is_valid(int cid) 919 + { 920 + return !(cid & MM_CID_LAZY_PUT); 921 + } 922 + 923 + static inline int mm_cid_set_lazy_put(int cid) 924 + { 925 + return cid | MM_CID_LAZY_PUT; 926 + } 927 + 928 + static inline int mm_cid_clear_lazy_put(int cid) 929 + { 930 + return cid & ~MM_CID_LAZY_PUT; 931 + } 932 + 913 933 /* Accessor for struct mm_struct's cidmask. */ 914 934 static inline cpumask_t *mm_cidmask(struct mm_struct *mm) 915 935 { ··· 954 912 955 913 static inline void mm_init_cid(struct mm_struct *mm) 956 914 { 957 - raw_spin_lock_init(&mm->cid_lock); 915 + int i; 916 + 917 + for_each_possible_cpu(i) { 918 + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 919 + 920 + pcpu_cid->cid = MM_CID_UNSET; 921 + pcpu_cid->time = 0; 922 + } 958 923 cpumask_clear(mm_cidmask(mm)); 924 + } 925 + 926 + static inline int mm_alloc_cid(struct mm_struct *mm) 927 + { 928 + mm->pcpu_cid = alloc_percpu(struct mm_cid); 929 + if (!mm->pcpu_cid) 930 + return -ENOMEM; 931 + mm_init_cid(mm); 932 + return 0; 933 + } 934 + 935 + static inline void mm_destroy_cid(struct mm_struct *mm) 936 + { 937 + free_percpu(mm->pcpu_cid); 938 + mm->pcpu_cid = NULL; 959 939 } 960 940 961 941 static inline unsigned int mm_cid_size(void) ··· 986 922 } 987 923 #else /* CONFIG_SCHED_MM_CID */ 988 924 static inline void mm_init_cid(struct mm_struct *mm) { } 925 + static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } 926 + static inline void mm_destroy_cid(struct mm_struct *mm) { } 989 927 static inline unsigned int mm_cid_size(void) 990 928 { 991 929 return 0;

+1 -1

include/linux/psi.h

··· 24 24 25 25 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); 26 26 struct psi_trigger *psi_trigger_create(struct psi_group *group, 27 - char *buf, enum psi_res res); 27 + char *buf, enum psi_res res, struct file *file); 28 28 void psi_trigger_destroy(struct psi_trigger *t); 29 29 30 30 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,

+23 -16

include/linux/psi_types.h

··· 151 151 152 152 /* Deferred event(s) from previous ratelimit window */ 153 153 bool pending_event; 154 + 155 + /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */ 156 + enum psi_aggregators aggregator; 154 157 }; 155 158 156 159 struct psi_group { ··· 174 171 /* Aggregator work control */ 175 172 struct delayed_work avgs_work; 176 173 174 + /* Unprivileged triggers against N*PSI_FREQ windows */ 175 + struct list_head avg_triggers; 176 + u32 avg_nr_triggers[NR_PSI_STATES - 1]; 177 + 177 178 /* Total stall times and sampled pressure averages */ 178 179 u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; 179 180 unsigned long avg[NR_PSI_STATES - 1][3]; 180 181 181 - /* Monitor work control */ 182 - struct task_struct __rcu *poll_task; 183 - struct timer_list poll_timer; 184 - wait_queue_head_t poll_wait; 185 - atomic_t poll_wakeup; 186 - atomic_t poll_scheduled; 182 + /* Monitor RT polling work control */ 183 + struct task_struct __rcu *rtpoll_task; 184 + struct timer_list rtpoll_timer; 185 + wait_queue_head_t rtpoll_wait; 186 + atomic_t rtpoll_wakeup; 187 + atomic_t rtpoll_scheduled; 187 188 188 189 /* Protects data used by the monitor */ 189 - struct mutex trigger_lock; 190 + struct mutex rtpoll_trigger_lock; 190 191 191 - /* Configured polling triggers */ 192 - struct list_head triggers; 193 - u32 nr_triggers[NR_PSI_STATES - 1]; 194 - u32 poll_states; 195 - u64 poll_min_period; 192 + /* Configured RT polling triggers */ 193 + struct list_head rtpoll_triggers; 194 + u32 rtpoll_nr_triggers[NR_PSI_STATES - 1]; 195 + u32 rtpoll_states; 196 + u64 rtpoll_min_period; 196 197 197 - /* Total stall times at the start of monitor activation */ 198 - u64 polling_total[NR_PSI_STATES - 1]; 199 - u64 polling_next_update; 200 - u64 polling_until; 198 + /* Total stall times at the start of RT polling monitor activation */ 199 + u64 rtpoll_total[NR_PSI_STATES - 1]; 200 + u64 rtpoll_next_update; 201 + u64 rtpoll_until; 201 202 }; 202 203 203 204 #else /* CONFIG_PSI */

+18 -5

include/linux/sched.h

··· 36 36 #include <linux/seqlock.h> 37 37 #include <linux/kcsan.h> 38 38 #include <linux/rv.h> 39 + #include <linux/livepatch_sched.h> 39 40 #include <asm/kmap_size.h> 40 41 41 42 /* task_struct member predeclarations (sorted alphabetically): */ ··· 1314 1313 1315 1314 #ifdef CONFIG_SCHED_MM_CID 1316 1315 int mm_cid; /* Current cid in mm */ 1316 + int last_mm_cid; /* Most recent cid in mm */ 1317 + int migrate_from_cpu; 1317 1318 int mm_cid_active; /* Whether cid bitmap is active */ 1319 + struct callback_head cid_work; 1318 1320 #endif 1319 1321 1320 1322 struct tlbflush_unmap_batch tlb_ubc; ··· 2071 2067 2072 2068 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 2073 2069 2070 + void sched_dynamic_klp_enable(void); 2071 + void sched_dynamic_klp_disable(void); 2072 + 2074 2073 DECLARE_STATIC_CALL(cond_resched, __cond_resched); 2075 2074 2076 2075 static __always_inline int _cond_resched(void) ··· 2082 2075 } 2083 2076 2084 2077 #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 2078 + 2085 2079 extern int dynamic_cond_resched(void); 2086 2080 2087 2081 static __always_inline int _cond_resched(void) ··· 2090 2082 return dynamic_cond_resched(); 2091 2083 } 2092 2084 2093 - #else 2085 + #else /* !CONFIG_PREEMPTION */ 2094 2086 2095 2087 static inline int _cond_resched(void) 2096 2088 { 2089 + klp_sched_try_switch(); 2097 2090 return __cond_resched(); 2098 2091 } 2099 2092 2100 - #endif /* CONFIG_PREEMPT_DYNAMIC */ 2093 + #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 2101 2094 2102 - #else 2095 + #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ 2103 2096 2104 - static inline int _cond_resched(void) { return 0; } 2097 + static inline int _cond_resched(void) 2098 + { 2099 + klp_sched_try_switch(); 2100 + return 0; 2101 + } 2105 2102 2106 - #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ 2103 + #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ 
2107 2104 2108 2105 #define cond_resched() ({ \ 2109 2106 __might_resched(__FILE__, __LINE__, 0); \

+5

include/linux/sched/mm.h

··· 37 37 atomic_inc(&mm->mm_count); 38 38 } 39 39 40 + static inline void smp_mb__after_mmgrab(void) 41 + { 42 + smp_mb__after_atomic(); 43 + } 44 + 40 45 extern void __mmdrop(struct mm_struct *mm); 41 46 42 47 static inline void mmdrop(struct mm_struct *mm)

+1 -1

kernel/cgroup/cgroup.c

··· 3771 3771 } 3772 3772 3773 3773 psi = cgroup_psi(cgrp); 3774 - new = psi_trigger_create(psi, buf, res); 3774 + new = psi_trigger_create(psi, buf, res, of->file); 3775 3775 if (IS_ERR(new)) { 3776 3776 cgroup_put(cgrp); 3777 3777 return PTR_ERR(new);

+8 -1

kernel/fork.c

··· 924 924 check_mm(mm); 925 925 put_user_ns(mm->user_ns); 926 926 mm_pasid_drop(mm); 927 + mm_destroy_cid(mm); 927 928 928 929 for (i = 0; i < NR_MM_COUNTERS; i++) 929 930 percpu_counter_destroy(&mm->rss_stat[i]); ··· 1189 1188 1190 1189 #ifdef CONFIG_SCHED_MM_CID 1191 1190 tsk->mm_cid = -1; 1191 + tsk->last_mm_cid = -1; 1192 1192 tsk->mm_cid_active = 0; 1193 + tsk->migrate_from_cpu = -1; 1193 1194 #endif 1194 1195 return tsk; 1195 1196 ··· 1299 1296 if (init_new_context(p, mm)) 1300 1297 goto fail_nocontext; 1301 1298 1299 + if (mm_alloc_cid(mm)) 1300 + goto fail_cid; 1301 + 1302 1302 for (i = 0; i < NR_MM_COUNTERS; i++) 1303 1303 if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) 1304 1304 goto fail_pcpu; 1305 1305 1306 1306 mm->user_ns = get_user_ns(user_ns); 1307 1307 lru_gen_init_mm(mm); 1308 - mm_init_cid(mm); 1309 1308 return mm; 1310 1309 1311 1310 fail_pcpu: 1312 1311 while (i > 0) 1313 1312 percpu_counter_destroy(&mm->rss_stat[--i]); 1313 + mm_destroy_cid(mm); 1314 + fail_cid: 1314 1315 destroy_context(mm); 1315 1316 fail_nocontext: 1316 1317 mm_free_pgd(mm);

+1

kernel/livepatch/core.c

··· 33 33 * 34 34 * - klp_ftrace_handler() 35 35 * - klp_update_patch_state() 36 + * - __klp_sched_try_switch() 36 37 */ 37 38 DEFINE_MUTEX(klp_mutex); 38 39

+104 -18

kernel/livepatch/transition.c

··· 9 9 10 10 #include <linux/cpu.h> 11 11 #include <linux/stacktrace.h> 12 + #include <linux/static_call.h> 12 13 #include "core.h" 13 14 #include "patch.h" 14 15 #include "transition.h" 15 16 16 17 #define MAX_STACK_ENTRIES 100 18 + DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); 19 + 17 20 #define STACK_ERR_BUF_SIZE 128 18 21 19 22 #define SIGNALS_TIMEOUT 15 ··· 26 23 static int klp_target_state = KLP_UNDEFINED; 27 24 28 25 static unsigned int klp_signals_cnt; 26 + 27 + /* 28 + * When a livepatch is in progress, enable klp stack checking in 29 + * cond_resched(). This helps CPU-bound kthreads get patched. 30 + */ 31 + #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 32 + 33 + #define klp_cond_resched_enable() sched_dynamic_klp_enable() 34 + #define klp_cond_resched_disable() sched_dynamic_klp_disable() 35 + 36 + #else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 37 + 38 + DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); 39 + EXPORT_SYMBOL(klp_sched_try_switch_key); 40 + 41 + #define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) 42 + #define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) 43 + 44 + #endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 29 45 30 46 /* 31 47 * This work can be performed periodically to finish patching or unpatching any ··· 194 172 * barrier (smp_rmb) for two cases: 195 173 * 196 174 * 1) Enforce the order of the TIF_PATCH_PENDING read and the 197 - * klp_target_state read. The corresponding write barrier is in 198 - * klp_init_transition(). 175 + * klp_target_state read. The corresponding write barriers are in 176 + * klp_init_transition() and klp_reverse_transition(). 
199 177 * 200 178 * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read 201 179 * of func->transition, if klp_ftrace_handler() is called later on ··· 262 240 */ 263 241 static int klp_check_stack(struct task_struct *task, const char **oldname) 264 242 { 265 - static unsigned long entries[MAX_STACK_ENTRIES]; 243 + unsigned long *entries = this_cpu_ptr(klp_stack_entries); 266 244 struct klp_object *obj; 267 245 struct klp_func *func; 268 246 int ret, nr_entries; 269 247 270 - ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); 248 + /* Protect 'klp_stack_entries' */ 249 + lockdep_assert_preemption_disabled(); 250 + 251 + ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES); 271 252 if (ret < 0) 272 253 return -EINVAL; 273 254 nr_entries = ret; ··· 332 307 * functions. If all goes well, switch the task to the target patch 333 308 * state. 334 309 */ 335 - ret = task_call_func(task, klp_check_and_switch_task, &old_name); 310 + if (task == current) 311 + ret = klp_check_and_switch_task(current, &old_name); 312 + else 313 + ret = task_call_func(task, klp_check_and_switch_task, &old_name); 314 + 336 315 switch (ret) { 337 316 case 0: /* success */ 338 317 break; ··· 362 333 363 334 return !ret; 364 335 } 336 + 337 + void __klp_sched_try_switch(void) 338 + { 339 + if (likely(!klp_patch_pending(current))) 340 + return; 341 + 342 + /* 343 + * This function is called from cond_resched() which is called in many 344 + * places throughout the kernel. Using the klp_mutex here might 345 + * deadlock. 346 + * 347 + * Instead, disable preemption to prevent racing with other callers of 348 + * klp_try_switch_task(). Thanks to task_call_func() they won't be 349 + * able to switch this task while it's running. 350 + */ 351 + preempt_disable(); 352 + 353 + /* 354 + * Make sure current didn't get patched between the above check and 355 + * preempt_disable(). 
356 + */ 357 + if (unlikely(!klp_patch_pending(current))) 358 + goto out; 359 + 360 + /* 361 + * Enforce the order of the TIF_PATCH_PENDING read above and the 362 + * klp_target_state read in klp_try_switch_task(). The corresponding 363 + * write barriers are in klp_init_transition() and 364 + * klp_reverse_transition(). 365 + */ 366 + smp_rmb(); 367 + 368 + klp_try_switch_task(current); 369 + 370 + out: 371 + preempt_enable(); 372 + } 373 + EXPORT_SYMBOL(__klp_sched_try_switch); 365 374 366 375 /* 367 376 * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. ··· 507 440 return; 508 441 } 509 442 510 - /* we're done, now cleanup the data structures */ 443 + /* Done! Now cleanup the data structures. */ 444 + klp_cond_resched_disable(); 511 445 patch = klp_transition_patch; 512 446 klp_complete_transition(); 513 447 ··· 559 491 if (task->patch_state != klp_target_state) 560 492 set_tsk_thread_flag(task, TIF_PATCH_PENDING); 561 493 } 494 + 495 + klp_cond_resched_enable(); 562 496 563 497 klp_signals_cnt = 0; 564 498 } ··· 617 547 * see a func in transition with a task->patch_state of KLP_UNDEFINED. 618 548 * 619 549 * Also enforce the order of the klp_target_state write and future 620 - * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't 621 - * set a task->patch_state to KLP_UNDEFINED. 550 + * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and 551 + * __klp_sched_try_switch() don't set a task->patch_state to 552 + * KLP_UNDEFINED. 622 553 */ 623 554 smp_wmb(); 624 555 ··· 655 584 klp_target_state == KLP_PATCHED ? "patching to unpatching" : 656 585 "unpatching to patching"); 657 586 658 - klp_transition_patch->enabled = !klp_transition_patch->enabled; 659 - 660 - klp_target_state = !klp_target_state; 661 - 662 587 /* 663 588 * Clear all TIF_PATCH_PENDING flags to prevent races caused by 664 - * klp_update_patch_state() running in parallel with 665 - * klp_start_transition(). 
589 + * klp_update_patch_state() or __klp_sched_try_switch() running in 590 + * parallel with the reverse transition. 666 591 */ 667 592 read_lock(&tasklist_lock); 668 593 for_each_process_thread(g, task) ··· 668 601 for_each_possible_cpu(cpu) 669 602 clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); 670 603 671 - /* Let any remaining calls to klp_update_patch_state() complete */ 604 + /* 605 + * Make sure all existing invocations of klp_update_patch_state() and 606 + * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before 607 + * starting the reverse transition. 608 + */ 672 609 klp_synchronize_transition(); 610 + 611 + /* 612 + * All patching has stopped, now re-initialize the global variables to 613 + * prepare for the reverse transition. 614 + */ 615 + klp_transition_patch->enabled = !klp_transition_patch->enabled; 616 + klp_target_state = !klp_target_state; 617 + 618 + /* 619 + * Enforce the order of the klp_target_state write and the 620 + * TIF_PATCH_PENDING writes in klp_start_transition() to ensure 621 + * klp_update_patch_state() and __klp_sched_try_switch() don't set 622 + * task->patch_state to the wrong value. 623 + */ 624 + smp_wmb(); 673 625 674 626 klp_start_transition(); 675 627 } ··· 703 617 * the task flag up to date with the parent here. 704 618 * 705 619 * The operation is serialized against all klp_*_transition() 706 - * operations by the tasklist_lock. The only exception is 707 - * klp_update_patch_state(current), but we cannot race with 708 - * that because we are current. 620 + * operations by the tasklist_lock. The only exceptions are 621 + * klp_update_patch_state(current) and __klp_sched_try_switch(), but we 622 + * cannot race with them because we are current. 709 623 */ 710 624 if (test_tsk_thread_flag(current, TIF_PATCH_PENDING)) 711 625 set_tsk_thread_flag(child, TIF_PATCH_PENDING);

+3

kernel/sched/clock.c

··· 300 300 if (static_branch_likely(&__sched_clock_stable)) 301 301 return sched_clock() + __sched_clock_offset; 302 302 303 + if (!static_branch_likely(&sched_clock_running)) 304 + return sched_clock(); 305 + 303 306 preempt_disable_notrace(); 304 307 clock = sched_clock_local(this_scd()); 305 308 preempt_enable_notrace();

+622 -47

kernel/sched/core.c

··· 261 261 resched_curr(rq); 262 262 } 263 263 264 - /* 265 - * Find left-most (aka, highest priority) task matching @cookie. 266 - */ 267 - static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) 264 + static int sched_task_is_throttled(struct task_struct *p, int cpu) 268 265 { 269 - struct rb_node *node; 266 + if (p->sched_class->task_is_throttled) 267 + return p->sched_class->task_is_throttled(p, cpu); 270 268 271 - node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); 272 - /* 273 - * The idle task always matches any cookie! 274 - */ 275 - if (!node) 276 - return idle_sched_class.pick_task(rq); 277 - 278 - return __node_2_sc(node); 269 + return 0; 279 270 } 280 271 281 272 static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) 282 273 { 283 274 struct rb_node *node = &p->core_node; 275 + int cpu = task_cpu(p); 284 276 285 - node = rb_next(node); 277 + do { 278 + node = rb_next(node); 279 + if (!node) 280 + return NULL; 281 + 282 + p = __node_2_sc(node); 283 + if (p->core_cookie != cookie) 284 + return NULL; 285 + 286 + } while (sched_task_is_throttled(p, cpu)); 287 + 288 + return p; 289 + } 290 + 291 + /* 292 + * Find left-most (aka, highest priority) and unthrottled task matching @cookie. 293 + * If no suitable task is found, NULL will be returned. 
294 + */ 295 + static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) 296 + { 297 + struct task_struct *p; 298 + struct rb_node *node; 299 + 300 + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); 286 301 if (!node) 287 302 return NULL; 288 303 289 - p = container_of(node, struct task_struct, core_node); 290 - if (p->core_cookie != cookie) 291 - return NULL; 304 + p = __node_2_sc(node); 305 + if (!sched_task_is_throttled(p, rq->cpu)) 306 + return p; 292 307 293 - return p; 308 + return sched_core_next(p, cookie); 294 309 } 295 310 296 311 /* ··· 2102 2087 { 2103 2088 if (task_on_rq_migrating(p)) 2104 2089 flags |= ENQUEUE_MIGRATED; 2090 + if (flags & ENQUEUE_MIGRATED) 2091 + sched_mm_cid_migrate_to(rq, p); 2105 2092 2106 2093 enqueue_task(rq, p, flags); 2107 2094 ··· 3213 3196 p->sched_class->migrate_task_rq(p, new_cpu); 3214 3197 p->se.nr_migrations++; 3215 3198 rseq_migrate(p); 3199 + sched_mm_cid_migrate_from(p); 3216 3200 perf_event_task_migrate(p); 3217 3201 } 3218 3202 ··· 4487 4469 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4488 4470 p->migration_pending = NULL; 4489 4471 #endif 4472 + init_sched_mm_cid(p); 4490 4473 } 4491 4474 4492 4475 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); ··· 5134 5115 sched_info_switch(rq, prev, next); 5135 5116 perf_event_task_sched_out(prev, next); 5136 5117 rseq_preempt(prev); 5137 - switch_mm_cid(prev, next); 5138 5118 fire_sched_out_preempt_notifiers(prev, next); 5139 5119 kmap_local_sched_out(); 5140 5120 prepare_task(next); ··· 5290 5272 * 5291 5273 * kernel -> user switch + mmdrop_lazy_tlb() active 5292 5274 * user -> user switch 5275 + * 5276 + * switch_mm_cid() needs to be updated if the barriers provided 5277 + * by context_switch() are modified. 
5293 5278 */ 5294 5279 if (!next->mm) { // to kernel 5295 5280 enter_lazy_tlb(prev->active_mm, next); ··· 5321 5300 prev->active_mm = NULL; 5322 5301 } 5323 5302 } 5303 + 5304 + /* switch_mm_cid() requires the memory barriers above. */ 5305 + switch_mm_cid(rq, prev, next); 5324 5306 5325 5307 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 5326 5308 ··· 5613 5589 resched_latency = cpu_resched_latency(rq); 5614 5590 calc_global_load_tick(rq); 5615 5591 sched_core_tick(rq); 5592 + task_tick_mm_cid(rq, curr); 5616 5593 5617 5594 rq_unlock(rq, &rf); 5618 5595 ··· 6266 6241 goto unlock; 6267 6242 6268 6243 p = sched_core_find(src, cookie); 6269 - if (p == src->idle) 6244 + if (!p) 6270 6245 goto unlock; 6271 6246 6272 6247 do { ··· 6277 6252 goto next; 6278 6253 6279 6254 if (p->core_occupation > dst->idle->core_occupation) 6255 + goto next; 6256 + /* 6257 + * sched_core_find() and sched_core_next() will ensure that task @p 6258 + * is not throttled now, we also need to check whether the runqueue 6259 + * of the destination CPU is being throttled. 6260 + */ 6261 + if (sched_task_is_throttled(p, this)) 6280 6262 goto next; 6281 6263 6282 6264 deactivate_task(src, p, 0); ··· 8540 8508 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); 8541 8509 int __sched dynamic_cond_resched(void) 8542 8510 { 8511 + klp_sched_try_switch(); 8543 8512 if (!static_branch_unlikely(&sk_dynamic_cond_resched)) 8544 8513 return 0; 8545 8514 return __cond_resched(); ··· 8689 8656 #error "Unsupported PREEMPT_DYNAMIC mechanism" 8690 8657 #endif 8691 8658 8692 - void sched_dynamic_update(int mode) 8659 + static DEFINE_MUTEX(sched_dynamic_mutex); 8660 + static bool klp_override; 8661 + 8662 + static void __sched_dynamic_update(int mode) 8693 8663 { 8694 8664 /* 8695 8665 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in 8696 8666 * the ZERO state, which is invalid. 
8697 8667 */ 8698 - preempt_dynamic_enable(cond_resched); 8668 + if (!klp_override) 8669 + preempt_dynamic_enable(cond_resched); 8699 8670 preempt_dynamic_enable(might_resched); 8700 8671 preempt_dynamic_enable(preempt_schedule); 8701 8672 preempt_dynamic_enable(preempt_schedule_notrace); ··· 8707 8670 8708 8671 switch (mode) { 8709 8672 case preempt_dynamic_none: 8710 - preempt_dynamic_enable(cond_resched); 8673 + if (!klp_override) 8674 + preempt_dynamic_enable(cond_resched); 8711 8675 preempt_dynamic_disable(might_resched); 8712 8676 preempt_dynamic_disable(preempt_schedule); 8713 8677 preempt_dynamic_disable(preempt_schedule_notrace); 8714 8678 preempt_dynamic_disable(irqentry_exit_cond_resched); 8715 - pr_info("Dynamic Preempt: none\n"); 8679 + if (mode != preempt_dynamic_mode) 8680 + pr_info("Dynamic Preempt: none\n"); 8716 8681 break; 8717 8682 8718 8683 case preempt_dynamic_voluntary: 8719 - preempt_dynamic_enable(cond_resched); 8684 + if (!klp_override) 8685 + preempt_dynamic_enable(cond_resched); 8720 8686 preempt_dynamic_enable(might_resched); 8721 8687 preempt_dynamic_disable(preempt_schedule); 8722 8688 preempt_dynamic_disable(preempt_schedule_notrace); 8723 8689 preempt_dynamic_disable(irqentry_exit_cond_resched); 8724 - pr_info("Dynamic Preempt: voluntary\n"); 8690 + if (mode != preempt_dynamic_mode) 8691 + pr_info("Dynamic Preempt: voluntary\n"); 8725 8692 break; 8726 8693 8727 8694 case preempt_dynamic_full: 8728 - preempt_dynamic_disable(cond_resched); 8695 + if (!klp_override) 8696 + preempt_dynamic_disable(cond_resched); 8729 8697 preempt_dynamic_disable(might_resched); 8730 8698 preempt_dynamic_enable(preempt_schedule); 8731 8699 preempt_dynamic_enable(preempt_schedule_notrace); 8732 8700 preempt_dynamic_enable(irqentry_exit_cond_resched); 8733 - pr_info("Dynamic Preempt: full\n"); 8701 + if (mode != preempt_dynamic_mode) 8702 + pr_info("Dynamic Preempt: full\n"); 8734 8703 break; 8735 8704 } 8736 8705 8737 8706 preempt_dynamic_mode = mode; 
8738 8707 } 8708 + 8709 + void sched_dynamic_update(int mode) 8710 + { 8711 + mutex_lock(&sched_dynamic_mutex); 8712 + __sched_dynamic_update(mode); 8713 + mutex_unlock(&sched_dynamic_mutex); 8714 + } 8715 + 8716 + #ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL 8717 + 8718 + static int klp_cond_resched(void) 8719 + { 8720 + __klp_sched_try_switch(); 8721 + return __cond_resched(); 8722 + } 8723 + 8724 + void sched_dynamic_klp_enable(void) 8725 + { 8726 + mutex_lock(&sched_dynamic_mutex); 8727 + 8728 + klp_override = true; 8729 + static_call_update(cond_resched, klp_cond_resched); 8730 + 8731 + mutex_unlock(&sched_dynamic_mutex); 8732 + } 8733 + 8734 + void sched_dynamic_klp_disable(void) 8735 + { 8736 + mutex_lock(&sched_dynamic_mutex); 8737 + 8738 + klp_override = false; 8739 + __sched_dynamic_update(preempt_dynamic_mode); 8740 + 8741 + mutex_unlock(&sched_dynamic_mutex); 8742 + } 8743 + 8744 + #endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 8739 8745 8740 8746 static int __init setup_preempt_mode(char *str) 8741 8747 { ··· 10414 10334 spin_unlock_irqrestore(&task_group_lock, flags); 10415 10335 } 10416 10336 10417 - static void sched_change_group(struct task_struct *tsk) 10337 + static struct task_group *sched_get_task_group(struct task_struct *tsk) 10418 10338 { 10419 10339 struct task_group *tg; 10420 10340 ··· 10426 10346 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 10427 10347 struct task_group, css); 10428 10348 tg = autogroup_task_group(tsk, tg); 10429 - tsk->sched_task_group = tg; 10349 + 10350 + return tg; 10351 + } 10352 + 10353 + static void sched_change_group(struct task_struct *tsk, struct task_group *group) 10354 + { 10355 + tsk->sched_task_group = group; 10430 10356 10431 10357 #ifdef CONFIG_FAIR_GROUP_SCHED 10432 10358 if (tsk->sched_class->task_change_group) ··· 10453 10367 { 10454 10368 int queued, running, queue_flags = 10455 10369 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 10370 + struct task_group *group; 10456 10371 struct rq_flags 
rf; 10457 10372 struct rq *rq; 10458 10373 10459 10374 rq = task_rq_lock(tsk, &rf); 10375 + /* 10376 + * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous 10377 + * group changes. 10378 + */ 10379 + group = sched_get_task_group(tsk); 10380 + if (group == tsk->sched_task_group) 10381 + goto unlock; 10382 + 10460 10383 update_rq_clock(rq); 10461 10384 10462 10385 running = task_current(rq, tsk); ··· 10476 10381 if (running) 10477 10382 put_prev_task(rq, tsk); 10478 10383 10479 - sched_change_group(tsk); 10384 + sched_change_group(tsk, group); 10480 10385 10481 10386 if (queued) 10482 10387 enqueue_task(rq, tsk, queue_flags); ··· 10490 10395 resched_curr(rq); 10491 10396 } 10492 10397 10398 + unlock: 10493 10399 task_rq_unlock(rq, tsk, &rf); 10494 10400 } 10495 10401 ··· 11481 11385 } 11482 11386 11483 11387 #ifdef CONFIG_SCHED_MM_CID 11484 - void sched_mm_cid_exit_signals(struct task_struct *t) 11388 + 11389 + /** 11390 + * @cid_lock: Guarantee forward-progress of cid allocation. 11391 + * 11392 + * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock 11393 + * is only used when contention is detected by the lock-free allocation so 11394 + * forward progress can be guaranteed. 11395 + */ 11396 + DEFINE_RAW_SPINLOCK(cid_lock); 11397 + 11398 + /** 11399 + * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 11400 + * 11401 + * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is 11402 + * detected, it is set to 1 to ensure that all newly coming allocations are 11403 + * serialized by @cid_lock until the allocation which detected contention 11404 + * completes and sets @use_cid_lock back to 0. This guarantees forward progress 11405 + * of a cid allocation. 
11406 + */ 11407 + int use_cid_lock; 11408 + 11409 + /* 11410 + * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid 11411 + * concurrently with respect to the execution of the source runqueue context 11412 + * switch. 11413 + * 11414 + * There is one basic property we want to guarantee here: 11415 + * 11416 + * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively 11417 + * used by a task. That would lead to concurrent allocation of the cid and 11418 + * userspace corruption. 11419 + * 11420 + * Provide this guarantee by introducing a Dekker memory ordering to guarantee 11421 + * that a pair of loads observe at least one of a pair of stores, which can be 11422 + * shown as: 11423 + * 11424 + * X = Y = 0 11425 + * 11426 + * w[X]=1 w[Y]=1 11427 + * MB MB 11428 + * r[Y]=y r[X]=x 11429 + * 11430 + * Which guarantees that x==0 && y==0 is impossible. But rather than using 11431 + * values 0 and 1, this algorithm cares about specific state transitions of the 11432 + * runqueue current task (as updated by the scheduler context switch), and the 11433 + * per-mm/cpu cid value. 11434 + * 11435 + * Let's introduce task (Y) which has task->mm == mm and task (N) which has 11436 + * task->mm != mm for the rest of the discussion. There are two scheduler state 11437 + * transitions on context switch we care about: 11438 + * 11439 + * (TSA) Store to rq->curr with transition from (N) to (Y) 11440 + * 11441 + * (TSB) Store to rq->curr with transition from (Y) to (N) 11442 + * 11443 + * On the remote-clear side, there is one transition we care about: 11444 + * 11445 + * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag 11446 + * 11447 + * There is also a transition to UNSET state which can be performed from all 11448 + * sides (scheduler, remote-clear). 
It is always performed with a cmpxchg which 11449 + * guarantees that only a single thread will succeed: 11450 + * 11451 + * (TMB) cmpxchg to *pcpu_cid to mark UNSET 11452 + * 11453 + * Just to be clear, what we do _not_ want to happen is a transition to UNSET 11454 + * when a thread is actively using the cid (property (1)). 11455 + * 11456 + * Let's look at the relevant combinations of TSA/TSB, and TMA transitions. 11457 + * 11458 + * Scenario A) (TSA)+(TMA) (from next task perspective) 11459 + * 11460 + * CPU0 CPU1 11461 + * 11462 + * Context switch CS-1 Remote-clear 11463 + * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_cid to LAZY (TMA) 11464 + * (implied barrier after cmpxchg) 11465 + * - switch_mm_cid() 11466 + * - memory barrier (see switch_mm_cid() 11467 + * comment explaining how this barrier 11468 + * is combined with other scheduler 11469 + * barriers) 11470 + * - mm_cid_get (next) 11471 + * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) 11472 + * 11473 + * This Dekker ensures that either task (Y) is observed by the 11474 + * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are 11475 + * observed. 11476 + * 11477 + * If task (Y) store is observed by rcu_dereference(), it means that there is 11478 + * still an active task on the cpu. Remote-clear will therefore not transition 11479 + * to UNSET, which fulfills property (1). 11480 + * 11481 + * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), 11482 + * it will move its state to UNSET, which clears the percpu cid perhaps 11483 + * uselessly (which is not an issue for correctness). Because task (Y) is not 11484 + * observed, CPU1 can move ahead to set the state to UNSET. Because moving 11485 + * state to UNSET is done with a cmpxchg expecting that the old state has the 11486 + * LAZY flag set, only one thread will successfully UNSET.
11487 + * 11488 + * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 11489 + * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and 11490 + * CPU1 will observe task (Y) and do nothing more, which is fine. 11491 + * 11492 + * What we are effectively preventing with this Dekker is a scenario where 11493 + * neither LAZY flag nor store (Y) are observed, which would fail property (1) 11494 + * because this would UNSET a cid which is actively used. 11495 + */ 11496 + 11497 + void sched_mm_cid_migrate_from(struct task_struct *t) 11498 + { 11499 + t->migrate_from_cpu = task_cpu(t); 11500 + } 11501 + 11502 + static 11503 + int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, 11504 + struct task_struct *t, 11505 + struct mm_cid *src_pcpu_cid) 11485 11506 { 11486 11507 struct mm_struct *mm = t->mm; 11487 - unsigned long flags; 11508 + struct task_struct *src_task; 11509 + int src_cid, last_mm_cid; 11510 + 11511 + if (!mm) 11512 + return -1; 11513 + 11514 + last_mm_cid = t->last_mm_cid; 11515 + /* 11516 + * If the migrated task has no last cid, or if the current 11517 + * task on src rq uses the cid, it means the source cid does not need 11518 + * to be moved to the destination cpu. 11519 + */ 11520 + if (last_mm_cid == -1) 11521 + return -1; 11522 + src_cid = READ_ONCE(src_pcpu_cid->cid); 11523 + if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) 11524 + return -1; 11525 + 11526 + /* 11527 + * If we observe an active task using the mm on this rq, it means we 11528 + * are not the last task to be migrated from this cpu for this mm, so 11529 + * there is no need to move src_cid to the destination cpu. 
11530 + */ 11531 + rcu_read_lock(); 11532 + src_task = rcu_dereference(src_rq->curr); 11533 + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 11534 + rcu_read_unlock(); 11535 + t->last_mm_cid = -1; 11536 + return -1; 11537 + } 11538 + rcu_read_unlock(); 11539 + 11540 + return src_cid; 11541 + } 11542 + 11543 + static 11544 + int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, 11545 + struct task_struct *t, 11546 + struct mm_cid *src_pcpu_cid, 11547 + int src_cid) 11548 + { 11549 + struct task_struct *src_task; 11550 + struct mm_struct *mm = t->mm; 11551 + int lazy_cid; 11552 + 11553 + if (src_cid == -1) 11554 + return -1; 11555 + 11556 + /* 11557 + * Attempt to clear the source cpu cid to move it to the destination 11558 + * cpu. 11559 + */ 11560 + lazy_cid = mm_cid_set_lazy_put(src_cid); 11561 + if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) 11562 + return -1; 11563 + 11564 + /* 11565 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11566 + * rq->curr->mm matches the scheduler barrier in context_switch() 11567 + * between store to rq->curr and load of prev and next task's 11568 + * per-mm/cpu cid. 11569 + * 11570 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11571 + * rq->curr->mm_cid_active matches the barrier in 11572 + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 11573 + * sched_mm_cid_after_execve() between store to t->mm_cid_active and 11574 + * load of per-mm/cpu cid. 11575 + */ 11576 + 11577 + /* 11578 + * If we observe an active task using the mm on this rq after setting 11579 + * the lazy-put flag, this task will be responsible for transitioning 11580 + * from lazy-put flag set to MM_CID_UNSET. 
11581 + */ 11582 + rcu_read_lock(); 11583 + src_task = rcu_dereference(src_rq->curr); 11584 + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 11585 + rcu_read_unlock(); 11586 + /* 11587 + * We observed an active task for this mm, there is therefore 11588 + * no point in moving this cid to the destination cpu. 11589 + */ 11590 + t->last_mm_cid = -1; 11591 + return -1; 11592 + } 11593 + rcu_read_unlock(); 11594 + 11595 + /* 11596 + * The src_cid is unused, so it can be unset. 11597 + */ 11598 + if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 11599 + return -1; 11600 + return src_cid; 11601 + } 11602 + 11603 + /* 11604 + * Migration to dst cpu. Called with dst_rq lock held. 11605 + * Interrupts are disabled, which keeps the window of cid ownership without the 11606 + * source rq lock held small. 11607 + */ 11608 + void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) 11609 + { 11610 + struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 11611 + struct mm_struct *mm = t->mm; 11612 + int src_cid, dst_cid, src_cpu; 11613 + struct rq *src_rq; 11614 + 11615 + lockdep_assert_rq_held(dst_rq); 11488 11616 11489 11617 if (!mm) 11490 11618 return; 11619 + src_cpu = t->migrate_from_cpu; 11620 + if (src_cpu == -1) { 11621 + t->last_mm_cid = -1; 11622 + return; 11623 + } 11624 + /* 11625 + * Move the src cid if the dst cid is unset. This keeps id 11626 + * allocation closest to 0 in cases where few threads migrate around 11627 + * many cpus. 11628 + * 11629 + * If destination cid is already set, we may have to just clear 11630 + * the src cid to ensure compactness in frequent migrations 11631 + * scenarios. 11632 + * 11633 + * It is not useful to clear the src cid when the number of threads is 11634 + * greater or equal to the number of allowed cpus, because user-space 11635 + * can expect that the number of allowed cids can reach the number of 11636 + * allowed cpus. 
11637 + */ 11638 + dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 11639 + dst_cid = READ_ONCE(dst_pcpu_cid->cid); 11640 + if (!mm_cid_is_unset(dst_cid) && 11641 + atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) 11642 + return; 11643 + src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 11644 + src_rq = cpu_rq(src_cpu); 11645 + src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); 11646 + if (src_cid == -1) 11647 + return; 11648 + src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, 11649 + src_cid); 11650 + if (src_cid == -1) 11651 + return; 11652 + if (!mm_cid_is_unset(dst_cid)) { 11653 + __mm_cid_put(mm, src_cid); 11654 + return; 11655 + } 11656 + /* Move src_cid to dst cpu. */ 11657 + mm_cid_snapshot_time(dst_rq, mm); 11658 + WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 11659 + } 11660 + 11661 + static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, 11662 + int cpu) 11663 + { 11664 + struct rq *rq = cpu_rq(cpu); 11665 + struct task_struct *t; 11666 + unsigned long flags; 11667 + int cid, lazy_cid; 11668 + 11669 + cid = READ_ONCE(pcpu_cid->cid); 11670 + if (!mm_cid_is_valid(cid)) 11671 + return; 11672 + 11673 + /* 11674 + * Clear the cpu cid if it is set to keep cid allocation compact. If 11675 + * there happens to be other tasks left on the source cpu using this 11676 + * mm, the next task using this mm will reallocate its cid on context 11677 + * switch. 11678 + */ 11679 + lazy_cid = mm_cid_set_lazy_put(cid); 11680 + if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) 11681 + return; 11682 + 11683 + /* 11684 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11685 + * rq->curr->mm matches the scheduler barrier in context_switch() 11686 + * between store to rq->curr and load of prev and next task's 11687 + * per-mm/cpu cid. 
11688 + * 11689 + * The implicit barrier after cmpxchg per-mm/cpu cid before loading 11690 + * rq->curr->mm_cid_active matches the barrier in 11691 + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 11692 + * sched_mm_cid_after_execve() between store to t->mm_cid_active and 11693 + * load of per-mm/cpu cid. 11694 + */ 11695 + 11696 + /* 11697 + * If we observe an active task using the mm on this rq after setting 11698 + * the lazy-put flag, that task will be responsible for transitioning 11699 + * from lazy-put flag set to MM_CID_UNSET. 11700 + */ 11701 + rcu_read_lock(); 11702 + t = rcu_dereference(rq->curr); 11703 + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { 11704 + rcu_read_unlock(); 11705 + return; 11706 + } 11707 + rcu_read_unlock(); 11708 + 11709 + /* 11710 + * The cid is unused, so it can be unset. 11711 + * Disable interrupts to keep the window of cid ownership without rq 11712 + * lock small. 11713 + */ 11491 11714 local_irq_save(flags); 11492 - mm_cid_put(mm, t->mm_cid); 11493 - t->mm_cid = -1; 11494 - t->mm_cid_active = 0; 11715 + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 11716 + __mm_cid_put(mm, cid); 11495 11717 local_irq_restore(flags); 11718 + } 11719 + 11720 + static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) 11721 + { 11722 + struct rq *rq = cpu_rq(cpu); 11723 + struct mm_cid *pcpu_cid; 11724 + struct task_struct *curr; 11725 + u64 rq_clock; 11726 + 11727 + /* 11728 + * rq->clock load is racy on 32-bit but one spurious clear once in a 11729 + * while is irrelevant. 11730 + */ 11731 + rq_clock = READ_ONCE(rq->clock); 11732 + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 11733 + 11734 + /* 11735 + * In order to take care of infrequently scheduled tasks, bump the time 11736 + * snapshot associated with this cid if an active task using the mm is 11737 + * observed on this rq. 
11738 + */ 11739 + rcu_read_lock(); 11740 + curr = rcu_dereference(rq->curr); 11741 + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { 11742 + WRITE_ONCE(pcpu_cid->time, rq_clock); 11743 + rcu_read_unlock(); 11744 + return; 11745 + } 11746 + rcu_read_unlock(); 11747 + 11748 + if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) 11749 + return; 11750 + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 11751 + } 11752 + 11753 + static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, 11754 + int weight) 11755 + { 11756 + struct mm_cid *pcpu_cid; 11757 + int cid; 11758 + 11759 + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 11760 + cid = READ_ONCE(pcpu_cid->cid); 11761 + if (!mm_cid_is_valid(cid) || cid < weight) 11762 + return; 11763 + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 11764 + } 11765 + 11766 + static void task_mm_cid_work(struct callback_head *work) 11767 + { 11768 + unsigned long now = jiffies, old_scan, next_scan; 11769 + struct task_struct *t = current; 11770 + struct cpumask *cidmask; 11771 + struct mm_struct *mm; 11772 + int weight, cpu; 11773 + 11774 + SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); 11775 + 11776 + work->next = work; /* Prevent double-add */ 11777 + if (t->flags & PF_EXITING) 11778 + return; 11779 + mm = t->mm; 11780 + if (!mm) 11781 + return; 11782 + old_scan = READ_ONCE(mm->mm_cid_next_scan); 11783 + next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); 11784 + if (!old_scan) { 11785 + unsigned long res; 11786 + 11787 + res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); 11788 + if (res != old_scan) 11789 + old_scan = res; 11790 + else 11791 + old_scan = next_scan; 11792 + } 11793 + if (time_before(now, old_scan)) 11794 + return; 11795 + if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) 11796 + return; 11797 + cidmask = mm_cidmask(mm); 11798 + /* Clear cids that were not recently used. 
*/ 11799 + for_each_possible_cpu(cpu) 11800 + sched_mm_cid_remote_clear_old(mm, cpu); 11801 + weight = cpumask_weight(cidmask); 11802 + /* 11803 + * Clear cids that are greater or equal to the cidmask weight to 11804 + * recompact it. 11805 + */ 11806 + for_each_possible_cpu(cpu) 11807 + sched_mm_cid_remote_clear_weight(mm, cpu, weight); 11808 + } 11809 + 11810 + void init_sched_mm_cid(struct task_struct *t) 11811 + { 11812 + struct mm_struct *mm = t->mm; 11813 + int mm_users = 0; 11814 + 11815 + if (mm) { 11816 + mm_users = atomic_read(&mm->mm_users); 11817 + if (mm_users == 1) 11818 + mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); 11819 + } 11820 + t->cid_work.next = &t->cid_work; /* Protect against double add */ 11821 + init_task_work(&t->cid_work, task_mm_cid_work); 11822 + } 11823 + 11824 + void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) 11825 + { 11826 + struct callback_head *work = &curr->cid_work; 11827 + unsigned long now = jiffies; 11828 + 11829 + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || 11830 + work->next != work) 11831 + return; 11832 + if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 11833 + return; 11834 + task_work_add(curr, work, TWA_RESUME); 11835 + } 11836 + 11837 + void sched_mm_cid_exit_signals(struct task_struct *t) 11838 + { 11839 + struct mm_struct *mm = t->mm; 11840 + struct rq_flags rf; 11841 + struct rq *rq; 11842 + 11843 + if (!mm) 11844 + return; 11845 + 11846 + preempt_disable(); 11847 + rq = this_rq(); 11848 + rq_lock_irqsave(rq, &rf); 11849 + preempt_enable_no_resched(); /* holding spinlock */ 11850 + WRITE_ONCE(t->mm_cid_active, 0); 11851 + /* 11852 + * Store t->mm_cid_active before loading per-mm/cpu cid. 11853 + * Matches barrier in sched_mm_cid_remote_clear_old(). 
11854 + */ 11855 + smp_mb(); 11856 + mm_cid_put(mm); 11857 + t->last_mm_cid = t->mm_cid = -1; 11858 + rq_unlock_irqrestore(rq, &rf); 11496 11859 } 11497 11860 11498 11861 void sched_mm_cid_before_execve(struct task_struct *t) 11499 11862 { 11500 11863 struct mm_struct *mm = t->mm; 11501 - unsigned long flags; 11864 + struct rq_flags rf; 11865 + struct rq *rq; 11502 11866 11503 11867 if (!mm) 11504 11868 return; 11505 - local_irq_save(flags); 11506 - mm_cid_put(mm, t->mm_cid); 11507 - t->mm_cid = -1; 11508 - t->mm_cid_active = 0; 11509 - local_irq_restore(flags); 11869 + 11870 + preempt_disable(); 11871 + rq = this_rq(); 11872 + rq_lock_irqsave(rq, &rf); 11873 + preempt_enable_no_resched(); /* holding spinlock */ 11874 + WRITE_ONCE(t->mm_cid_active, 0); 11875 + /* 11876 + * Store t->mm_cid_active before loading per-mm/cpu cid. 11877 + * Matches barrier in sched_mm_cid_remote_clear_old(). 11878 + */ 11879 + smp_mb(); 11880 + mm_cid_put(mm); 11881 + t->last_mm_cid = t->mm_cid = -1; 11882 + rq_unlock_irqrestore(rq, &rf); 11510 11883 } 11511 11884 11512 11885 void sched_mm_cid_after_execve(struct task_struct *t) 11513 11886 { 11514 11887 struct mm_struct *mm = t->mm; 11515 - unsigned long flags; 11888 + struct rq_flags rf; 11889 + struct rq *rq; 11516 11890 11517 11891 if (!mm) 11518 11892 return; 11519 - local_irq_save(flags); 11520 - t->mm_cid = mm_cid_get(mm); 11521 - t->mm_cid_active = 1; 11522 - local_irq_restore(flags); 11893 + 11894 + preempt_disable(); 11895 + rq = this_rq(); 11896 + rq_lock_irqsave(rq, &rf); 11897 + preempt_enable_no_resched(); /* holding spinlock */ 11898 + WRITE_ONCE(t->mm_cid_active, 1); 11899 + /* 11900 + * Store t->mm_cid_active before loading per-mm/cpu cid. 11901 + * Matches barrier in sched_mm_cid_remote_clear_old(). 11902 + */ 11903 + smp_mb(); 11904 + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); 11905 + rq_unlock_irqrestore(rq, &rf); 11523 11906 rseq_set_notify_resume(t); 11524 11907 } 11525 11908

+11

kernel/sched/deadline.c

··· 2246 2246 !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || 2247 2247 task_on_cpu(rq, task) || 2248 2248 !dl_task(task) || 2249 + is_migration_disabled(task) || 2249 2250 !task_on_rq_queued(task))) { 2250 2251 double_unlock_balance(rq, later_rq); 2251 2252 later_rq = NULL; ··· 2705 2704 #endif 2706 2705 } 2707 2706 2707 + #ifdef CONFIG_SCHED_CORE 2708 + static int task_is_throttled_dl(struct task_struct *p, int cpu) 2709 + { 2710 + return p->dl.dl_throttled; 2711 + } 2712 + #endif 2713 + 2708 2714 DEFINE_SCHED_CLASS(dl) = { 2709 2715 2710 2716 .enqueue_task = enqueue_task_dl, ··· 2744 2736 .switched_to = switched_to_dl, 2745 2737 2746 2738 .update_curr = update_curr_dl, 2739 + #ifdef CONFIG_SCHED_CORE 2740 + .task_is_throttled = task_is_throttled_dl, 2741 + #endif 2747 2742 }; 2748 2743 2749 2744 /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */

+21 -1

kernel/sched/fair.c

··· 6016 6016 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); 6017 6017 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); 6018 6018 cfs_b->period_timer.function = sched_cfs_period_timer; 6019 + 6020 + /* Add a random offset so that timers interleave */ 6021 + hrtimer_set_expires(&cfs_b->period_timer, 6022 + get_random_u32_below(cfs_b->period)); 6019 6023 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 6020 6024 cfs_b->slack_timer.function = sched_cfs_slack_timer; 6021 6025 cfs_b->slack_started = false; ··· 6675 6671 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); 6676 6672 6677 6673 schedstat_inc(p->stats.nr_wakeups_affine_attempts); 6678 - if (target == nr_cpumask_bits) 6674 + if (target != this_cpu) 6679 6675 return prev_cpu; 6680 6676 6681 6677 schedstat_inc(sd->ttwu_move_affine); ··· 12037 12033 12038 12034 return delta > 0; 12039 12035 } 12036 + 12037 + static int task_is_throttled_fair(struct task_struct *p, int cpu) 12038 + { 12039 + struct cfs_rq *cfs_rq; 12040 + 12041 + #ifdef CONFIG_FAIR_GROUP_SCHED 12042 + cfs_rq = task_group(p)->cfs_rq[cpu]; 12043 + #else 12044 + cfs_rq = &cpu_rq(cpu)->cfs; 12045 + #endif 12046 + return throttled_hierarchy(cfs_rq); 12047 + } 12040 12048 #else 12041 12049 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} 12042 12050 #endif ··· 12673 12657 12674 12658 #ifdef CONFIG_FAIR_GROUP_SCHED 12675 12659 .task_change_group = task_change_group_fair, 12660 + #endif 12661 + 12662 + #ifdef CONFIG_SCHED_CORE 12663 + .task_is_throttled = task_is_throttled_fair, 12676 12664 #endif 12677 12665 12678 12666 #ifdef CONFIG_UCLAMP_TASK

+266 -223

kernel/sched/psi.c

··· 186 186 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); 187 187 group->avg_last_update = sched_clock(); 188 188 group->avg_next_update = group->avg_last_update + psi_period; 189 - INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); 190 189 mutex_init(&group->avgs_lock); 191 - /* Init trigger-related members */ 192 - atomic_set(&group->poll_scheduled, 0); 193 - mutex_init(&group->trigger_lock); 194 - INIT_LIST_HEAD(&group->triggers); 195 - group->poll_min_period = U32_MAX; 196 - group->polling_next_update = ULLONG_MAX; 197 - init_waitqueue_head(&group->poll_wait); 198 - timer_setup(&group->poll_timer, poll_timer_fn, 0); 199 - rcu_assign_pointer(group->poll_task, NULL); 190 + 191 + /* Init avg trigger-related members */ 192 + INIT_LIST_HEAD(&group->avg_triggers); 193 + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers)); 194 + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); 195 + 196 + /* Init rtpoll trigger-related members */ 197 + atomic_set(&group->rtpoll_scheduled, 0); 198 + mutex_init(&group->rtpoll_trigger_lock); 199 + INIT_LIST_HEAD(&group->rtpoll_triggers); 200 + group->rtpoll_min_period = U32_MAX; 201 + group->rtpoll_next_update = ULLONG_MAX; 202 + init_waitqueue_head(&group->rtpoll_wait); 203 + timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); 204 + rcu_assign_pointer(group->rtpoll_task, NULL); 200 205 } 201 206 202 207 void __init psi_init(void) ··· 389 384 *pchanged_states = changed_states; 390 385 } 391 386 387 + /* Trigger tracking window manipulations */ 388 + static void window_reset(struct psi_window *win, u64 now, u64 value, 389 + u64 prev_growth) 390 + { 391 + win->start_time = now; 392 + win->start_value = value; 393 + win->prev_growth = prev_growth; 394 + } 395 + 396 + /* 397 + * PSI growth tracking window update and growth calculation routine. 
398 + * 399 + * This approximates a sliding tracking window by interpolating 400 + * partially elapsed windows using historical growth data from the 401 + * previous intervals. This minimizes memory requirements (by not storing 402 + * all the intermediate values in the previous window) and simplifies 403 + * the calculations. It works well because PSI signal changes only in 404 + * positive direction and over relatively small window sizes the growth 405 + * is close to linear. 406 + */ 407 + static u64 window_update(struct psi_window *win, u64 now, u64 value) 408 + { 409 + u64 elapsed; 410 + u64 growth; 411 + 412 + elapsed = now - win->start_time; 413 + growth = value - win->start_value; 414 + /* 415 + * After each tracking window passes win->start_value and 416 + * win->start_time get reset and win->prev_growth stores 417 + * the average per-window growth of the previous window. 418 + * win->prev_growth is then used to interpolate additional 419 + * growth from the previous window assuming it was linear. 420 + */ 421 + if (elapsed > win->size) 422 + window_reset(win, now, value, growth); 423 + else { 424 + u32 remaining; 425 + 426 + remaining = win->size - elapsed; 427 + growth += div64_u64(win->prev_growth * remaining, win->size); 428 + } 429 + 430 + return growth; 431 + } 432 + 433 + static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, 434 + enum psi_aggregators aggregator) 435 + { 436 + struct psi_trigger *t; 437 + u64 *total = group->total[aggregator]; 438 + struct list_head *triggers; 439 + u64 *aggregator_total; 440 + *update_total = false; 441 + 442 + if (aggregator == PSI_AVGS) { 443 + triggers = &group->avg_triggers; 444 + aggregator_total = group->avg_total; 445 + } else { 446 + triggers = &group->rtpoll_triggers; 447 + aggregator_total = group->rtpoll_total; 448 + } 449 + 450 + /* 451 + * On subsequent updates, calculate growth deltas and let 452 + * watchers know when their specified thresholds are exceeded. 
453 + */ 454 + list_for_each_entry(t, triggers, node) { 455 + u64 growth; 456 + bool new_stall; 457 + 458 + new_stall = aggregator_total[t->state] != total[t->state]; 459 + 460 + /* Check for stall activity or a previous threshold breach */ 461 + if (!new_stall && !t->pending_event) 462 + continue; 463 + /* 464 + * Check for new stall activity, as well as deferred 465 + * events that occurred in the last window after the 466 + * trigger had already fired (we want to ratelimit 467 + * events without dropping any). 468 + */ 469 + if (new_stall) { 470 + /* 471 + * Multiple triggers might be looking at the same state, 472 + * remember to update group->polling_total[] once we've 473 + * been through all of them. Also remember to extend the 474 + * polling time if we see new stall activity. 475 + */ 476 + *update_total = true; 477 + 478 + /* Calculate growth since last update */ 479 + growth = window_update(&t->win, now, total[t->state]); 480 + if (!t->pending_event) { 481 + if (growth < t->threshold) 482 + continue; 483 + 484 + t->pending_event = true; 485 + } 486 + } 487 + /* Limit event signaling to once per window */ 488 + if (now < t->last_event_time + t->win.size) 489 + continue; 490 + 491 + /* Generate an event */ 492 + if (cmpxchg(&t->event, 0, 1) == 0) 493 + wake_up_interruptible(&t->event_wait); 494 + t->last_event_time = now; 495 + /* Reset threshold breach flag once event got generated */ 496 + t->pending_event = false; 497 + } 498 + 499 + return now + group->rtpoll_min_period; 500 + } 501 + 392 502 static u64 update_averages(struct psi_group *group, u64 now) 393 503 { 394 504 unsigned long missed_periods = 0; ··· 562 442 struct delayed_work *dwork; 563 443 struct psi_group *group; 564 444 u32 changed_states; 445 + bool update_total; 565 446 u64 now; 566 447 567 448 dwork = to_delayed_work(work); ··· 580 459 * Once restarted, we'll catch up the running averages in one 581 460 * go - see calc_avgs() and missed_periods. 
582 461 */ 583 - if (now >= group->avg_next_update) 462 + if (now >= group->avg_next_update) { 463 + update_triggers(group, now, &update_total, PSI_AVGS); 584 464 group->avg_next_update = update_averages(group, now); 465 + } 585 466 586 467 if (changed_states & PSI_STATE_RESCHEDULE) { 587 468 schedule_delayed_work(dwork, nsecs_to_jiffies( ··· 593 470 mutex_unlock(&group->avgs_lock); 594 471 } 595 472 596 - /* Trigger tracking window manipulations */ 597 - static void window_reset(struct psi_window *win, u64 now, u64 value, 598 - u64 prev_growth) 599 - { 600 - win->start_time = now; 601 - win->start_value = value; 602 - win->prev_growth = prev_growth; 603 - } 604 - 605 - /* 606 - * PSI growth tracking window update and growth calculation routine. 607 - * 608 - * This approximates a sliding tracking window by interpolating 609 - * partially elapsed windows using historical growth data from the 610 - * previous intervals. This minimizes memory requirements (by not storing 611 - * all the intermediate values in the previous window) and simplifies 612 - * the calculations. It works well because PSI signal changes only in 613 - * positive direction and over relatively small window sizes the growth 614 - * is close to linear. 615 - */ 616 - static u64 window_update(struct psi_window *win, u64 now, u64 value) 617 - { 618 - u64 elapsed; 619 - u64 growth; 620 - 621 - elapsed = now - win->start_time; 622 - growth = value - win->start_value; 623 - /* 624 - * After each tracking window passes win->start_value and 625 - * win->start_time get reset and win->prev_growth stores 626 - * the average per-window growth of the previous window. 627 - * win->prev_growth is then used to interpolate additional 628 - * growth from the previous window assuming it was linear. 
629 - */ 630 - if (elapsed > win->size) 631 - window_reset(win, now, value, growth); 632 - else { 633 - u32 remaining; 634 - 635 - remaining = win->size - elapsed; 636 - growth += div64_u64(win->prev_growth * remaining, win->size); 637 - } 638 - 639 - return growth; 640 - } 641 - 642 - static void init_triggers(struct psi_group *group, u64 now) 473 + static void init_rtpoll_triggers(struct psi_group *group, u64 now) 643 474 { 644 475 struct psi_trigger *t; 645 476 646 - list_for_each_entry(t, &group->triggers, node) 477 + list_for_each_entry(t, &group->rtpoll_triggers, node) 647 478 window_reset(&t->win, now, 648 479 group->total[PSI_POLL][t->state], 0); 649 - memcpy(group->polling_total, group->total[PSI_POLL], 650 - sizeof(group->polling_total)); 651 - group->polling_next_update = now + group->poll_min_period; 652 - } 653 - 654 - static u64 update_triggers(struct psi_group *group, u64 now) 655 - { 656 - struct psi_trigger *t; 657 - bool update_total = false; 658 - u64 *total = group->total[PSI_POLL]; 659 - 660 - /* 661 - * On subsequent updates, calculate growth deltas and let 662 - * watchers know when their specified thresholds are exceeded. 663 - */ 664 - list_for_each_entry(t, &group->triggers, node) { 665 - u64 growth; 666 - bool new_stall; 667 - 668 - new_stall = group->polling_total[t->state] != total[t->state]; 669 - 670 - /* Check for stall activity or a previous threshold breach */ 671 - if (!new_stall && !t->pending_event) 672 - continue; 673 - /* 674 - * Check for new stall activity, as well as deferred 675 - * events that occurred in the last window after the 676 - * trigger had already fired (we want to ratelimit 677 - * events without dropping any). 678 - */ 679 - if (new_stall) { 680 - /* 681 - * Multiple triggers might be looking at the same state, 682 - * remember to update group->polling_total[] once we've 683 - * been through all of them. Also remember to extend the 684 - * polling time if we see new stall activity. 
685 - */ 686 - update_total = true; 687 - 688 - /* Calculate growth since last update */ 689 - growth = window_update(&t->win, now, total[t->state]); 690 - if (!t->pending_event) { 691 - if (growth < t->threshold) 692 - continue; 693 - 694 - t->pending_event = true; 695 - } 696 - } 697 - /* Limit event signaling to once per window */ 698 - if (now < t->last_event_time + t->win.size) 699 - continue; 700 - 701 - /* Generate an event */ 702 - if (cmpxchg(&t->event, 0, 1) == 0) 703 - wake_up_interruptible(&t->event_wait); 704 - t->last_event_time = now; 705 - /* Reset threshold breach flag once event got generated */ 706 - t->pending_event = false; 707 - } 708 - 709 - if (update_total) 710 - memcpy(group->polling_total, total, 711 - sizeof(group->polling_total)); 712 - 713 - return now + group->poll_min_period; 480 + memcpy(group->rtpoll_total, group->total[PSI_POLL], 481 + sizeof(group->rtpoll_total)); 482 + group->rtpoll_next_update = now + group->rtpoll_min_period; 714 483 } 715 484 716 485 /* Schedule polling if it's not already scheduled or forced. */ 717 - static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay, 486 + static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, 718 487 bool force) 719 488 { 720 489 struct task_struct *task; 721 490 722 491 /* 723 492 * atomic_xchg should be called even when !force to provide a 724 - * full memory barrier (see the comment inside psi_poll_work). 493 + * full memory barrier (see the comment inside psi_rtpoll_work). 
725 494 */ 726 - if (atomic_xchg(&group->poll_scheduled, 1) && !force) 495 + if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force) 727 496 return; 728 497 729 498 rcu_read_lock(); 730 499 731 - task = rcu_dereference(group->poll_task); 500 + task = rcu_dereference(group->rtpoll_task); 732 501 /* 733 502 * kworker might be NULL in case psi_trigger_destroy races with 734 503 * psi_task_change (hotpath) which can't use locks 735 504 */ 736 505 if (likely(task)) 737 - mod_timer(&group->poll_timer, jiffies + delay); 506 + mod_timer(&group->rtpoll_timer, jiffies + delay); 738 507 else 739 - atomic_set(&group->poll_scheduled, 0); 508 + atomic_set(&group->rtpoll_scheduled, 0); 740 509 741 510 rcu_read_unlock(); 742 511 } 743 512 744 - static void psi_poll_work(struct psi_group *group) 513 + static void psi_rtpoll_work(struct psi_group *group) 745 514 { 746 515 bool force_reschedule = false; 747 516 u32 changed_states; 517 + bool update_total; 748 518 u64 now; 749 519 750 - mutex_lock(&group->trigger_lock); 520 + mutex_lock(&group->rtpoll_trigger_lock); 751 521 752 522 now = sched_clock(); 753 523 754 - if (now > group->polling_until) { 524 + if (now > group->rtpoll_until) { 755 525 /* 756 526 * We are either about to start or might stop polling if no 757 527 * state change was recorded. Resetting poll_scheduled leaves ··· 654 638 * should be negligible and polling_next_update still keeps 655 639 * updates correctly on schedule. 656 640 */ 657 - atomic_set(&group->poll_scheduled, 0); 641 + atomic_set(&group->rtpoll_scheduled, 0); 658 642 /* 659 643 * A task change can race with the poll worker that is supposed to 660 644 * report on it. 
To avoid missing events, ensure ordering between ··· 683 667 684 668 collect_percpu_times(group, PSI_POLL, &changed_states); 685 669 686 - if (changed_states & group->poll_states) { 670 + if (changed_states & group->rtpoll_states) { 687 671 /* Initialize trigger windows when entering polling mode */ 688 - if (now > group->polling_until) 689 - init_triggers(group, now); 672 + if (now > group->rtpoll_until) 673 + init_rtpoll_triggers(group, now); 690 674 691 675 /* 692 676 * Keep the monitor active for at least the duration of the 693 677 * minimum tracking window as long as monitor states are 694 678 * changing. 695 679 */ 696 - group->polling_until = now + 697 - group->poll_min_period * UPDATES_PER_WINDOW; 680 + group->rtpoll_until = now + 681 + group->rtpoll_min_period * UPDATES_PER_WINDOW; 698 682 } 699 683 700 - if (now > group->polling_until) { 701 - group->polling_next_update = ULLONG_MAX; 684 + if (now > group->rtpoll_until) { 685 + group->rtpoll_next_update = ULLONG_MAX; 702 686 goto out; 703 687 } 704 688 705 - if (now >= group->polling_next_update) 706 - group->polling_next_update = update_triggers(group, now); 689 + if (now >= group->rtpoll_next_update) { 690 + group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL); 691 + if (update_total) 692 + memcpy(group->rtpoll_total, group->total[PSI_POLL], 693 + sizeof(group->rtpoll_total)); 694 + } 707 695 708 - psi_schedule_poll_work(group, 709 - nsecs_to_jiffies(group->polling_next_update - now) + 1, 696 + psi_schedule_rtpoll_work(group, 697 + nsecs_to_jiffies(group->rtpoll_next_update - now) + 1, 710 698 force_reschedule); 711 699 712 700 out: 713 - mutex_unlock(&group->trigger_lock); 701 + mutex_unlock(&group->rtpoll_trigger_lock); 714 702 } 715 703 716 - static int psi_poll_worker(void *data) 704 + static int psi_rtpoll_worker(void *data) 717 705 { 718 706 struct psi_group *group = (struct psi_group *)data; 719 707 720 708 sched_set_fifo_low(current); 721 709 722 710 while (true) { 
723 - wait_event_interruptible(group->poll_wait, 724 - atomic_cmpxchg(&group->poll_wakeup, 1, 0) || 711 + wait_event_interruptible(group->rtpoll_wait, 712 + atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) || 725 713 kthread_should_stop()); 726 714 if (kthread_should_stop()) 727 715 break; 728 716 729 - psi_poll_work(group); 717 + psi_rtpoll_work(group); 730 718 } 731 719 return 0; 732 720 } 733 721 734 722 static void poll_timer_fn(struct timer_list *t) 735 723 { 736 - struct psi_group *group = from_timer(group, t, poll_timer); 724 + struct psi_group *group = from_timer(group, t, rtpoll_timer); 737 725 738 - atomic_set(&group->poll_wakeup, 1); 739 - wake_up_interruptible(&group->poll_wait); 726 + atomic_set(&group->rtpoll_wakeup, 1); 727 + wake_up_interruptible(&group->rtpoll_wait); 740 728 } 741 729 742 730 static void record_times(struct psi_group_cpu *groupc, u64 now) ··· 871 851 872 852 write_seqcount_end(&groupc->seq); 873 853 874 - if (state_mask & group->poll_states) 875 - psi_schedule_poll_work(group, 1, false); 854 + if (state_mask & group->rtpoll_states) 855 + psi_schedule_rtpoll_work(group, 1, false); 876 856 877 857 if (wake_clock && !delayed_work_pending(&group->avgs_work)) 878 858 schedule_delayed_work(&group->avgs_work, PSI_FREQ); ··· 1025 1005 1026 1006 write_seqcount_end(&groupc->seq); 1027 1007 1028 - if (group->poll_states & (1 << PSI_IRQ_FULL)) 1029 - psi_schedule_poll_work(group, 1, false); 1008 + if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) 1009 + psi_schedule_rtpoll_work(group, 1, false); 1030 1010 } while ((group = group->parent)); 1031 1011 } 1032 1012 #endif ··· 1121 1101 cancel_delayed_work_sync(&cgroup->psi->avgs_work); 1122 1102 free_percpu(cgroup->psi->pcpu); 1123 1103 /* All triggers must be removed by now */ 1124 - WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n"); 1104 + WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); 1125 1105 kfree(cgroup->psi); 1126 1106 } 1127 1107 ··· 1273 1253 } 1274 1254 1275 1255 struct 
psi_trigger *psi_trigger_create(struct psi_group *group, 1276 - char *buf, enum psi_res res) 1256 + char *buf, enum psi_res res, struct file *file) 1277 1257 { 1278 1258 struct psi_trigger *t; 1279 1259 enum psi_states state; 1280 1260 u32 threshold_us; 1261 + bool privileged; 1281 1262 u32 window_us; 1282 1263 1283 1264 if (static_branch_likely(&psi_disabled)) 1284 1265 return ERR_PTR(-EOPNOTSUPP); 1266 + 1267 + /* 1268 + * Checking the privilege here on file->f_cred implies that a privileged user 1269 + * could open the file and delegate the write to an unprivileged one. 1270 + */ 1271 + privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); 1285 1272 1286 1273 if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) 1287 1274 state = PSI_IO_SOME + res * 2; ··· 1307 1280 1308 1281 if (window_us < WINDOW_MIN_US || 1309 1282 window_us > WINDOW_MAX_US) 1283 + return ERR_PTR(-EINVAL); 1284 + 1285 + /* 1286 + * Unprivileged users can only use 2s windows so that averages aggregation 1287 + * work is used, and no RT threads need to be spawned. 1288 + */ 1289 + if (!privileged && window_us % 2000000) 1310 1290 return ERR_PTR(-EINVAL); 1311 1291 1312 1292 /* Check threshold */ ··· 1335 1301 t->last_event_time = 0; 1336 1302 init_waitqueue_head(&t->event_wait); 1337 1303 t->pending_event = false; 1304 + t->aggregator = privileged ? 
PSI_POLL : PSI_AVGS; 1338 1305 1339 - mutex_lock(&group->trigger_lock); 1306 + if (privileged) { 1307 + mutex_lock(&group->rtpoll_trigger_lock); 1340 1308 1341 - if (!rcu_access_pointer(group->poll_task)) { 1342 - struct task_struct *task; 1309 + if (!rcu_access_pointer(group->rtpoll_task)) { 1310 + struct task_struct *task; 1343 1311 1344 - task = kthread_create(psi_poll_worker, group, "psimon"); 1345 - if (IS_ERR(task)) { 1346 - kfree(t); 1347 - mutex_unlock(&group->trigger_lock); 1348 - return ERR_CAST(task); 1312 + task = kthread_create(psi_rtpoll_worker, group, "psimon"); 1313 + if (IS_ERR(task)) { 1314 + kfree(t); 1315 + mutex_unlock(&group->rtpoll_trigger_lock); 1316 + return ERR_CAST(task); 1317 + } 1318 + atomic_set(&group->rtpoll_wakeup, 0); 1319 + wake_up_process(task); 1320 + rcu_assign_pointer(group->rtpoll_task, task); 1349 1321 } 1350 - atomic_set(&group->poll_wakeup, 0); 1351 - wake_up_process(task); 1352 - rcu_assign_pointer(group->poll_task, task); 1322 + 1323 + list_add(&t->node, &group->rtpoll_triggers); 1324 + group->rtpoll_min_period = min(group->rtpoll_min_period, 1325 + div_u64(t->win.size, UPDATES_PER_WINDOW)); 1326 + group->rtpoll_nr_triggers[t->state]++; 1327 + group->rtpoll_states |= (1 << t->state); 1328 + 1329 + mutex_unlock(&group->rtpoll_trigger_lock); 1330 + } else { 1331 + mutex_lock(&group->avgs_lock); 1332 + 1333 + list_add(&t->node, &group->avg_triggers); 1334 + group->avg_nr_triggers[t->state]++; 1335 + 1336 + mutex_unlock(&group->avgs_lock); 1353 1337 } 1354 - 1355 - list_add(&t->node, &group->triggers); 1356 - group->poll_min_period = min(group->poll_min_period, 1357 - div_u64(t->win.size, UPDATES_PER_WINDOW)); 1358 - group->nr_triggers[t->state]++; 1359 - group->poll_states |= (1 << t->state); 1360 - 1361 - mutex_unlock(&group->trigger_lock); 1362 - 1363 1338 return t; 1364 1339 } 1365 1340 ··· 1392 1349 */ 1393 1350 wake_up_pollfree(&t->event_wait); 1394 1351 1395 - mutex_lock(&group->trigger_lock); 1396 - 1397 - if 
(!list_empty(&t->node)) { 1398 - struct psi_trigger *tmp; 1399 - u64 period = ULLONG_MAX; 1400 - 1401 - list_del(&t->node); 1402 - group->nr_triggers[t->state]--; 1403 - if (!group->nr_triggers[t->state]) 1404 - group->poll_states &= ~(1 << t->state); 1405 - /* reset min update period for the remaining triggers */ 1406 - list_for_each_entry(tmp, &group->triggers, node) 1407 - period = min(period, div_u64(tmp->win.size, 1408 - UPDATES_PER_WINDOW)); 1409 - group->poll_min_period = period; 1410 - /* Destroy poll_task when the last trigger is destroyed */ 1411 - if (group->poll_states == 0) { 1412 - group->polling_until = 0; 1413 - task_to_destroy = rcu_dereference_protected( 1414 - group->poll_task, 1415 - lockdep_is_held(&group->trigger_lock)); 1416 - rcu_assign_pointer(group->poll_task, NULL); 1417 - del_timer(&group->poll_timer); 1352 + if (t->aggregator == PSI_AVGS) { 1353 + mutex_lock(&group->avgs_lock); 1354 + if (!list_empty(&t->node)) { 1355 + list_del(&t->node); 1356 + group->avg_nr_triggers[t->state]--; 1418 1357 } 1358 + mutex_unlock(&group->avgs_lock); 1359 + } else { 1360 + mutex_lock(&group->rtpoll_trigger_lock); 1361 + if (!list_empty(&t->node)) { 1362 + struct psi_trigger *tmp; 1363 + u64 period = ULLONG_MAX; 1364 + 1365 + list_del(&t->node); 1366 + group->rtpoll_nr_triggers[t->state]--; 1367 + if (!group->rtpoll_nr_triggers[t->state]) 1368 + group->rtpoll_states &= ~(1 << t->state); 1369 + /* reset min update period for the remaining triggers */ 1370 + list_for_each_entry(tmp, &group->rtpoll_triggers, node) 1371 + period = min(period, div_u64(tmp->win.size, 1372 + UPDATES_PER_WINDOW)); 1373 + group->rtpoll_min_period = period; 1374 + /* Destroy rtpoll_task when the last trigger is destroyed */ 1375 + if (group->rtpoll_states == 0) { 1376 + group->rtpoll_until = 0; 1377 + task_to_destroy = rcu_dereference_protected( 1378 + group->rtpoll_task, 1379 + lockdep_is_held(&group->rtpoll_trigger_lock)); 1380 + rcu_assign_pointer(group->rtpoll_task, NULL); 1381 
+ del_timer(&group->rtpoll_timer); 1382 + } 1383 + } 1384 + mutex_unlock(&group->rtpoll_trigger_lock); 1419 1385 } 1420 1386 1421 - mutex_unlock(&group->trigger_lock); 1422 - 1423 1387 /* 1424 - * Wait for psi_schedule_poll_work RCU to complete its read-side 1388 + * Wait for psi_schedule_rtpoll_work RCU to complete its read-side 1425 1389 * critical section before destroying the trigger and optionally the 1426 - * poll_task. 1390 + * rtpoll_task. 1427 1391 */ 1428 1392 synchronize_rcu(); 1429 1393 /* 1430 - * Stop kthread 'psimon' after releasing trigger_lock to prevent a 1431 - * deadlock while waiting for psi_poll_work to acquire trigger_lock 1394 + * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent 1395 + * a deadlock while waiting for psi_rtpoll_work to acquire 1396 + * rtpoll_trigger_lock 1432 1397 */ 1433 1398 if (task_to_destroy) { 1434 1399 /* 1435 1400 * After the RCU grace period has expired, the worker 1436 - * can no longer be found through group->poll_task. 1401 + * can no longer be found through group->rtpoll_task. 
1437 1402 */ 1438 1403 kthread_stop(task_to_destroy); 1439 - atomic_set(&group->poll_scheduled, 0); 1404 + atomic_set(&group->rtpoll_scheduled, 0); 1440 1405 } 1441 1406 kfree(t); 1442 1407 } ··· 1486 1435 return psi_show(m, &psi_system, PSI_CPU); 1487 1436 } 1488 1437 1489 - static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) 1490 - { 1491 - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) 1492 - return -EPERM; 1493 - 1494 - return single_open(file, psi_show, NULL); 1495 - } 1496 - 1497 1438 static int psi_io_open(struct inode *inode, struct file *file) 1498 1439 { 1499 - return psi_open(file, psi_io_show); 1440 + return single_open(file, psi_io_show, NULL); 1500 1441 } 1501 1442 1502 1443 static int psi_memory_open(struct inode *inode, struct file *file) 1503 1444 { 1504 - return psi_open(file, psi_memory_show); 1445 + return single_open(file, psi_memory_show, NULL); 1505 1446 } 1506 1447 1507 1448 static int psi_cpu_open(struct inode *inode, struct file *file) 1508 1449 { 1509 - return psi_open(file, psi_cpu_show); 1450 + return single_open(file, psi_cpu_show, NULL); 1510 1451 } 1511 1452 1512 1453 static ssize_t psi_write(struct file *file, const char __user *user_buf, ··· 1532 1489 return -EBUSY; 1533 1490 } 1534 1491 1535 - new = psi_trigger_create(&psi_system, buf, res); 1492 + new = psi_trigger_create(&psi_system, buf, res, file); 1536 1493 if (IS_ERR(new)) { 1537 1494 mutex_unlock(&seq->lock); 1538 1495 return PTR_ERR(new); ··· 1612 1569 1613 1570 static int psi_irq_open(struct inode *inode, struct file *file) 1614 1571 { 1615 - return psi_open(file, psi_irq_show); 1572 + return single_open(file, psi_irq_show, NULL); 1616 1573 } 1617 1574 1618 1575 static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,

+23

kernel/sched/rt.c

··· 2000 2000 * the mean time, task could have 2001 2001 * migrated already or had its affinity changed. 2002 2002 * Also make sure that it wasn't scheduled on its rq. 2003 + * It is possible the task was scheduled, set 2004 + * "migrate_disabled" and then got preempted, so we must 2005 + * check the task migration disable flag here too. 2003 2006 */ 2004 2007 if (unlikely(task_rq(task) != rq || 2005 2008 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 2006 2009 task_on_cpu(rq, task) || 2007 2010 !rt_task(task) || 2011 + is_migration_disabled(task) || 2008 2012 !task_on_rq_queued(task))) { 2009 2013 2010 2014 double_unlock_balance(rq, lowest_rq); ··· 2681 2677 return 0; 2682 2678 } 2683 2679 2680 + #ifdef CONFIG_SCHED_CORE 2681 + static int task_is_throttled_rt(struct task_struct *p, int cpu) 2682 + { 2683 + struct rt_rq *rt_rq; 2684 + 2685 + #ifdef CONFIG_RT_GROUP_SCHED 2686 + rt_rq = task_group(p)->rt_rq[cpu]; 2687 + #else 2688 + rt_rq = &cpu_rq(cpu)->rt; 2689 + #endif 2690 + 2691 + return rt_rq_throttled(rt_rq); 2692 + } 2693 + #endif 2694 + 2684 2695 DEFINE_SCHED_CLASS(rt) = { 2685 2696 2686 2697 .enqueue_task = enqueue_task_rt, ··· 2728 2709 .switched_to = switched_to_rt, 2729 2710 2730 2711 .update_curr = update_curr_rt, 2712 + 2713 + #ifdef CONFIG_SCHED_CORE 2714 + .task_is_throttled = task_is_throttled_rt, 2715 + #endif 2731 2716 2732 2717 #ifdef CONFIG_UCLAMP_TASK 2733 2718 .uclamp_enabled = 1,

+212 -31

kernel/sched/sched.h

··· 2224 2224 #ifdef CONFIG_FAIR_GROUP_SCHED 2225 2225 void (*task_change_group)(struct task_struct *p); 2226 2226 #endif 2227 + 2228 + #ifdef CONFIG_SCHED_CORE 2229 + int (*task_is_throttled)(struct task_struct *p, int cpu); 2230 + #endif 2227 2231 }; 2228 2232 2229 2233 static inline void put_prev_task(struct rq *rq, struct task_struct *prev) ··· 3253 3249 } 3254 3250 3255 3251 #ifdef CONFIG_SCHED_MM_CID 3256 - static inline int __mm_cid_get(struct mm_struct *mm) 3252 + 3253 + #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ 3254 + #define MM_CID_SCAN_DELAY 100 /* 100ms */ 3255 + 3256 + extern raw_spinlock_t cid_lock; 3257 + extern int use_cid_lock; 3258 + 3259 + extern void sched_mm_cid_migrate_from(struct task_struct *t); 3260 + extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); 3261 + extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); 3262 + extern void init_sched_mm_cid(struct task_struct *t); 3263 + 3264 + static inline void __mm_cid_put(struct mm_struct *mm, int cid) 3265 + { 3266 + if (cid < 0) 3267 + return; 3268 + cpumask_clear_cpu(cid, mm_cidmask(mm)); 3269 + } 3270 + 3271 + /* 3272 + * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to 3273 + * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to 3274 + * be held to transition to other states. 3275 + * 3276 + * State transitions synchronized with cmpxchg or try_cmpxchg need to be 3277 + * consistent across cpus, which prevents use of this_cpu_cmpxchg. 
3278 + */ 3279 + static inline void mm_cid_put_lazy(struct task_struct *t) 3280 + { 3281 + struct mm_struct *mm = t->mm; 3282 + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3283 + int cid; 3284 + 3285 + lockdep_assert_irqs_disabled(); 3286 + cid = __this_cpu_read(pcpu_cid->cid); 3287 + if (!mm_cid_is_lazy_put(cid) || 3288 + !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3289 + return; 3290 + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3291 + } 3292 + 3293 + static inline int mm_cid_pcpu_unset(struct mm_struct *mm) 3294 + { 3295 + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3296 + int cid, res; 3297 + 3298 + lockdep_assert_irqs_disabled(); 3299 + cid = __this_cpu_read(pcpu_cid->cid); 3300 + for (;;) { 3301 + if (mm_cid_is_unset(cid)) 3302 + return MM_CID_UNSET; 3303 + /* 3304 + * Attempt transition from valid or lazy-put to unset. 3305 + */ 3306 + res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); 3307 + if (res == cid) 3308 + break; 3309 + cid = res; 3310 + } 3311 + return cid; 3312 + } 3313 + 3314 + static inline void mm_cid_put(struct mm_struct *mm) 3315 + { 3316 + int cid; 3317 + 3318 + lockdep_assert_irqs_disabled(); 3319 + cid = mm_cid_pcpu_unset(mm); 3320 + if (cid == MM_CID_UNSET) 3321 + return; 3322 + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3323 + } 3324 + 3325 + static inline int __mm_cid_try_get(struct mm_struct *mm) 3257 3326 { 3258 3327 struct cpumask *cpumask; 3259 3328 int cid; 3260 3329 3261 3330 cpumask = mm_cidmask(mm); 3262 - cid = cpumask_first_zero(cpumask); 3263 - if (cid >= nr_cpu_ids) 3331 + /* 3332 + * Retry finding first zero bit if the mask is temporarily 3333 + * filled. This only happens during concurrent remote-clear 3334 + * which owns a cid without holding a rq lock. 
3335 + */ 3336 + for (;;) { 3337 + cid = cpumask_first_zero(cpumask); 3338 + if (cid < nr_cpu_ids) 3339 + break; 3340 + cpu_relax(); 3341 + } 3342 + if (cpumask_test_and_set_cpu(cid, cpumask)) 3264 3343 return -1; 3265 - __cpumask_set_cpu(cid, cpumask); 3266 3344 return cid; 3267 3345 } 3268 3346 3269 - static inline void mm_cid_put(struct mm_struct *mm, int cid) 3347 + /* 3348 + * Save a snapshot of the current runqueue time of this cpu 3349 + * with the per-cpu cid value, allowing to estimate how recently it was used. 3350 + */ 3351 + static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) 3270 3352 { 3271 - lockdep_assert_irqs_disabled(); 3272 - if (cid < 0) 3273 - return; 3274 - raw_spin_lock(&mm->cid_lock); 3275 - __cpumask_clear_cpu(cid, mm_cidmask(mm)); 3276 - raw_spin_unlock(&mm->cid_lock); 3353 + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); 3354 + 3355 + lockdep_assert_rq_held(rq); 3356 + WRITE_ONCE(pcpu_cid->time, rq->clock); 3277 3357 } 3278 3358 3279 - static inline int mm_cid_get(struct mm_struct *mm) 3359 + static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) 3280 3360 { 3281 - int ret; 3361 + int cid; 3282 3362 3283 - lockdep_assert_irqs_disabled(); 3284 - raw_spin_lock(&mm->cid_lock); 3285 - ret = __mm_cid_get(mm); 3286 - raw_spin_unlock(&mm->cid_lock); 3287 - return ret; 3363 + /* 3364 + * All allocations (even those using the cid_lock) are lock-free. If 3365 + * use_cid_lock is set, hold the cid_lock to perform cid allocation to 3366 + * guarantee forward progress. 3367 + */ 3368 + if (!READ_ONCE(use_cid_lock)) { 3369 + cid = __mm_cid_try_get(mm); 3370 + if (cid >= 0) 3371 + goto end; 3372 + raw_spin_lock(&cid_lock); 3373 + } else { 3374 + raw_spin_lock(&cid_lock); 3375 + cid = __mm_cid_try_get(mm); 3376 + if (cid >= 0) 3377 + goto unlock; 3378 + } 3379 + 3380 + /* 3381 + * cid concurrently allocated. 
Retry while forcing following 3382 + * allocations to use the cid_lock to ensure forward progress. 3383 + */ 3384 + WRITE_ONCE(use_cid_lock, 1); 3385 + /* 3386 + * Set use_cid_lock before allocation. Only care about program order 3387 + * because this is only required for forward progress. 3388 + */ 3389 + barrier(); 3390 + /* 3391 + * Retry until it succeeds. It is guaranteed to eventually succeed once 3392 + * all newcoming allocations observe the use_cid_lock flag set. 3393 + */ 3394 + do { 3395 + cid = __mm_cid_try_get(mm); 3396 + cpu_relax(); 3397 + } while (cid < 0); 3398 + /* 3399 + * Allocate before clearing use_cid_lock. Only care about 3400 + * program order because this is for forward progress. 3401 + */ 3402 + barrier(); 3403 + WRITE_ONCE(use_cid_lock, 0); 3404 + unlock: 3405 + raw_spin_unlock(&cid_lock); 3406 + end: 3407 + mm_cid_snapshot_time(rq, mm); 3408 + return cid; 3288 3409 } 3289 3410 3290 - static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) 3411 + static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) 3291 3412 { 3413 + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3414 + struct cpumask *cpumask; 3415 + int cid; 3416 + 3417 + lockdep_assert_rq_held(rq); 3418 + cpumask = mm_cidmask(mm); 3419 + cid = __this_cpu_read(pcpu_cid->cid); 3420 + if (mm_cid_is_valid(cid)) { 3421 + mm_cid_snapshot_time(rq, mm); 3422 + return cid; 3423 + } 3424 + if (mm_cid_is_lazy_put(cid)) { 3425 + if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3426 + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3427 + } 3428 + cid = __mm_cid_get(rq, mm); 3429 + __this_cpu_write(pcpu_cid->cid, cid); 3430 + return cid; 3431 + } 3432 + 3433 + static inline void switch_mm_cid(struct rq *rq, 3434 + struct task_struct *prev, 3435 + struct task_struct *next) 3436 + { 3437 + /* 3438 + * Provide a memory barrier between rq->curr store and load of 3439 + * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. 
3440 + * 3441 + * Should be adapted if context_switch() is modified. 3442 + */ 3443 + if (!next->mm) { // to kernel 3444 + /* 3445 + * user -> kernel transition does not guarantee a barrier, but 3446 + * we can use the fact that it performs an atomic operation in 3447 + * mmgrab(). 3448 + */ 3449 + if (prev->mm) // from user 3450 + smp_mb__after_mmgrab(); 3451 + /* 3452 + * kernel -> kernel transition does not change rq->curr->mm 3453 + * state. It stays NULL. 3454 + */ 3455 + } else { // to user 3456 + /* 3457 + * kernel -> user transition does not provide a barrier 3458 + * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. 3459 + * Provide it here. 3460 + */ 3461 + if (!prev->mm) // from kernel 3462 + smp_mb(); 3463 + /* 3464 + * user -> user transition guarantees a memory barrier through 3465 + * switch_mm() when current->mm changes. If current->mm is 3466 + * unchanged, no barrier is needed. 3467 + */ 3468 + } 3292 3469 if (prev->mm_cid_active) { 3293 - if (next->mm_cid_active && next->mm == prev->mm) { 3294 - /* 3295 - * Context switch between threads in same mm, hand over 3296 - * the mm_cid from prev to next. 
3297 - */ 3298 - next->mm_cid = prev->mm_cid; 3299 - prev->mm_cid = -1; 3300 - return; 3301 - } 3302 - mm_cid_put(prev->mm, prev->mm_cid); 3470 + mm_cid_snapshot_time(rq, prev->mm); 3471 + mm_cid_put_lazy(prev); 3303 3472 prev->mm_cid = -1; 3304 3473 } 3305 3474 if (next->mm_cid_active) 3306 - next->mm_cid = mm_cid_get(next->mm); 3475 + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); 3307 3476 } 3308 3477 3309 3478 #else 3310 - static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } 3479 + static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } 3480 + static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } 3481 + static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } 3482 + static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } 3483 + static inline void init_sched_mm_cid(struct task_struct *t) { } 3311 3484 #endif 3312 3485 3313 3486 #endif /* _KERNEL_SCHED_SCHED_H */

+2 -2

kernel/sched/topology.c

··· 209 209 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 210 210 DEFINE_STATIC_KEY_FALSE(sched_energy_present); 211 211 static unsigned int sysctl_sched_energy_aware = 1; 212 - DEFINE_MUTEX(sched_energy_mutex); 213 - bool sched_energy_update; 212 + static DEFINE_MUTEX(sched_energy_mutex); 213 + static bool sched_energy_update; 214 214 215 215 void rebuild_sched_domains_energy(void) 216 216 {