sched_ext: Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-6.19

+2

arch/x86/include/asm/topology.h

··· 325 325 extern void arch_scale_freq_tick(void); 326 326 #define arch_scale_freq_tick arch_scale_freq_tick 327 327 328 + extern int arch_sched_node_distance(int from, int to); 329 + 328 330 #endif /* _ASM_X86_TOPOLOGY_H */

+70

arch/x86/kernel/smpboot.c

··· 515 515 set_sched_topology(topology); 516 516 } 517 517 518 + #ifdef CONFIG_NUMA 519 + static int sched_avg_remote_distance; 520 + static int avg_remote_numa_distance(void) 521 + { 522 + int i, j; 523 + int distance, nr_remote, total_distance; 524 + 525 + if (sched_avg_remote_distance > 0) 526 + return sched_avg_remote_distance; 527 + 528 + nr_remote = 0; 529 + total_distance = 0; 530 + for_each_node_state(i, N_CPU) { 531 + for_each_node_state(j, N_CPU) { 532 + distance = node_distance(i, j); 533 + 534 + if (distance >= REMOTE_DISTANCE) { 535 + nr_remote++; 536 + total_distance += distance; 537 + } 538 + } 539 + } 540 + if (nr_remote) 541 + sched_avg_remote_distance = total_distance / nr_remote; 542 + else 543 + sched_avg_remote_distance = REMOTE_DISTANCE; 544 + 545 + return sched_avg_remote_distance; 546 + } 547 + 548 + int arch_sched_node_distance(int from, int to) 549 + { 550 + int d = node_distance(from, to); 551 + 552 + switch (boot_cpu_data.x86_vfm) { 553 + case INTEL_GRANITERAPIDS_X: 554 + case INTEL_ATOM_DARKMONT_X: 555 + 556 + if (!x86_has_numa_in_package || topology_max_packages() == 1 || 557 + d < REMOTE_DISTANCE) 558 + return d; 559 + 560 + /* 561 + * With SNC enabled, there could be too many levels of remote 562 + * NUMA node distances, creating NUMA domain levels 563 + * including local nodes and partial remote nodes. 564 + * 565 + * Trim finer distance tuning for NUMA nodes in remote package 566 + * for the purpose of building sched domains. Group NUMA nodes 567 + * in the remote package in the same sched group. 568 + * Simplify NUMA domains and avoid extra NUMA levels including 569 + * different remote NUMA nodes and local nodes. 570 + * 571 + * GNR and CWF don't expect systems with more than 2 packages 572 + * and more than 2 hops between packages. Single average remote 573 + * distance won't be appropriate if there are more than 2 574 + * packages as average distance to different remote packages 575 + * could be different. 576 + */ 577 + WARN_ONCE(topology_max_packages() > 2, 578 + "sched: Expect only up to 2 packages for GNR or CWF, " 579 + "but saw %d packages when building sched domains.", 580 + topology_max_packages()); 581 + 582 + d = avg_remote_numa_distance(); 583 + } 584 + return d; 585 + } 586 + #endif /* CONFIG_NUMA */ 587 + 518 588 void set_cpu_sibling_map(int cpu) 519 589 { 520 590 bool has_smt = __max_threads_per_core > 1;

+5

include/linux/cleanup.h

··· 340 340 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ 341 341 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond 342 342 343 + #define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ 344 + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ 345 + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ 346 + { return (void *)1; } 347 + 343 348 #define __GUARD_IS_ERR(_ptr) \ 344 349 ({ \ 345 350 unsigned long _rc = (__force unsigned long)(_ptr); \

+4 -7

include/linux/sched.h

··· 637 637 #endif 638 638 } __randomize_layout; 639 639 640 - typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); 641 - typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); 640 + struct rq_flags; 641 + typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); 642 642 643 643 struct sched_dl_entity { 644 644 struct rb_node rb_node; ··· 730 730 * dl_server_update(). 731 731 * 732 732 * @rq the runqueue this server is for 733 - * 734 - * @server_has_tasks() returns true if @server_pick return a 735 - * runnable task. 736 733 */ 737 734 struct rq *rq; 738 735 dl_server_pick_f server_pick_task; ··· 1858 1861 extern int dl_bw_alloc(int cpu, u64 dl_bw); 1859 1862 extern void dl_bw_free(int cpu, u64 dl_bw); 1860 1863 1861 - /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ 1862 - extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); 1864 + /* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ 1865 + extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); 1863 1866 1864 1867 /** 1865 1868 * set_cpus_allowed_ptr - set CPU affinity mask of a task

+1 -1

kernel/cgroup/cpuset.c

··· 4180 4180 rcu_read_lock(); 4181 4181 cs_mask = task_cs(tsk)->cpus_allowed; 4182 4182 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { 4183 - do_set_cpus_allowed(tsk, cs_mask); 4183 + set_cpus_allowed_force(tsk, cs_mask); 4184 4184 changed = true; 4185 4185 } 4186 4186 rcu_read_unlock();

+5 -10

kernel/kthread.c

··· 593 593 594 594 static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) 595 595 { 596 - unsigned long flags; 597 - 598 596 if (!wait_task_inactive(p, state)) { 599 597 WARN_ON(1); 600 598 return; 601 599 } 602 600 601 + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) 602 + set_cpus_allowed_force(p, mask); 603 + 603 604 /* It's safe because the task is inactive. */ 604 - raw_spin_lock_irqsave(&p->pi_lock, flags); 605 - do_set_cpus_allowed(p, mask); 606 605 p->flags |= PF_NO_SETAFFINITY; 607 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 608 606 } 609 607 610 608 static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) ··· 855 857 { 856 858 struct kthread *kthread = to_kthread(p); 857 859 cpumask_var_t affinity; 858 - unsigned long flags; 859 860 int ret = 0; 860 861 861 862 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { ··· 879 882 list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); 880 883 kthread_fetch_affinity(kthread, affinity); 881 884 882 - /* It's safe because the task is inactive. */ 883 - raw_spin_lock_irqsave(&p->pi_lock, flags); 884 - do_set_cpus_allowed(p, affinity); 885 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 885 + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) 886 + set_cpus_allowed_force(p, affinity); 886 887 887 888 mutex_unlock(&kthreads_hotplug_lock); 888 889 out:

+164 -238

kernel/sched/core.c

··· 583 583 * 584 584 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: 585 585 * 586 - * is set by activate_task() and cleared by deactivate_task(), under 587 - * rq->lock. Non-zero indicates the task is runnable, the special 586 + * is set by activate_task() and cleared by deactivate_task()/block_task(), 587 + * under rq->lock. Non-zero indicates the task is runnable, the special 588 588 * ON_RQ_MIGRATING state is used for migration without holding both 589 589 * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). 590 590 * ··· 2089 2089 */ 2090 2090 uclamp_rq_inc(rq, p, flags); 2091 2091 2092 + rq->queue_mask |= p->sched_class->queue_mask; 2092 2093 p->sched_class->enqueue_task(rq, p, flags); 2093 2094 2094 2095 psi_enqueue(p, flags); ··· 2122 2121 * and mark the task ->sched_delayed. 2123 2122 */ 2124 2123 uclamp_rq_dec(rq, p); 2124 + rq->queue_mask |= p->sched_class->queue_mask; 2125 2125 return p->sched_class->dequeue_task(rq, p, flags); 2126 2126 } 2127 2127 ··· 2169 2167 inline int task_curr(const struct task_struct *p) 2170 2168 { 2171 2169 return cpu_curr(task_cpu(p)) == p; 2172 - } 2173 - 2174 - /* 2175 - * ->switching_to() is called with the pi_lock and rq_lock held and must not 2176 - * mess with locking. 2177 - */ 2178 - void check_class_changing(struct rq *rq, struct task_struct *p, 2179 - const struct sched_class *prev_class) 2180 - { 2181 - if (prev_class != p->sched_class && p->sched_class->switching_to) 2182 - p->sched_class->switching_to(rq, p); 2183 - } 2184 - 2185 - /* 2186 - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, 2187 - * use the balance_callback list if you want balancing. 2188 - * 2189 - * this means any call to check_class_changed() must be followed by a call to 2190 - * balance_callback(). 2191 - */ 2192 - void check_class_changed(struct rq *rq, struct task_struct *p, 2193 - const struct sched_class *prev_class, 2194 - int oldprio) 2195 - { 2196 - if (prev_class != p->sched_class) { 2197 - if (prev_class->switched_from) 2198 - prev_class->switched_from(rq, p); 2199 - 2200 - p->sched_class->switched_to(rq, p); 2201 - } else if (oldprio != p->prio || dl_task(p)) 2202 - p->sched_class->prio_changed(rq, p, oldprio); 2203 2170 } 2204 2171 2205 2172 void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) ··· 2333 2362 } 2334 2363 2335 2364 static void 2336 - __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); 2365 + do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); 2337 2366 2338 2367 static void migrate_disable_switch(struct rq *rq, struct task_struct *p) 2339 2368 { ··· 2348 2377 if (p->cpus_ptr != &p->cpus_mask) 2349 2378 return; 2350 2379 2351 - /* 2352 - * Violates locking rules! See comment in __do_set_cpus_allowed(). 2353 - */ 2354 - __do_set_cpus_allowed(p, &ac); 2380 + scoped_guard (task_rq_lock, p) 2381 + do_set_cpus_allowed(p, &ac); 2355 2382 } 2356 2383 2357 2384 void ___migrate_enable(void) ··· 2582 2613 */ 2583 2614 WARN_ON_ONCE(!pending->stop_pending); 2584 2615 preempt_disable(); 2585 - task_rq_unlock(rq, p, &rf); 2616 + rq_unlock(rq, &rf); 2617 + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2586 2618 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, 2587 2619 &pending->arg, &pending->stop_work); 2588 2620 preempt_enable(); ··· 2592 2622 out: 2593 2623 if (pending) 2594 2624 pending->stop_pending = false; 2595 - task_rq_unlock(rq, p, &rf); 2625 + rq_unlock(rq, &rf); 2626 + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2596 2627 2597 2628 if (complete) 2598 2629 complete_all(&pending->done); ··· 2664 2693 } 2665 2694 2666 2695 static void 2667 - __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2696 + do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2668 2697 { 2669 - struct rq *rq = task_rq(p); 2670 - bool queued, running; 2671 - 2672 - /* 2673 - * This here violates the locking rules for affinity, since we're only 2674 - * supposed to change these variables while holding both rq->lock and 2675 - * p->pi_lock. 2676 - * 2677 - * HOWEVER, it magically works, because ttwu() is the only code that 2678 - * accesses these variables under p->pi_lock and only does so after 2679 - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() 2680 - * before finish_task(). 2681 - * 2682 - * XXX do further audits, this smells like something putrid. 2683 - */ 2684 - if (ctx->flags & SCA_MIGRATE_DISABLE) 2685 - WARN_ON_ONCE(!p->on_cpu); 2686 - else 2687 - lockdep_assert_held(&p->pi_lock); 2688 - 2689 - queued = task_on_rq_queued(p); 2690 - running = task_current_donor(rq, p); 2691 - 2692 - if (queued) { 2693 - /* 2694 - * Because __kthread_bind() calls this on blocked tasks without 2695 - * holding rq->lock. 2696 - */ 2697 - lockdep_assert_rq_held(rq); 2698 - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 2698 + scoped_guard (sched_change, p, DEQUEUE_SAVE) { 2699 + p->sched_class->set_cpus_allowed(p, ctx); 2700 + mm_set_cpus_allowed(p->mm, ctx->new_mask); 2699 2701 } 2700 - if (running) 2701 - put_prev_task(rq, p); 2702 - 2703 - p->sched_class->set_cpus_allowed(p, ctx); 2704 - mm_set_cpus_allowed(p->mm, ctx->new_mask); 2705 - 2706 - if (queued) 2707 - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 2708 - if (running) 2709 - set_next_task(rq, p); 2710 2702 } 2711 2703 2712 2704 /* 2713 2705 * Used for kthread_bind() and select_fallback_rq(), in both cases the user 2714 2706 * affinity (if any) should be destroyed too. 2715 2707 */ 2716 - void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 2708 + void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) 2717 2709 { 2718 2710 struct affinity_context ac = { 2719 2711 .new_mask = new_mask, ··· 2688 2754 struct rcu_head rcu; 2689 2755 }; 2690 2756 2691 - __do_set_cpus_allowed(p, &ac); 2757 + scoped_guard (__task_rq_lock, p) 2758 + do_set_cpus_allowed(p, &ac); 2692 2759 2693 2760 /* 2694 2761 * Because this is called with p->pi_lock held, it is not possible ··· 2727 2792 * Use pi_lock to protect content of user_cpus_ptr 2728 2793 * 2729 2794 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent 2730 - * do_set_cpus_allowed(). 2795 + * set_cpus_allowed_force(). 2731 2796 */ 2732 2797 raw_spin_lock_irqsave(&src->pi_lock, flags); 2733 2798 if (src->user_cpus_ptr) { ··· 3055 3120 goto out; 3056 3121 } 3057 3122 3058 - __do_set_cpus_allowed(p, ctx); 3123 + do_set_cpus_allowed(p, ctx); 3059 3124 3060 3125 return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); 3061 3126 ··· 3464 3529 } 3465 3530 fallthrough; 3466 3531 case possible: 3467 - /* 3468 - * XXX When called from select_task_rq() we only 3469 - * hold p->pi_lock and again violate locking order. 3470 - * 3471 - * More yuck to audit. 3472 - */ 3473 - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); 3532 + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); 3474 3533 state = fail; 3475 3534 break; 3476 3535 case fail: ··· 3706 3777 ttwu_do_wakeup(p); 3707 3778 ret = 1; 3708 3779 } 3709 - __task_rq_unlock(rq, &rf); 3780 + __task_rq_unlock(rq, p, &rf); 3710 3781 3711 3782 return ret; 3712 3783 } ··· 4160 4231 * __schedule(). See the comment for smp_mb__after_spinlock(). 4161 4232 * 4162 4233 * Form a control-dep-acquire with p->on_rq == 0 above, to ensure 4163 - * schedule()'s deactivate_task() has 'happened' and p will no longer 4234 + * schedule()'s block_task() has 'happened' and p will no longer 4164 4235 * care about it's own p->state. See the comment in __schedule(). 4165 4236 */ 4166 4237 smp_acquire__after_ctrl_dep(); ··· 4299 4370 ret = func(p, arg); 4300 4371 4301 4372 if (rq) 4302 - rq_unlock(rq, &rf); 4373 + __task_rq_unlock(rq, p, &rf); 4303 4374 4304 4375 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 4305 4376 return ret; ··· 5845 5916 const struct sched_class *start_class = prev->sched_class; 5846 5917 const struct sched_class *class; 5847 5918 5848 - #ifdef CONFIG_SCHED_CLASS_EXT 5849 - /* 5850 - * SCX requires a balance() call before every pick_task() including when 5851 - * waking up from SCHED_IDLE. If @start_class is below SCX, start from 5852 - * SCX instead. Also, set a flag to detect missing balance() call. 5853 - */ 5854 - if (scx_enabled()) { 5855 - rq->scx.flags |= SCX_RQ_BAL_PENDING; 5856 - if (sched_class_above(&ext_sched_class, start_class)) 5857 - start_class = &ext_sched_class; 5858 - } 5859 - #endif 5860 - 5861 5919 /* 5862 5920 * We must do the balancing pass before put_prev_task(), such 5863 5921 * that when we release the rq->lock the task is in the same ··· 5888 5972 5889 5973 /* Assume the next prioritized class is idle_sched_class */ 5890 5974 if (!p) { 5891 - p = pick_task_idle(rq); 5975 + p = pick_task_idle(rq, rf); 5892 5976 put_prev_set_next_task(rq, prev, p); 5893 5977 } 5894 5978 ··· 5900 5984 5901 5985 for_each_active_class(class) { 5902 5986 if (class->pick_next_task) { 5903 - p = class->pick_next_task(rq, prev); 5987 + p = class->pick_next_task(rq, prev, rf); 5988 + if (unlikely(p == RETRY_TASK)) 5989 + goto restart; 5904 5990 if (p) 5905 5991 return p; 5906 5992 } else { 5907 - p = class->pick_task(rq); 5993 + p = class->pick_task(rq, rf); 5994 + if (unlikely(p == RETRY_TASK)) 5995 + goto restart; 5908 5996 if (p) { 5909 5997 put_prev_set_next_task(rq, prev, p); 5910 5998 return p; ··· 5938 6018 return a->core_cookie == b->core_cookie; 5939 6019 } 5940 6020 5941 - static inline struct task_struct *pick_task(struct rq *rq) 6021 + /* 6022 + * Careful; this can return RETRY_TASK, it does not include the retry-loop 6023 + * itself due to the whole SMT pick retry thing below. 6024 + */ 6025 + static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) 5942 6026 { 5943 6027 const struct sched_class *class; 5944 6028 struct task_struct *p; ··· 5950 6026 rq->dl_server = NULL; 5951 6027 5952 6028 for_each_active_class(class) { 5953 - p = class->pick_task(rq); 6029 + p = class->pick_task(rq, rf); 5954 6030 if (p) 5955 6031 return p; 5956 6032 } ··· 5965 6041 static struct task_struct * 5966 6042 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 5967 6043 { 5968 - struct task_struct *next, *p, *max = NULL; 6044 + struct task_struct *next, *p, *max; 5969 6045 const struct cpumask *smt_mask; 5970 6046 bool fi_before = false; 5971 6047 bool core_clock_updated = (rq == rq->core); ··· 6050 6126 * and there are no cookied tasks running on siblings. 6051 6127 */ 6052 6128 if (!need_sync) { 6053 - next = pick_task(rq); 6129 + restart_single: 6130 + next = pick_task(rq, rf); 6131 + if (unlikely(next == RETRY_TASK)) 6132 + goto restart_single; 6054 6133 if (!next->core_cookie) { 6055 6134 rq->core_pick = NULL; 6056 6135 rq->core_dl_server = NULL; ··· 6073 6146 * 6074 6147 * Tie-break prio towards the current CPU 6075 6148 */ 6149 + restart_multi: 6150 + max = NULL; 6076 6151 for_each_cpu_wrap(i, smt_mask, cpu) { 6077 6152 rq_i = cpu_rq(i); 6078 6153 ··· 6086 6157 if (i != cpu && (rq_i != rq->core || !core_clock_updated)) 6087 6158 update_rq_clock(rq_i); 6088 6159 6089 - rq_i->core_pick = p = pick_task(rq_i); 6160 + p = pick_task(rq_i, rf); 6161 + if (unlikely(p == RETRY_TASK)) 6162 + goto restart_multi; 6163 + 6164 + rq_i->core_pick = p; 6090 6165 rq_i->core_dl_server = rq_i->dl_server; 6091 6166 6092 6167 if (!max || prio_less(max, p, fi_before)) ··· 6112 6179 if (cookie) 6113 6180 p = sched_core_find(rq_i, cookie); 6114 6181 if (!p) 6115 - p = idle_sched_class.pick_task(rq_i); 6182 + p = idle_sched_class.pick_task(rq_i, rf); 6116 6183 } 6117 6184 6118 6185 rq_i->core_pick = p; ··· 6745 6812 6746 6813 local_irq_disable(); 6747 6814 rcu_note_context_switch(preempt); 6815 + migrate_disable_switch(rq, prev); 6748 6816 6749 6817 /* 6750 6818 * Make sure that signal_pending_state()->signal_pending() below ··· 6852 6918 */ 6853 6919 ++*switch_count; 6854 6920 6855 - migrate_disable_switch(rq, prev); 6856 6921 psi_account_irqtime(rq, prev, next); 6857 6922 psi_sched_switch(prev, next, !task_on_rq_queued(prev) || 6858 6923 prev->se.sched_delayed); ··· 7259 7326 */ 7260 7327 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) 7261 7328 { 7262 - int prio, oldprio, queued, running, queue_flag = 7329 + int prio, oldprio, queue_flag = 7263 7330 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7264 7331 const struct sched_class *prev_class, *next_class; 7265 7332 struct rq_flags rf; ··· 7321 7388 prev_class = p->sched_class; 7322 7389 next_class = __setscheduler_class(p->policy, prio); 7323 7390 7324 - if (prev_class != next_class && p->se.sched_delayed) 7325 - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 7391 + if (prev_class != next_class) 7392 + queue_flag |= DEQUEUE_CLASS; 7326 7393 7327 - queued = task_on_rq_queued(p); 7328 - running = task_current_donor(rq, p); 7329 - if (queued) 7330 - dequeue_task(rq, p, queue_flag); 7331 - if (running) 7332 - put_prev_task(rq, p); 7333 - 7334 - /* 7335 - * Boosting condition are: 7336 - * 1. -rt task is running and holds mutex A 7337 - * --> -dl task blocks on mutex A 7338 - * 7339 - * 2. -dl task is running and holds mutex A 7340 - * --> -dl task blocks on mutex A and could preempt the 7341 - * running task 7342 - */ 7343 - if (dl_prio(prio)) { 7344 - if (!dl_prio(p->normal_prio) || 7345 - (pi_task && dl_prio(pi_task->prio) && 7346 - dl_entity_preempt(&pi_task->dl, &p->dl))) { 7347 - p->dl.pi_se = pi_task->dl.pi_se; 7348 - queue_flag |= ENQUEUE_REPLENISH; 7394 + scoped_guard (sched_change, p, queue_flag) { 7395 + /* 7396 + * Boosting condition are: 7397 + * 1. -rt task is running and holds mutex A 7398 + * --> -dl task blocks on mutex A 7399 + * 7400 + * 2. -dl task is running and holds mutex A 7401 + * --> -dl task blocks on mutex A and could preempt the 7402 + * running task 7403 + */ 7404 + if (dl_prio(prio)) { 7405 + if (!dl_prio(p->normal_prio) || 7406 + (pi_task && dl_prio(pi_task->prio) && 7407 + dl_entity_preempt(&pi_task->dl, &p->dl))) { 7408 + p->dl.pi_se = pi_task->dl.pi_se; 7409 + scope->flags |= ENQUEUE_REPLENISH; 7410 + } else { 7411 + p->dl.pi_se = &p->dl; 7412 + } 7413 + } else if (rt_prio(prio)) { 7414 + if (dl_prio(oldprio)) 7415 + p->dl.pi_se = &p->dl; 7416 + if (oldprio < prio) 7417 + scope->flags |= ENQUEUE_HEAD; 7349 7418 } else { 7350 - p->dl.pi_se = &p->dl; 7419 + if (dl_prio(oldprio)) 7420 + p->dl.pi_se = &p->dl; 7421 + if (rt_prio(oldprio)) 7422 + p->rt.timeout = 0; 7351 7423 } 7352 - } else if (rt_prio(prio)) { 7353 - if (dl_prio(oldprio)) 7354 - p->dl.pi_se = &p->dl; 7355 - if (oldprio < prio) 7356 - queue_flag |= ENQUEUE_HEAD; 7357 - } else { 7358 - if (dl_prio(oldprio)) 7359 - p->dl.pi_se = &p->dl; 7360 - if (rt_prio(oldprio)) 7361 - p->rt.timeout = 0; 7424 + 7425 + p->sched_class = next_class; 7426 + p->prio = prio; 7362 7427 } 7363 - 7364 - p->sched_class = next_class; 7365 - p->prio = prio; 7366 - 7367 - check_class_changing(rq, p, prev_class); 7368 - 7369 - if (queued) 7370 - enqueue_task(rq, p, queue_flag); 7371 - if (running) 7372 - set_next_task(rq, p); 7373 - 7374 - check_class_changed(rq, p, prev_class, oldprio); 7375 7428 out_unlock: 7376 7429 /* Avoid rq from going away on us: */ 7377 7430 preempt_disable(); 7378 7431 7379 7432 rq_unpin_lock(rq, &rf); 7380 7433 __balance_callbacks(rq); 7381 - raw_spin_rq_unlock(rq); 7434 + rq_repin_lock(rq, &rf); 7435 + __task_rq_unlock(rq, p, &rf); 7382 7436 7383 7437 preempt_enable(); 7384 7438 } ··· 8004 8084 */ 8005 8085 void sched_setnuma(struct task_struct *p, int nid) 8006 8086 { 8007 - bool queued, running; 8008 - struct rq_flags rf; 8009 - struct rq *rq; 8010 - 8011 - rq = task_rq_lock(p, &rf); 8012 - queued = task_on_rq_queued(p); 8013 - running = task_current_donor(rq, p); 8014 - 8015 - if (queued) 8016 - dequeue_task(rq, p, DEQUEUE_SAVE); 8017 - if (running) 8018 - put_prev_task(rq, p); 8019 - 8020 - p->numa_preferred_nid = nid; 8021 - 8022 - if (queued) 8023 - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 8024 - if (running) 8025 - set_next_task(rq, p); 8026 - task_rq_unlock(rq, p, &rf); 8087 + guard(task_rq_lock)(p); 8088 + scoped_guard (sched_change, p, DEQUEUE_SAVE) 8089 + p->numa_preferred_nid = nid; 8027 8090 } 8028 8091 #endif /* CONFIG_NUMA_BALANCING */ 8029 8092 ··· 9108 9205 */ 9109 9206 void sched_move_task(struct task_struct *tsk, bool for_autogroup) 9110 9207 { 9111 - int queued, running, queue_flags = 9112 - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 9208 + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 9209 + bool resched = false; 9113 9210 struct rq *rq; 9114 9211 9115 9212 CLASS(task_rq_lock, rq_guard)(tsk); 9116 9213 rq = rq_guard.rq; 9117 9214 9118 - update_rq_clock(rq); 9119 - 9120 - running = task_current_donor(rq, tsk); 9121 - queued = task_on_rq_queued(tsk); 9122 - 9123 - if (queued) 9124 - dequeue_task(rq, tsk, queue_flags); 9125 - if (running) 9126 - put_prev_task(rq, tsk); 9127 - 9128 - sched_change_group(tsk); 9129 - if (!for_autogroup) 9130 - scx_cgroup_move_task(tsk); 9131 - 9132 - if (queued) 9133 - enqueue_task(rq, tsk, queue_flags); 9134 - if (running) { 9135 - set_next_task(rq, tsk); 9136 - /* 9137 - * After changing group, the running task may have joined a 9138 - * throttled one but it's still the running task. Trigger a 9139 - * resched to make sure that task can still run. 9140 - */ 9141 - resched_curr(rq); 9215 + scoped_guard (sched_change, tsk, queue_flags) { 9216 + sched_change_group(tsk); 9217 + if (!for_autogroup) 9218 + scx_cgroup_move_task(tsk); 9219 + if (scope->running) 9220 + resched = true; 9142 9221 } 9222 + 9223 + if (resched) 9224 + resched_curr(rq); 9143 9225 } 9144 9226 9145 9227 static struct cgroup_subsys_state * ··· 10780 10892 } 10781 10893 #endif /* CONFIG_SCHED_MM_CID */ 10782 10894 10783 - #ifdef CONFIG_SCHED_CLASS_EXT 10784 - void sched_deq_and_put_task(struct task_struct *p, int queue_flags, 10785 - struct sched_enq_and_set_ctx *ctx) 10895 + static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); 10896 + 10897 + struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) 10786 10898 { 10899 + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); 10900 + struct rq *rq = task_rq(p); 10901 + 10902 + /* 10903 + * Must exclusively use matched flags since this is both dequeue and 10904 + * enqueue. 10905 + */ 10906 + WARN_ON_ONCE(flags & 0xFFFF0000); 10907 + 10908 + lockdep_assert_rq_held(rq); 10909 + 10910 + if (!(flags & DEQUEUE_NOCLOCK)) { 10911 + update_rq_clock(rq); 10912 + flags |= DEQUEUE_NOCLOCK; 10913 + } 10914 + 10915 + if (flags & DEQUEUE_CLASS) { 10916 + if (p->sched_class->switching_from) 10917 + p->sched_class->switching_from(rq, p); 10918 + } 10919 + 10920 + *ctx = (struct sched_change_ctx){ 10921 + .p = p, 10922 + .flags = flags, 10923 + .queued = task_on_rq_queued(p), 10924 + .running = task_current_donor(rq, p), 10925 + }; 10926 + 10927 + if (!(flags & DEQUEUE_CLASS)) { 10928 + if (p->sched_class->get_prio) 10929 + ctx->prio = p->sched_class->get_prio(rq, p); 10930 + else 10931 + ctx->prio = p->prio; 10932 + } 10933 + 10934 + if (ctx->queued) 10935 + dequeue_task(rq, p, flags); 10936 + if (ctx->running) 10937 + put_prev_task(rq, p); 10938 + 10939 + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) 10940 + p->sched_class->switched_from(rq, p); 10941 + 10942 + return ctx; 10943 + } 10944 + 10945 + void sched_change_end(struct sched_change_ctx *ctx) 10946 + { 10947 + struct task_struct *p = ctx->p; 10787 10948 struct rq *rq = task_rq(p); 10788 10949 10789 10950 lockdep_assert_rq_held(rq); 10790 10951 10791 - *ctx = (struct sched_enq_and_set_ctx){ 10792 - .p = p, 10793 - .queue_flags = queue_flags, 10794 - .queued = task_on_rq_queued(p), 10795 - .running = task_current(rq, p), 10796 - }; 10797 - 10798 - update_rq_clock(rq); 10799 - if (ctx->queued) 10800 - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); 10801 - if (ctx->running) 10802 - put_prev_task(rq, p); 10803 - } 10804 - 10805 - void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) 10806 - { 10807 - struct rq *rq = task_rq(ctx->p); 10808 - 10809 - lockdep_assert_rq_held(rq); 10952 + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) 10953 + p->sched_class->switching_to(rq, p); 10810 10954 10811 10955 if (ctx->queued) 10812 - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); 10956 + enqueue_task(rq, p, ctx->flags); 10813 10957 if (ctx->running) 10814 - set_next_task(rq, ctx->p); 10958 + set_next_task(rq, p); 10959 + 10960 + if (ctx->flags & ENQUEUE_CLASS) { 10961 + if (p->sched_class->switched_to) 10962 + p->sched_class->switched_to(rq, p); 10963 + } else { 10964 + p->sched_class->prio_changed(rq, p, ctx->prio); 10965 + } 10815 10966 } 10816 - #endif /* CONFIG_SCHED_CLASS_EXT */

+9 -25

kernel/sched/cpudeadline.c

··· 166 166 * cpudl_clear - remove a CPU from the cpudl max-heap 167 167 * @cp: the cpudl max-heap context 168 168 * @cpu: the target CPU 169 + * @online: the online state of the deadline runqueue 169 170 * 170 171 * Notes: assumes cpu_rq(cpu)->lock is locked 171 172 * 172 173 * Returns: (void) 173 174 */ 174 - void cpudl_clear(struct cpudl *cp, int cpu) 175 + void cpudl_clear(struct cpudl *cp, int cpu, bool online) 175 176 { 176 177 int old_idx, new_cpu; 177 178 unsigned long flags; ··· 185 184 if (old_idx == IDX_INVALID) { 186 185 /* 187 186 * Nothing to remove if old_idx was invalid. 188 - * This could happen if a rq_offline_dl is 187 + * This could happen if rq_online_dl or rq_offline_dl is 189 188 * called for a CPU without -dl tasks running. 190 189 */ 191 190 } else { ··· 196 195 cp->elements[new_cpu].idx = old_idx; 197 196 cp->elements[cpu].idx = IDX_INVALID; 198 197 cpudl_heapify(cp, old_idx); 199 - 200 - cpumask_set_cpu(cpu, cp->free_cpus); 201 198 } 199 + if (likely(online)) 200 + __cpumask_set_cpu(cpu, cp->free_cpus); 201 + else 202 + __cpumask_clear_cpu(cpu, cp->free_cpus); 203 + 202 204 raw_spin_unlock_irqrestore(&cp->lock, flags); 203 205 } 204 206 ··· 232 228 cp->elements[new_idx].cpu = cpu; 233 229 cp->elements[cpu].idx = new_idx; 234 230 cpudl_heapify_up(cp, new_idx); 235 - cpumask_clear_cpu(cpu, cp->free_cpus); 231 + __cpumask_clear_cpu(cpu, cp->free_cpus); 236 232 } else { 237 233 cp->elements[old_idx].dl = dl; 238 234 cpudl_heapify(cp, old_idx); 239 235 } 240 236 241 237 raw_spin_unlock_irqrestore(&cp->lock, flags); 242 - } 243 - 244 - /* 245 - * cpudl_set_freecpu - Set the cpudl.free_cpus 246 - * @cp: the cpudl max-heap context 247 - * @cpu: rd attached CPU 248 - */ 249 - void cpudl_set_freecpu(struct cpudl *cp, int cpu) 250 - { 251 - cpumask_set_cpu(cpu, cp->free_cpus); 252 - } 253 - 254 - /* 255 - * cpudl_clear_freecpu - Clear the cpudl.free_cpus 256 - * @cp: the cpudl max-heap context 257 - * @cpu: rd attached CPU 258 - */ 259 - void cpudl_clear_freecpu(struct cpudl *cp, int cpu) 260 - { 261 - cpumask_clear_cpu(cpu, cp->free_cpus); 262 238 } 263 239 264 240 /*

+1 -3

kernel/sched/cpudeadline.h

··· 19 19 20 20 int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); 21 21 void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 22 - void cpudl_clear(struct cpudl *cp, int cpu); 22 + void cpudl_clear(struct cpudl *cp, int cpu, bool online); 23 23 int cpudl_init(struct cpudl *cp); 24 - void cpudl_set_freecpu(struct cpudl *cp, int cpu); 25 - void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 26 24 void cpudl_cleanup(struct cpudl *cp);

+26 -22

kernel/sched/deadline.c

··· 405 405 * up, and checks if the task is still in the "ACTIVE non contending" 406 406 * state or not (in the second case, it updates running_bw). 407 407 */ 408 - static void task_non_contending(struct sched_dl_entity *dl_se) 408 + static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) 409 409 { 410 410 struct hrtimer *timer = &dl_se->inactive_timer; 411 411 struct rq *rq = rq_of_dl_se(dl_se); ··· 444 444 } else { 445 445 struct task_struct *p = dl_task_of(dl_se); 446 446 447 - if (dl_task(p)) 447 + if (dl_task) 448 448 sub_running_bw(dl_se, dl_rq); 449 449 450 - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { 450 + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { 451 451 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 452 452 453 453 if (READ_ONCE(p->__state) == TASK_DEAD) ··· 1808 1808 if (!dl_rq->dl_nr_running) { 1809 1809 dl_rq->earliest_dl.curr = 0; 1810 1810 dl_rq->earliest_dl.next = 0; 1811 - cpudl_clear(&rq->rd->cpudl, rq->cpu); 1811 + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); 1812 1812 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 1813 1813 } else { 1814 1814 struct rb_node *leftmost = rb_first_cached(&dl_rq->root); ··· 2045 2045 * or "inactive") 2046 2046 */ 2047 2047 if (flags & DEQUEUE_SLEEP) 2048 - task_non_contending(dl_se); 2048 + task_non_contending(dl_se, true); 2049 2049 } 2050 2050 2051 2051 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) ··· 2352 2352 * __pick_next_task_dl - Helper to pick the next -deadline task to run. 2353 2353 * @rq: The runqueue to pick the next task from. 2354 2354 */ 2355 - static struct task_struct *__pick_task_dl(struct rq *rq) 2355 + static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) 2356 2356 { 2357 2357 struct sched_dl_entity *dl_se; 2358 2358 struct dl_rq *dl_rq = &rq->dl; ··· 2366 2366 WARN_ON_ONCE(!dl_se); 2367 2367 2368 2368 if (dl_server(dl_se)) { 2369 - p = dl_se->server_pick_task(dl_se); 2369 + p = dl_se->server_pick_task(dl_se, rf); 2370 2370 if (!p) { 2371 2371 dl_server_stop(dl_se); 2372 2372 goto again; ··· 2379 2379 return p; 2380 2380 } 2381 2381 2382 - static struct task_struct *pick_task_dl(struct rq *rq) 2382 + static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) 2383 2383 { 2384 - return __pick_task_dl(rq); 2384 + return __pick_task_dl(rq, rf); 2385 2385 } 2386 2386 2387 2387 static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) ··· 2880 2880 if (rq->dl.overloaded) 2881 2881 dl_set_overload(rq); 2882 2882 2883 - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); 2884 2883 if (rq->dl.dl_nr_running > 0) 2885 2884 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); 2885 + else 2886 + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); 2886 2887 } 2887 2888 2888 2889 /* Assumes rq->lock is held */ ··· 2892 2891 if (rq->dl.overloaded) 2893 2892 dl_clear_overload(rq); 2894 2893 2895 - cpudl_clear(&rq->rd->cpudl, rq->cpu); 2896 - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); 2894 + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); 2897 2895 } 2898 2896 2899 2897 void __init init_sched_dl_class(void) ··· 2970 2970 * will reset the task parameters. 2971 2971 */ 2972 2972 if (task_on_rq_queued(p) && p->dl.dl_runtime) 2973 - task_non_contending(&p->dl); 2973 + task_non_contending(&p->dl, false); 2974 2974 2975 2975 /* 2976 2976 * In case a task is setscheduled out from SCHED_DEADLINE we need to ··· 3042 3042 } 3043 3043 } 3044 3044 3045 + static u64 get_prio_dl(struct rq *rq, struct task_struct *p) 3046 + { 3047 + return p->dl.deadline; 3048 + } 3049 + 3045 3050 /* 3046 3051 * If the scheduling parameters of a -deadline task changed, 3047 3052 * a push or pull operation might be needed. 3048 3053 */ 3049 - static void prio_changed_dl(struct rq *rq, struct task_struct *p, 3050 - int oldprio) 3054 + static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) 3051 3055 { 3052 3056 if (!task_on_rq_queued(p)) 3053 3057 return; 3054 3058 3055 - /* 3056 - * This might be too much, but unfortunately 3057 - * we don't have the old deadline value, and 3058 - * we can't argue if the task is increasing 3059 - * or lowering its prio, so... 3060 - */ 3061 - if (!rq->dl.overloaded) 3059 + if (p->dl.deadline == old_deadline) 3060 + return; 3061 + 3062 + if (dl_time_before(old_deadline, p->dl.deadline)) 3062 3063 deadline_queue_pull_task(rq); 3063 3064 3064 3065 if (task_current_donor(rq, p)) { ··· 3092 3091 3093 3092 DEFINE_SCHED_CLASS(dl) = { 3094 3093 3094 + .queue_mask = 8, 3095 + 3095 3096 .enqueue_task = enqueue_task_dl, 3096 3097 .dequeue_task = dequeue_task_dl, 3097 3098 .yield_task = yield_task_dl, ··· 3116 3113 .task_tick = task_tick_dl, 3117 3114 .task_fork = task_fork_dl, 3118 3115 3116 + .get_prio = get_prio_dl, 3119 3117 .prio_changed = prio_changed_dl, 3120 3118 .switched_from = switched_from_dl, 3121 3119 .switched_to = switched_to_dl,

+37 -94

kernel/sched/ext.c

··· 2066 2066 2067 2067 lockdep_assert_rq_held(rq); 2068 2068 rq->scx.flags |= SCX_RQ_IN_BALANCE; 2069 - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); 2069 + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; 2070 2070 2071 2071 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && 2072 2072 unlikely(rq->scx.cpu_released)) { ··· 2170 2170 has_tasks: 2171 2171 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2172 2172 return true; 2173 - } 2174 - 2175 - static int balance_scx(struct rq *rq, struct task_struct *prev, 2176 - struct rq_flags *rf) 2177 - { 2178 - int ret; 2179 - 2180 - rq_unpin_lock(rq, rf); 2181 - 2182 - ret = balance_one(rq, prev); 2183 - 2184 - #ifdef CONFIG_SCHED_SMT 2185 - /* 2186 - * When core-sched is enabled, this ops.balance() call will be followed 2187 - * by pick_task_scx() on this CPU and the SMT siblings. Balance the 2188 - * siblings too. 2189 - */ 2190 - if (sched_core_enabled(rq)) { 2191 - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); 2192 - int scpu; 2193 - 2194 - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { 2195 - struct rq *srq = cpu_rq(scpu); 2196 - struct task_struct *sprev = srq->curr; 2197 - 2198 - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); 2199 - update_rq_clock(srq); 2200 - balance_one(srq, sprev); 2201 - } 2202 - } 2203 - #endif 2204 - rq_repin_lock(rq, rf); 2205 - 2206 - maybe_queue_balance_callback(rq); 2207 - 2208 - return ret; 2209 2173 } 2210 2174 2211 2175 static void process_ddsp_deferred_locals(struct rq *rq) ··· 2351 2387 struct task_struct, scx.dsq_list.node); 2352 2388 } 2353 2389 2354 - static struct task_struct *pick_task_scx(struct rq *rq) 2390 + static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 2355 2391 { 2356 2392 struct task_struct *prev = rq->curr; 2393 + bool keep_prev, kick_idle = false; 2357 2394 struct task_struct *p; 2358 - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 2359 - bool kick_idle = false; 2360 2395 2361 - /* 2362 - * WORKAROUND: 2363 - * 2364 - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just 2365 - * have gone through balance_scx(). Unfortunately, there currently is a 2366 - * bug where fair could say yes on balance() but no on pick_task(), 2367 - * which then ends up calling pick_task_scx() without preceding 2368 - * balance_scx(). 2369 - * 2370 - * Keep running @prev if possible and avoid stalling from entering idle 2371 - * without balancing. 2372 - * 2373 - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() 2374 - * if pick_task_scx() is called without preceding balance_scx(). 2375 - */ 2376 - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { 2377 - if (prev->scx.flags & SCX_TASK_QUEUED) { 2378 - keep_prev = true; 2379 - } else { 2380 - keep_prev = false; 2381 - kick_idle = true; 2382 - } 2383 - } else if (unlikely(keep_prev && 2384 - prev->sched_class != &ext_sched_class)) { 2385 - /* 2386 - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is 2387 - * conditional on scx_enabled() and may have been skipped. 2388 - */ 2396 + rq_modified_clear(rq); 2397 + 2398 + rq_unpin_lock(rq, rf); 2399 + balance_one(rq, prev); 2400 + rq_repin_lock(rq, rf); 2401 + 2402 + maybe_queue_balance_callback(rq); 2403 + 2404 + if (rq_modified_above(rq, &ext_sched_class)) 2405 + return RETRY_TASK; 2406 + 2407 + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 2408 + if (unlikely(keep_prev && 2409 + prev->sched_class != &ext_sched_class)) { 2389 2410 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 2390 2411 keep_prev = false; 2391 2412 } ··· 2965 3016 p, p->scx.weight); 2966 3017 } 2967 3018 2968 - static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) 3019 + static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 2969 3020 { 2970 3021 } 2971 3022 ··· 3250 3301 * their current sched_class. Call them directly from sched core instead. 3251 3302 */ 3252 3303 DEFINE_SCHED_CLASS(ext) = { 3304 + .queue_mask = 1, 3305 + 3253 3306 .enqueue_task = enqueue_task_scx, 3254 3307 .dequeue_task = dequeue_task_scx, 3255 3308 .yield_task = yield_task_scx, ··· 3259 3308 3260 3309 .wakeup_preempt = wakeup_preempt_scx, 3261 3310 3262 - .balance = balance_scx, 3263 3311 .pick_task = pick_task_scx, 3264 3312 3265 3313 .put_prev_task = put_prev_task_scx, ··· 3799 3849 */ 3800 3850 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 3801 3851 scx.runnable_node) { 3802 - struct sched_enq_and_set_ctx ctx; 3803 - 3804 3852 /* cycling deq/enq is enough, see the function comment */ 3805 - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 3806 - sched_enq_and_set_task(&ctx); 3853 + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 3854 + /* nothing */ ; 3855 + } 3807 3856 } 3808 3857 3809 3858 /* resched to restore ticks and idle state */ ··· 3952 4003 3953 4004 scx_task_iter_start(&sti); 3954 4005 while ((p = scx_task_iter_next_locked(&sti))) { 4006 + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 3955 4007 const struct sched_class *old_class = p->sched_class; 3956 4008 const struct sched_class *new_class = 3957 4009 __setscheduler_class(p->policy, p->prio); 3958 - struct sched_enq_and_set_ctx ctx; 3959 4010 3960 - if (old_class != new_class && p->se.sched_delayed) 3961 - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 4011 + update_rq_clock(task_rq(p)); 3962 4012 3963 - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4013 + if (old_class != new_class) 4014 + queue_flags |= DEQUEUE_CLASS; 3964 4015 3965 - p->sched_class = new_class; 3966 - check_class_changing(task_rq(p), p, old_class); 4016 + scoped_guard (sched_change, p, queue_flags) { 4017 + p->sched_class = new_class; 4018 + } 3967 4019 3968 - sched_enq_and_set_task(&ctx); 3969 - 3970 - check_class_changed(task_rq(p), p, old_class, p->prio); 3971 4020 scx_exit_task(p); 3972 4021 } 3973 4022 scx_task_iter_stop(&sti); ··· 4733 4786 percpu_down_write(&scx_fork_rwsem); 4734 4787 scx_task_iter_start(&sti); 4735 4788 while ((p = scx_task_iter_next_locked(&sti))) { 4789 + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 4736 4790 const struct sched_class *old_class = p->sched_class; 4737 4791 const struct sched_class *new_class = 4738 4792 __setscheduler_class(p->policy, p->prio); 4739 - struct sched_enq_and_set_ctx ctx; 4740 4793 4741 4794 if (!tryget_task_struct(p)) 4742 4795 continue; 4743 4796 4744 - if (old_class != new_class && p->se.sched_delayed) 4745 - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 4797 + if (old_class != new_class) 4798 + queue_flags |= DEQUEUE_CLASS; 4746 4799 4747 - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4800 + scoped_guard (sched_change, p, queue_flags) { 4801 + p->scx.slice = SCX_SLICE_DFL; 4802 + p->sched_class = new_class; 4803 + } 4748 4804 4749 - p->scx.slice = SCX_SLICE_DFL; 4750 - p->sched_class = new_class; 4751 - check_class_changing(task_rq(p), p, old_class); 4752 - 4753 - sched_enq_and_set_task(&ctx); 4754 - 4755 - check_class_changed(task_rq(p), p, old_class, p->prio); 4756 4805 put_task_struct(p); 4757 4806 } 4758 4807 scx_task_iter_stop(&sti);

+38 -26

kernel/sched/fair.c

··· 8705 8705 set_task_max_allowed_capacity(p); 8706 8706 } 8707 8707 8708 - static int 8709 - balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8710 - { 8711 - if (sched_fair_runnable(rq)) 8712 - return 1; 8713 - 8714 - return sched_balance_newidle(rq, rf) != 0; 8715 - } 8716 - 8717 8708 static void set_next_buddy(struct sched_entity *se) 8718 8709 { 8719 8710 for_each_sched_entity(se) { ··· 8813 8822 resched_curr_lazy(rq); 8814 8823 } 8815 8824 8816 - static struct task_struct *pick_task_fair(struct rq *rq) 8825 + static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) 8817 8826 { 8818 8827 struct sched_entity *se; 8819 8828 struct cfs_rq *cfs_rq; ··· 8857 8866 int new_tasks; 8858 8867 8859 8868 again: 8860 - p = pick_task_fair(rq); 8869 + p = pick_task_fair(rq, rf); 8861 8870 if (!p) 8862 8871 goto idle; 8863 8872 se = &p->se; ··· 8936 8945 return NULL; 8937 8946 } 8938 8947 8939 - static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) 8948 + static struct task_struct * 8949 + fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 8940 8950 { 8941 - return pick_next_task_fair(rq, prev, NULL); 8942 - } 8943 - 8944 - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) 8945 - { 8946 - return pick_task_fair(dl_se->rq); 8951 + return pick_task_fair(dl_se->rq, rf); 8947 8952 } 8948 8953 8949 8954 void fair_server_init(struct rq *rq) ··· 8994 9007 */ 8995 9008 rq_clock_skip_update(rq); 8996 9009 8997 - se->deadline += calc_delta_fair(se->slice, se); 9010 + /* 9011 + * Forfeit the remaining vruntime, only if the entity is eligible. This 9012 + * condition is necessary because in core scheduling we prefer to run 9013 + * ineligible tasks rather than force idling. If this happens we may 9014 + * end up in a loop where the core scheduler picks the yielding task, 9015 + * which yields immediately again; without the condition the vruntime 9016 + * ends up quickly running away. 9017 + */ 9018 + if (entity_eligible(cfs_rq, se)) { 9019 + se->vruntime = se->deadline; 9020 + se->deadline += calc_delta_fair(se->slice, se); 9021 + update_min_vruntime(cfs_rq); 9022 + } 8998 9023 } 8999 9024 9000 9025 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) ··· 10670 10671 if (sd->flags & SD_ASYM_CPUCAPACITY) 10671 10672 sgs->group_misfit_task_load = 1; 10672 10673 10673 - for_each_cpu(i, sched_group_span(group)) { 10674 + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { 10674 10675 struct rq *rq = cpu_rq(i); 10675 10676 unsigned int local; 10676 10677 ··· 12828 12829 } 12829 12830 rcu_read_unlock(); 12830 12831 12832 + rq_modified_clear(this_rq); 12831 12833 raw_spin_rq_unlock(this_rq); 12832 12834 12833 12835 t0 = sched_clock_cpu(this_cpu); ··· 12886 12886 if (this_rq->cfs.h_nr_queued && !pulled_task) 12887 12887 pulled_task = 1; 12888 12888 12889 - /* Is there a task of a high priority class? */ 12890 - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) 12889 + /* If a higher prio class was modified, restart the pick */ 12890 + if (rq_modified_above(this_rq, &fair_sched_class)) 12891 12891 pulled_task = -1; 12892 12892 12893 12893 out: ··· 13138 13138 * the current task. 13139 13139 */ 13140 13140 static void 13141 - prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 13141 + prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) 13142 13142 { 13143 13143 if (!task_on_rq_queued(p)) 13144 + return; 13145 + 13146 + if (p->prio == oldprio) 13144 13147 return; 13145 13148 13146 13149 if (rq->cfs.nr_queued == 1) ··· 13157 13154 if (task_current_donor(rq, p)) { 13158 13155 if (p->prio > oldprio) 13159 13156 resched_curr(rq); 13160 - } else 13157 + } else { 13161 13158 wakeup_preempt(rq, p, 0); 13159 + } 13162 13160 } 13163 13161 13164 13162 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 13239 13235 struct sched_entity *se = &p->se; 13240 13236 13241 13237 attach_entity_cfs_rq(se); 13238 + } 13239 + 13240 + static void switching_from_fair(struct rq *rq, struct task_struct *p) 13241 + { 13242 + if (p->se.sched_delayed) 13243 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 13242 13244 } 13243 13245 13244 13246 static void switched_from_fair(struct rq *rq, struct task_struct *p) ··· 13621 13611 */ 13622 13612 DEFINE_SCHED_CLASS(fair) = { 13623 13613 13614 + .queue_mask = 2, 13615 + 13624 13616 .enqueue_task = enqueue_task_fair, 13625 13617 .dequeue_task = dequeue_task_fair, 13626 13618 .yield_task = yield_task_fair, ··· 13631 13619 .wakeup_preempt = check_preempt_wakeup_fair, 13632 13620 13633 13621 .pick_task = pick_task_fair, 13634 - .pick_next_task = __pick_next_task_fair, 13622 + .pick_next_task = pick_next_task_fair, 13635 13623 .put_prev_task = put_prev_task_fair, 13636 13624 .set_next_task = set_next_task_fair, 13637 13625 13638 - .balance = balance_fair, 13639 13626 .select_task_rq = select_task_rq_fair, 13640 13627 .migrate_task_rq = migrate_task_rq_fair, 13641 13628 ··· 13649 13638 13650 13639 .reweight_task = reweight_task_fair, 13651 13640 .prio_changed = prio_changed_fair, 13641 + .switching_from = switching_from_fair, 13652 13642 .switched_from = switched_from_fair, 13653 13643 .switched_to = switched_to_fair, 13654 13644

+9 -4

kernel/sched/idle.c

··· 466 466 next->se.exec_start = rq_clock_task(rq); 467 467 } 468 468 469 - struct task_struct *pick_task_idle(struct rq *rq) 469 + struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) 470 470 { 471 471 scx_update_idle(rq, true, false); 472 472 return rq->idle; ··· 498 498 { 499 499 } 500 500 501 - static void switched_to_idle(struct rq *rq, struct task_struct *p) 501 + static void switching_to_idle(struct rq *rq, struct task_struct *p) 502 502 { 503 503 BUG(); 504 504 } 505 505 506 506 static void 507 - prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) 507 + prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) 508 508 { 509 + if (p->prio == oldprio) 510 + return; 511 + 509 512 BUG(); 510 513 } 511 514 ··· 520 517 * Simple, special scheduling class for the per-CPU idle tasks: 521 518 */ 522 519 DEFINE_SCHED_CLASS(idle) = { 520 + 521 + .queue_mask = 0, 523 522 524 523 /* no enqueue/yield_task for idle tasks */ 525 524 ··· 541 536 .task_tick = task_tick_idle, 542 537 543 538 .prio_changed = prio_changed_idle, 544 - .switched_to = switched_to_idle, 539 + .switching_to = switching_to_idle, 545 540 .update_curr = update_curr_idle, 546 541 };

+8 -3

kernel/sched/rt.c

··· 1695 1695 return rt_task_of(rt_se); 1696 1696 } 1697 1697 1698 - static struct task_struct *pick_task_rt(struct rq *rq) 1698 + static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) 1699 1699 { 1700 1700 struct task_struct *p; 1701 1701 ··· 2437 2437 * us to initiate a push or pull. 2438 2438 */ 2439 2439 static void 2440 - prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 2440 + prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) 2441 2441 { 2442 2442 if (!task_on_rq_queued(p)) 2443 + return; 2444 + 2445 + if (p->prio == oldprio) 2443 2446 return; 2444 2447 2445 2448 if (task_current_donor(rq, p)) { ··· 2569 2566 2570 2567 DEFINE_SCHED_CLASS(rt) = { 2571 2568 2569 + .queue_mask = 4, 2570 + 2572 2571 .enqueue_task = enqueue_task_rt, 2573 2572 .dequeue_task = dequeue_task_rt, 2574 2573 .yield_task = yield_task_rt, ··· 2594 2589 2595 2590 .get_rr_interval = get_rr_interval_rt, 2596 2591 2597 - .prio_changed = prio_changed_rt, 2598 2592 .switched_to = switched_to_rt, 2593 + .prio_changed = prio_changed_rt, 2599 2594 2600 2595 .update_curr = update_curr_rt, 2601 2596

+194 -54

kernel/sched/sched.h

··· 20 20 #include <linux/sched/task_flags.h> 21 21 #include <linux/sched/task.h> 22 22 #include <linux/sched/topology.h> 23 - 24 23 #include <linux/atomic.h> 25 24 #include <linux/bitmap.h> 26 25 #include <linux/bug.h> ··· 779 780 */ 780 781 SCX_RQ_ONLINE = 1 << 0, 781 782 SCX_RQ_CAN_STOP_TICK = 1 << 1, 782 - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ 783 783 SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ 784 784 SCX_RQ_BYPASSING = 1 << 4, 785 785 SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ ··· 1118 1120 /* runqueue lock: */ 1119 1121 raw_spinlock_t __lock; 1120 1122 1123 + /* Per class runqueue modification mask; bits in class order. */ 1124 + unsigned int queue_mask; 1121 1125 unsigned int nr_running; 1122 1126 #ifdef CONFIG_NUMA_BALANCING 1123 1127 unsigned int nr_numa_running; ··· 1827 1827 __acquires(p->pi_lock) 1828 1828 __acquires(rq->lock); 1829 1829 1830 - static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) 1830 + static inline void 1831 + __task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) 1831 1832 __releases(rq->lock) 1832 1833 { 1833 1834 rq_unpin_lock(rq, rf); ··· 1840 1839 __releases(rq->lock) 1841 1840 __releases(p->pi_lock) 1842 1841 { 1843 - rq_unpin_lock(rq, rf); 1844 - raw_spin_rq_unlock(rq); 1842 + __task_rq_unlock(rq, p, rf); 1845 1843 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); 1846 1844 } 1847 1845 1848 1846 DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, 1849 1847 _T->rq = task_rq_lock(_T->lock, &_T->rf), 1850 1848 task_rq_unlock(_T->rq, _T->lock, &_T->rf), 1849 + struct rq *rq; struct rq_flags rf) 1850 + 1851 + DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, 1852 + _T->rq = __task_rq_lock(_T->lock, &_T->rf), 1853 + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), 1851 1854 struct rq *rq; struct rq_flags rf) 1852 1855 1853 1856 static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ··· 2347 2342 /* 2348 2343 * {de,en}queue flags: 2349 2344 * 2350 - * DEQUEUE_SLEEP - task is no longer runnable 2351 - * ENQUEUE_WAKEUP - task just became runnable 2345 + * SLEEP/WAKEUP - task is no-longer/just-became runnable 2352 2346 * 2353 2347 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks 2354 2348 * are in a known state which allows modification. Such pairs ··· 2360 2356 * 2361 2357 * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) 2362 2358 * 2359 + * DELAYED - de/re-queue a sched_delayed task 2360 + * 2361 + * CLASS - going to update p->sched_class; makes sched_change call the 2362 + * various switch methods. 2363 + * 2363 2364 * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) 2364 2365 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) 2365 2366 * ENQUEUE_MIGRATED - the task was migrated during wakeup 2366 2367 * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called 2367 2368 * 2369 + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but 2370 + * SCHED_DEADLINE seems to rely on this for now. 2368 2371 */ 2369 2372 2370 - #define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ 2371 - #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ 2372 - #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ 2373 - #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ 2374 - #define DEQUEUE_SPECIAL 0x10 2375 - #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ 2376 - #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ 2377 - #define DEQUEUE_THROTTLE 0x800 2373 + #define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ 2374 + #define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ 2375 + #define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ 2376 + #define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ 2378 2377 2379 - #define ENQUEUE_WAKEUP 0x01 2380 - #define ENQUEUE_RESTORE 0x02 2381 - #define ENQUEUE_MOVE 0x04 2382 - #define ENQUEUE_NOCLOCK 0x08 2378 + #define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ 2379 + #define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ 2380 + #define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ 2383 2381 2384 - #define ENQUEUE_HEAD 0x10 2385 - #define ENQUEUE_REPLENISH 0x20 2386 - #define ENQUEUE_MIGRATED 0x40 2387 - #define ENQUEUE_INITIAL 0x80 2388 - #define ENQUEUE_MIGRATING 0x100 2389 - #define ENQUEUE_DELAYED 0x200 2390 - #define ENQUEUE_RQ_SELECTED 0x400 2382 + #define DEQUEUE_SPECIAL 0x00010000 2383 + #define DEQUEUE_THROTTLE 0x00020000 2384 + 2385 + #define ENQUEUE_WAKEUP 0x0001 2386 + #define ENQUEUE_RESTORE 0x0002 2387 + #define ENQUEUE_MOVE 0x0004 2388 + #define ENQUEUE_NOCLOCK 0x0008 2389 + 2390 + #define ENQUEUE_MIGRATING 0x0010 2391 + #define ENQUEUE_DELAYED 0x0020 2392 + #define ENQUEUE_CLASS 0x0040 2393 + 2394 + #define ENQUEUE_HEAD 0x00010000 2395 + #define ENQUEUE_REPLENISH 0x00020000 2396 + #define ENQUEUE_MIGRATED 0x00040000 2397 + #define ENQUEUE_INITIAL 0x00080000 2398 + #define ENQUEUE_RQ_SELECTED 0x00100000 2391 2399 2392 2400 #define RETRY_TASK ((void *)-1UL) 2393 2401 ··· 2416 2400 #ifdef CONFIG_UCLAMP_TASK 2417 2401 int uclamp_enabled; 2418 2402 #endif 2403 + /* 2404 + * idle: 0 2405 + * ext: 1 2406 + * fair: 2 2407 + * rt: 4 2408 + * dl: 8 2409 + * stop: 16 2410 + */ 2411 + unsigned int queue_mask; 2419 2412 2413 + /* 2414 + * move_queued_task/activate_task/enqueue_task: rq->lock 2415 + * ttwu_do_activate/activate_task/enqueue_task: rq->lock 2416 + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock 2417 + * ttwu_runnable/enqueue_task: task_rq_lock 2418 + * proxy_task_current: rq->lock 2419 + * sched_change_end 2420 + */ 2420 2421 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 2422 + /* 2423 + * move_queued_task/deactivate_task/dequeue_task: rq->lock 2424 + * __schedule/block_task/dequeue_task: rq->lock 2425 + * proxy_task_current: rq->lock 2426 + * wait_task_inactive: task_rq_lock 2427 + * sched_change_begin 2428 + */ 2421 2429 bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 2430 + 2431 + /* 2432 + * do_sched_yield: rq->lock 2433 + */ 2422 2434 void (*yield_task) (struct rq *rq); 2435 + /* 2436 + * yield_to: rq->lock (double) 2437 + */ 2423 2438 bool (*yield_to_task)(struct rq *rq, struct task_struct *p); 2424 2439 2440 + /* 2441 + * move_queued_task: rq->lock 2442 + * __migrate_swap_task: rq->lock 2443 + * ttwu_do_activate: rq->lock 2444 + * ttwu_runnable: task_rq_lock 2445 + * wake_up_new_task: task_rq_lock 2446 + */ 2425 2447 void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); 2426 2448 2449 + /* 2450 + * schedule/pick_next_task/prev_balance: rq->lock 2451 + */ 2427 2452 int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 2428 - struct task_struct *(*pick_task)(struct rq *rq); 2453 + 2454 + /* 2455 + * schedule/pick_next_task: rq->lock 2456 + */ 2457 + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); 2429 2458 /* 2430 2459 * Optional! When implemented pick_next_task() should be equivalent to: 2431 2460 * ··· 2480 2419 * set_next_task_first(next); 2481 2420 * } 2482 2421 */ 2483 - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); 2422 + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, 2423 + struct rq_flags *rf); 2484 2424 2425 + /* 2426 + * sched_change: 2427 + * __schedule: rq->lock 2428 + */ 2485 2429 void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); 2486 2430 void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); 2487 2431 2432 + /* 2433 + * select_task_rq: p->pi_lock 2434 + * sched_exec: p->pi_lock 2435 + */ 2488 2436 int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); 2489 2437 2438 + /* 2439 + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) 2440 + */ 2490 2441 void (*migrate_task_rq)(struct task_struct *p, int new_cpu); 2491 2442 2443 + /* 2444 + * ttwu_do_activate: rq->lock 2445 + * wake_up_new_task: task_rq_lock 2446 + */ 2492 2447 void (*task_woken)(struct rq *this_rq, struct task_struct *task); 2493 2448 2449 + /* 2450 + * do_set_cpus_allowed: task_rq_lock + sched_change 2451 + */ 2494 2452 void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); 2495 2453 2454 + /* 2455 + * sched_set_rq_{on,off}line: rq->lock 2456 + */ 2496 2457 void (*rq_online)(struct rq *rq); 2497 2458 void (*rq_offline)(struct rq *rq); 2498 2459 2460 + /* 2461 + * push_cpu_stop: p->pi_lock && rq->lock 2462 + */ 2499 2463 struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); 2500 2464 2465 + /* 2466 + * hrtick: rq->lock 2467 + * sched_tick: rq->lock 2468 + * sched_tick_remote: rq->lock 2469 + */ 2501 2470 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); 2471 + /* 2472 + * sched_cgroup_fork: p->pi_lock 2473 + */ 2502 2474 void (*task_fork)(struct task_struct *p); 2475 + /* 2476 + * finish_task_switch: no locks 2477 + */ 2503 2478 void (*task_dead)(struct task_struct *p); 2504 2479 2505 2480 /* 2506 - * The switched_from() call is allowed to drop rq->lock, therefore we 2507 - * cannot assume the switched_from/switched_to pair is serialized by 2508 - * rq->lock. They are however serialized by p->pi_lock. 2481 + * sched_change 2509 2482 */ 2510 - void (*switching_to) (struct rq *this_rq, struct task_struct *task); 2511 - void (*switched_from)(struct rq *this_rq, struct task_struct *task); 2512 - void (*switched_to) (struct rq *this_rq, struct task_struct *task); 2483 + void (*switching_from)(struct rq *this_rq, struct task_struct *task); 2484 + void (*switched_from) (struct rq *this_rq, struct task_struct *task); 2485 + void (*switching_to) (struct rq *this_rq, struct task_struct *task); 2486 + void (*switched_to) (struct rq *this_rq, struct task_struct *task); 2487 + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); 2488 + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 2489 + u64 oldprio); 2490 + 2491 + /* 2492 + * set_load_weight: task_rq_lock + sched_change 2493 + * __setscheduler_parms: task_rq_lock + sched_change 2494 + */ 2513 2495 void (*reweight_task)(struct rq *this_rq, struct task_struct *task, 2514 2496 const struct load_weight *lw); 2515 - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 2516 - int oldprio); 2517 2497 2498 + /* 2499 + * sched_rr_get_interval: task_rq_lock 2500 + */ 2518 2501 unsigned int (*get_rr_interval)(struct rq *rq, 2519 2502 struct task_struct *task); 2520 2503 2504 + /* 2505 + * task_sched_runtime: task_rq_lock 2506 + */ 2521 2507 void (*update_curr)(struct rq *rq); 2522 2508 2523 2509 #ifdef CONFIG_FAIR_GROUP_SCHED 2510 + /* 2511 + * sched_change_group: task_rq_lock + sched_change 2512 + */ 2524 2513 void (*task_change_group)(struct task_struct *p); 2525 2514 #endif 2526 2515 2527 2516 #ifdef CONFIG_SCHED_CORE 2517 + /* 2518 + * pick_next_task: rq->lock 2519 + * try_steal_cookie: rq->lock (double) 2520 + */ 2528 2521 int (*task_is_throttled)(struct task_struct *p, int cpu); 2529 2522 #endif 2530 2523 }; 2524 + 2525 + /* 2526 + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. 2527 + */ 2528 + static inline void rq_modified_clear(struct rq *rq) 2529 + { 2530 + rq->queue_mask = 0; 2531 + } 2532 + 2533 + static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) 2534 + { 2535 + unsigned int mask = class->queue_mask; 2536 + return rq->queue_mask & ~((mask << 1) - 1); 2537 + } 2531 2538 2532 2539 static inline void put_prev_task(struct rq *rq, struct task_struct *prev) 2533 2540 { ··· 2708 2579 return rq->cfs.nr_queued > 0; 2709 2580 } 2710 2581 2711 - extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 2712 - extern struct task_struct *pick_task_idle(struct rq *rq); 2582 + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, 2583 + struct rq_flags *rf); 2584 + extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); 2713 2585 2714 2586 #define SCA_CHECK 0x01 2715 2587 #define SCA_MIGRATE_DISABLE 0x02 ··· 2740 2610 static inline cpumask_t *alloc_user_cpus_ptr(int node) 2741 2611 { 2742 2612 /* 2743 - * See do_set_cpus_allowed() above for the rcu_head usage. 2613 + * See set_cpus_allowed_force() above for the rcu_head usage. 2744 2614 */ 2745 2615 int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); 2746 2616 ··· 4007 3877 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); 4008 3878 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); 4009 3879 4010 - extern void check_class_changing(struct rq *rq, struct task_struct *p, 4011 - const struct sched_class *prev_class); 4012 - extern void check_class_changed(struct rq *rq, struct task_struct *p, 4013 - const struct sched_class *prev_class, 4014 - int oldprio); 4015 - 4016 3880 extern struct balance_callback *splice_balance_callbacks(struct rq *rq); 4017 3881 extern void balance_callbacks(struct rq *rq, struct balance_callback *head); 4018 3882 4019 - #ifdef CONFIG_SCHED_CLASS_EXT 4020 3883 /* 4021 - * Used by SCX in the enable/disable paths to move tasks between sched_classes 4022 - * and establish invariants. 3884 + * The 'sched_change' pattern is the safe, easy and slow way of changing a 3885 + * task's scheduling properties. It dequeues a task, such that the scheduler 3886 + * is fully unaware of it; at which point its properties can be modified; 3887 + * after which it is enqueued again. 3888 + * 3889 + * Typically this must be called while holding task_rq_lock, since most/all 3890 + * properties are serialized under those locks. There is currently one 3891 + * exception to this rule in sched/ext which only holds rq->lock. 4023 3892 */ 4024 - struct sched_enq_and_set_ctx { 3893 + 3894 + /* 3895 + * This structure is a temporary, used to preserve/convey the queueing state 3896 + * of the task between sched_change_begin() and sched_change_end(). Ensuring 3897 + * the task's queueing state is idempotent across the operation. 3898 + */ 3899 + struct sched_change_ctx { 3900 + u64 prio; 4025 3901 struct task_struct *p; 4026 - int queue_flags; 3902 + int flags; 4027 3903 bool queued; 4028 3904 bool running; 4029 3905 }; 4030 3906 4031 - void sched_deq_and_put_task(struct task_struct *p, int queue_flags, 4032 - struct sched_enq_and_set_ctx *ctx); 4033 - void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); 3907 + struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); 3908 + void sched_change_end(struct sched_change_ctx *ctx); 4034 3909 4035 - #endif /* CONFIG_SCHED_CLASS_EXT */ 3910 + DEFINE_CLASS(sched_change, struct sched_change_ctx *, 3911 + sched_change_end(_T), 3912 + sched_change_begin(p, flags), 3913 + struct task_struct *p, unsigned int flags) 3914 + 3915 + DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) 4036 3916 4037 3917 #include "ext.h" 4038 3918

+1 -1

kernel/sched/stats.h

··· 206 206 207 207 rq = __task_rq_lock(p, &rf); 208 208 psi_task_change(p, p->psi_flags, 0); 209 - __task_rq_unlock(rq, &rf); 209 + __task_rq_unlock(rq, p, &rf); 210 210 } 211 211 } 212 212

+9 -4

kernel/sched/stop_task.c

··· 32 32 stop->se.exec_start = rq_clock_task(rq); 33 33 } 34 34 35 - static struct task_struct *pick_task_stop(struct rq *rq) 35 + static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) 36 36 { 37 37 if (!sched_stop_runnable(rq)) 38 38 return NULL; ··· 75 75 { 76 76 } 77 77 78 - static void switched_to_stop(struct rq *rq, struct task_struct *p) 78 + static void switching_to_stop(struct rq *rq, struct task_struct *p) 79 79 { 80 80 BUG(); /* its impossible to change to this class */ 81 81 } 82 82 83 83 static void 84 - prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) 84 + prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) 85 85 { 86 + if (p->prio == oldprio) 87 + return; 88 + 86 89 BUG(); /* how!?, what priority? */ 87 90 } 88 91 ··· 97 94 * Simple, special scheduling class for the per-CPU stop tasks: 98 95 */ 99 96 DEFINE_SCHED_CLASS(stop) = { 97 + 98 + .queue_mask = 16, 100 99 101 100 .enqueue_task = enqueue_task_stop, 102 101 .dequeue_task = dequeue_task_stop, ··· 117 112 .task_tick = task_tick_stop, 118 113 119 114 .prio_changed = prio_changed_stop, 120 - .switched_to = switched_to_stop, 115 + .switching_to = switching_to_stop, 121 116 .update_curr = update_curr_stop, 122 117 };

+26 -58

kernel/sched/syscalls.c

··· 64 64 65 65 void set_user_nice(struct task_struct *p, long nice) 66 66 { 67 - bool queued, running; 68 - struct rq *rq; 69 67 int old_prio; 70 68 71 69 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ··· 72 74 * We have to be careful, if called from sys_setpriority(), 73 75 * the task might be in the middle of scheduling on another CPU. 74 76 */ 75 - CLASS(task_rq_lock, rq_guard)(p); 76 - rq = rq_guard.rq; 77 - 78 - update_rq_clock(rq); 77 + guard(task_rq_lock)(p); 79 78 80 79 /* 81 80 * The RT priorities are set via sched_setscheduler(), but we still ··· 85 90 return; 86 91 } 87 92 88 - queued = task_on_rq_queued(p); 89 - running = task_current_donor(rq, p); 90 - if (queued) 91 - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 92 - if (running) 93 - put_prev_task(rq, p); 94 - 95 - p->static_prio = NICE_TO_PRIO(nice); 96 - set_load_weight(p, true); 97 - old_prio = p->prio; 98 - p->prio = effective_prio(p); 99 - 100 - if (queued) 101 - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 102 - if (running) 103 - set_next_task(rq, p); 104 - 105 - /* 106 - * If the task increased its priority or is running and 107 - * lowered its priority, then reschedule its CPU: 108 - */ 109 - p->sched_class->prio_changed(rq, p, old_prio); 93 + scoped_guard (sched_change, p, DEQUEUE_SAVE) { 94 + p->static_prio = NICE_TO_PRIO(nice); 95 + set_load_weight(p, true); 96 + old_prio = p->prio; 97 + p->prio = effective_prio(p); 98 + } 110 99 } 111 100 EXPORT_SYMBOL(set_user_nice); 112 101 ··· 494 515 bool user, bool pi) 495 516 { 496 517 int oldpolicy = -1, policy = attr->sched_policy; 497 - int retval, oldprio, newprio, queued, running; 518 + int retval, oldprio, newprio; 498 519 const struct sched_class *prev_class, *next_class; 499 520 struct balance_callback *head; 500 521 struct rq_flags rf; ··· 674 695 prev_class = p->sched_class; 675 696 next_class = __setscheduler_class(policy, newprio); 676 697 677 - if (prev_class != next_class && p->se.sched_delayed) 678 - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 698 + if (prev_class != next_class) 699 + queue_flags |= DEQUEUE_CLASS; 679 700 680 - queued = task_on_rq_queued(p); 681 - running = task_current_donor(rq, p); 682 - if (queued) 683 - dequeue_task(rq, p, queue_flags); 684 - if (running) 685 - put_prev_task(rq, p); 701 + scoped_guard (sched_change, p, queue_flags) { 686 702 687 - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { 688 - __setscheduler_params(p, attr); 689 - p->sched_class = next_class; 690 - p->prio = newprio; 703 + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { 704 + __setscheduler_params(p, attr); 705 + p->sched_class = next_class; 706 + p->prio = newprio; 707 + } 708 + __setscheduler_uclamp(p, attr); 709 + 710 + if (scope->queued) { 711 + /* 712 + * We enqueue to tail when the priority of a task is 713 + * increased (user space view). 714 + */ 715 + if (oldprio < p->prio) 716 + scope->flags |= ENQUEUE_HEAD; 717 + } 691 718 } 692 - __setscheduler_uclamp(p, attr); 693 - check_class_changing(rq, p, prev_class); 694 - 695 - if (queued) { 696 - /* 697 - * We enqueue to tail when the priority of a task is 698 - * increased (user space view). 699 - */ 700 - if (oldprio < p->prio) 701 - queue_flags |= ENQUEUE_HEAD; 702 - 703 - enqueue_task(rq, p, queue_flags); 704 - } 705 - if (running) 706 - set_next_task(rq, p); 707 - 708 - check_class_changed(rq, p, prev_class, oldprio); 709 719 710 720 /* Avoid rq from going away on us: */ 711 721 preempt_disable();

+86 -22

kernel/sched/topology.c

··· 1590 1590 #ifdef CONFIG_NUMA 1591 1591 enum numa_topology_type sched_numa_topology_type; 1592 1592 1593 + /* 1594 + * sched_domains_numa_distance is derived from sched_numa_node_distance 1595 + * and provides a simplified view of NUMA distances used specifically 1596 + * for building NUMA scheduling domains. 1597 + */ 1593 1598 static int sched_domains_numa_levels; 1599 + static int sched_numa_node_levels; 1594 1600 1595 1601 int sched_max_numa_distance; 1596 1602 static int *sched_domains_numa_distance; 1603 + static int *sched_numa_node_distance; 1597 1604 static struct cpumask ***sched_domains_numa_masks; 1598 1605 #endif /* CONFIG_NUMA */ 1599 1606 ··· 1852 1845 return true; 1853 1846 1854 1847 rcu_read_lock(); 1855 - distances = rcu_dereference(sched_domains_numa_distance); 1848 + distances = rcu_dereference(sched_numa_node_distance); 1856 1849 if (!distances) 1857 1850 goto unlock; 1858 - for (i = 0; i < sched_domains_numa_levels; i++) { 1851 + for (i = 0; i < sched_numa_node_levels; i++) { 1859 1852 if (distances[i] == distance) { 1860 1853 found = true; 1861 1854 break; ··· 1931 1924 1932 1925 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) 1933 1926 1934 - void sched_init_numa(int offline_node) 1927 + /* 1928 + * An architecture could modify its NUMA distance, to change 1929 + * grouping of NUMA nodes and number of NUMA levels when creating 1930 + * NUMA level sched domains. 1931 + * 1932 + * A NUMA level is created for each unique 1933 + * arch_sched_node_distance. 1934 + */ 1935 + static int numa_node_dist(int i, int j) 1935 1936 { 1936 - struct sched_domain_topology_level *tl; 1937 - unsigned long *distance_map; 1937 + return node_distance(i, j); 1938 + } 1939 + 1940 + int arch_sched_node_distance(int from, int to) 1941 + __weak __alias(numa_node_dist); 1942 + 1943 + static bool modified_sched_node_distance(void) 1944 + { 1945 + return numa_node_dist != arch_sched_node_distance; 1946 + } 1947 + 1948 + static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), 1949 + int **dist, int *levels) 1950 + { 1951 + unsigned long *distance_map __free(bitmap) = NULL; 1938 1952 int nr_levels = 0; 1939 1953 int i, j; 1940 1954 int *distances; 1941 - struct cpumask ***masks; 1942 1955 1943 1956 /* 1944 1957 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the ··· 1966 1939 */ 1967 1940 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); 1968 1941 if (!distance_map) 1969 - return; 1942 + return -ENOMEM; 1970 1943 1971 1944 bitmap_zero(distance_map, NR_DISTANCE_VALUES); 1972 1945 for_each_cpu_node_but(i, offline_node) { 1973 1946 for_each_cpu_node_but(j, offline_node) { 1974 - int distance = node_distance(i, j); 1947 + int distance = n_dist(i, j); 1975 1948 1976 1949 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { 1977 1950 sched_numa_warn("Invalid distance value range"); 1978 - bitmap_free(distance_map); 1979 - return; 1951 + return -EINVAL; 1980 1952 } 1981 1953 1982 1954 bitmap_set(distance_map, distance, 1); ··· 1988 1962 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); 1989 1963 1990 1964 distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); 1991 - if (!distances) { 1992 - bitmap_free(distance_map); 1993 - return; 1994 - } 1965 + if (!distances) 1966 + return -ENOMEM; 1995 1967 1996 1968 for (i = 0, j = 0; i < nr_levels; i++, j++) { 1997 1969 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); 1998 1970 distances[i] = j; 1999 1971 } 2000 - rcu_assign_pointer(sched_domains_numa_distance, distances); 1972 + *dist = distances; 1973 + *levels = nr_levels; 2001 1974 2002 - bitmap_free(distance_map); 1975 + return 0; 1976 + } 1977 + 1978 + void sched_init_numa(int offline_node) 1979 + { 1980 + struct sched_domain_topology_level *tl; 1981 + int nr_levels, nr_node_levels; 1982 + int i, j; 1983 + int *distances, *domain_distances; 1984 + struct cpumask ***masks; 1985 + 1986 + /* Record the NUMA distances from SLIT table */ 1987 + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, 1988 + &nr_node_levels)) 1989 + return; 1990 + 1991 + /* Record modified NUMA distances for building sched domains */ 1992 + if (modified_sched_node_distance()) { 1993 + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, 1994 + &domain_distances, &nr_levels)) { 1995 + kfree(distances); 1996 + return; 1997 + } 1998 + } else { 1999 + domain_distances = distances; 2000 + nr_levels = nr_node_levels; 2001 + } 2002 + rcu_assign_pointer(sched_numa_node_distance, distances); 2003 + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); 2004 + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); 2003 2005 2004 2006 /* 2005 2007 * 'nr_levels' contains the number of unique distances ··· 2045 1991 * 2046 1992 * We reset it to 'nr_levels' at the end of this function. 2047 1993 */ 1994 + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); 1995 + 2048 1996 sched_domains_numa_levels = 0; 2049 1997 2050 1998 masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); ··· 2072 2016 masks[i][j] = mask; 2073 2017 2074 2018 for_each_cpu_node_but(k, offline_node) { 2075 - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) 2019 + if (sched_debug() && 2020 + (arch_sched_node_distance(j, k) != 2021 + arch_sched_node_distance(k, j))) 2076 2022 sched_numa_warn("Node-distance not symmetric"); 2077 2023 2078 - if (node_distance(j, k) > sched_domains_numa_distance[i]) 2024 + if (arch_sched_node_distance(j, k) > 2025 + sched_domains_numa_distance[i]) 2079 2026 continue; 2080 2027 2081 2028 cpumask_or(mask, mask, cpumask_of_node(k)); ··· 2118 2059 sched_domain_topology = tl; 2119 2060 2120 2061 sched_domains_numa_levels = nr_levels; 2121 - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); 2122 2062 2123 2063 init_numa_topology_type(offline_node); 2124 2064 } ··· 2125 2067 2126 2068 static void sched_reset_numa(void) 2127 2069 { 2128 - int nr_levels, *distances; 2070 + int nr_levels, *distances, *dom_distances = NULL; 2129 2071 struct cpumask ***masks; 2130 2072 2131 2073 nr_levels = sched_domains_numa_levels; 2074 + sched_numa_node_levels = 0; 2132 2075 sched_domains_numa_levels = 0; 2133 2076 sched_max_numa_distance = 0; 2134 2077 sched_numa_topology_type = NUMA_DIRECT; 2135 - distances = sched_domains_numa_distance; 2078 + distances = sched_numa_node_distance; 2079 + if (sched_numa_node_distance != sched_domains_numa_distance) 2080 + dom_distances = sched_domains_numa_distance; 2081 + rcu_assign_pointer(sched_numa_node_distance, NULL); 2136 2082 rcu_assign_pointer(sched_domains_numa_distance, NULL); 2137 2083 masks = sched_domains_numa_masks; 2138 2084 rcu_assign_pointer(sched_domains_numa_masks, NULL); ··· 2145 2083 2146 2084 synchronize_rcu(); 2147 2085 kfree(distances); 2086 + kfree(dom_distances); 2148 2087 for (i = 0; i < nr_levels && masks; i++) { 2149 2088 if (!masks[i]) 2150 2089 continue; ··· 2192 2129 continue; 2193 2130 2194 2131 /* Set ourselves in the remote node's masks */ 2195 - if (node_distance(j, node) <= sched_domains_numa_distance[i]) 2132 + if (arch_sched_node_distance(j, node) <= 2133 + sched_domains_numa_distance[i]) 2196 2134 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 2197 2135 } 2198 2136 }