···199199 return cpumask_next_and(-1, src1p, src2p);200200}201201202202+static inline int cpumask_any_distribute(const struct cpumask *srcp)203203+{204204+ return cpumask_first(srcp);205205+}206206+202207#define for_each_cpu(cpu, mask) \203208 for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)204209#define for_each_cpu_not(cpu, mask) \···257252unsigned int cpumask_local_spread(unsigned int i, int node);258253int cpumask_any_and_distribute(const struct cpumask *src1p,259254 const struct cpumask *src2p);255255+int cpumask_any_distribute(const struct cpumask *srcp);260256261257/**262258 * for_each_cpu - iterate over every cpu in a mask
+69
include/linux/preempt.h
···322322323323#endif324324325325+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)326326+327327+/*328328+ * Migrate-Disable and why it is undesired.329329+ *330330+ * When a preempted task becomes eligible to run under the ideal model (IOW it331331+ * becomes one of the M highest priority tasks), it might still have to wait332332+ * for the preemptee's migrate_disable() section to complete. Thereby suffering333333+ * a reduction in bandwidth in the exact duration of the migrate_disable()334334+ * section.335335+ *336336+ * Per this argument, the change from preempt_disable() to migrate_disable()337337+ * gets us:338338+ *339339+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()340340+ * it would have had to wait for the lower priority task.341341+ *342342+ * - a lower priority task; which under preempt_disable() could've instantly343343+ * migrated away when another CPU becomes available, is now constrained344344+ * by the ability to push the higher priority task away, which might itself be345345+ * in a migrate_disable() section, reducing its available bandwidth.346346+ *347347+ * IOW it trades latency / moves the interference term, but it stays in the348348+ * system, and as long as it remains unbounded, the system is not fully349349+ * deterministic.350350+ *351351+ *352352+ * The reason we have it anyway.353353+ *354354+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a355355+ * number of primitives into becoming preemptible, they would also allow356356+ * migration. This turns out to break a bunch of per-cpu usage. To this end,357357+ * all these primitives employ migrate_disable() to restore this implicit358358+ * assumption.359359+ *360360+ * This is a 'temporary' work-around at best. 
The correct solution is getting361361+ * rid of the above assumptions and reworking the code to employ explicit362362+ * per-cpu locking or short preempt-disable regions.363363+ *364364+ * The end goal must be to get rid of migrate_disable(), alternatively we need365365+ * a schedulability theory that does not depend on arbitrary migration.366366+ *367367+ *368368+ * Notes on the implementation.369369+ *370370+ * The implementation is particularly tricky since existing code patterns371371+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.372372+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,373373+ * nor can it easily migrate itself into a pending affinity mask change on374374+ * migrate_enable().375375+ *376376+ *377377+ * Note: even non-work-conserving schedulers like semi-partitioned depend on378378+ * migration, so migrate_disable() is not only a problem for379379+ * work-conserving schedulers.380380+ *381381+ */382382+extern void migrate_disable(void);383383+extern void migrate_enable(void);384384+385385+#elif defined(CONFIG_PREEMPT_RT)386386+387387+static inline void migrate_disable(void) { }388388+static inline void migrate_enable(void) { }389389+390390+#else /* !CONFIG_PREEMPT_RT */391391+325392/**326393 * migrate_disable - Prevent migration of the current task327394 *···418351{419352 preempt_enable();420353}354354+355355+#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */421356422357#endif /* __LINUX_PREEMPT_H */
+5
include/linux/sched.h
···714714 int nr_cpus_allowed;715715 const cpumask_t *cpus_ptr;716716 cpumask_t cpus_mask;717717+ void *migration_pending;718718+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)719719+ unsigned short migration_disabled;720720+#endif721721+ unsigned short migration_flags;717722718723#ifdef CONFIG_PREEMPT_RCU719724 int rcu_read_lock_nesting;
+2
include/linux/sched/hotplug.h
···1111extern int sched_cpu_deactivate(unsigned int cpu);12121313#ifdef CONFIG_HOTPLUG_CPU1414+extern int sched_cpu_wait_empty(unsigned int cpu);1415extern int sched_cpu_dying(unsigned int cpu);1516#else1717+# define sched_cpu_wait_empty NULL1618# define sched_cpu_dying NULL1719#endif1820
···16021602 .name = "ap:online",16031603 },16041604 /*16051605- * Handled on controll processor until the plugged processor manages16051605+ * Handled on control processor until the plugged processor manages16061606 * this itself.16071607 */16081608 [CPUHP_TEARDOWN_CPU] = {···16111611 .teardown.single = takedown_cpu,16121612 .cant_stop = true,16131613 },16141614+16151615+ [CPUHP_AP_SCHED_WAIT_EMPTY] = {16161616+ .name = "sched:waitempty",16171617+ .startup.single = NULL,16181618+ .teardown.single = sched_cpu_wait_empty,16191619+ },16201620+16141621 /* Handle smpboot threads park/unpark */16151622 [CPUHP_AP_SMPBOOT_THREADS] = {16161623 .name = "smpboot/threads:online",
+757-199
kernel/sched/core.c
···1696169616971697#ifdef CONFIG_SMP1698169816991699+#ifdef CONFIG_PREEMPT_RT17001700+17011701+static void17021702+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);17031703+17041704+static int __set_cpus_allowed_ptr(struct task_struct *p,17051705+ const struct cpumask *new_mask,17061706+ u32 flags);17071707+17081708+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)17091709+{17101710+ if (likely(!p->migration_disabled))17111711+ return;17121712+17131713+ if (p->cpus_ptr != &p->cpus_mask)17141714+ return;17151715+17161716+ /*17171717+ * Violates locking rules! see comment in __do_set_cpus_allowed().17181718+ */17191719+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);17201720+}17211721+17221722+void migrate_disable(void)17231723+{17241724+ struct task_struct *p = current;17251725+17261726+ if (p->migration_disabled) {17271727+ p->migration_disabled++;17281728+ return;17291729+ }17301730+17311731+ preempt_disable();17321732+ this_rq()->nr_pinned++;17331733+ p->migration_disabled = 1;17341734+ preempt_enable();17351735+}17361736+EXPORT_SYMBOL_GPL(migrate_disable);17371737+17381738+void migrate_enable(void)17391739+{17401740+ struct task_struct *p = current;17411741+17421742+ if (p->migration_disabled > 1) {17431743+ p->migration_disabled--;17441744+ return;17451745+ }17461746+17471747+ /*17481748+ * Ensure stop_task runs either before or after this, and that17491749+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().17501750+ */17511751+ preempt_disable();17521752+ if (p->cpus_ptr != &p->cpus_mask)17531753+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);17541754+ /*17551755+ * Mustn't clear migration_disabled() until cpus_ptr points back at the17561756+ * regular cpus_mask, otherwise things that race (eg.17571757+ * select_fallback_rq) get confused.17581758+ */17591759+ barrier();17601760+ p->migration_disabled = 0;17611761+ this_rq()->nr_pinned--;17621762+ 
preempt_enable();17631763+}17641764+EXPORT_SYMBOL_GPL(migrate_enable);17651765+17661766+static inline bool rq_has_pinned_tasks(struct rq *rq)17671767+{17681768+ return rq->nr_pinned;17691769+}17701770+17711771+#endif17721772+16991773/*17001774 * Per-CPU kthreads are allowed to run on !active && online CPUs, see17011775 * __set_cpus_allowed_ptr() and select_fallback_rq().···17791705 if (!cpumask_test_cpu(cpu, p->cpus_ptr))17801706 return false;1781170717821782- if (is_per_cpu_kthread(p))17081708+ if (is_per_cpu_kthread(p) || is_migration_disabled(p))17831709 return cpu_online(cpu);1784171017851711 return cpu_active(cpu);···18241750}1825175118261752struct migration_arg {18271827- struct task_struct *task;18281828- int dest_cpu;17531753+ struct task_struct *task;17541754+ int dest_cpu;17551755+ struct set_affinity_pending *pending;17561756+};17571757+17581758+struct set_affinity_pending {17591759+ refcount_t refs;17601760+ struct completion done;17611761+ struct cpu_stop_work stop_work;17621762+ struct migration_arg arg;18291763};1830176418311765/*···18651783 */18661784static int migration_cpu_stop(void *data)18671785{17861786+ struct set_affinity_pending *pending;18681787 struct migration_arg *arg = data;18691788 struct task_struct *p = arg->task;17891789+ int dest_cpu = arg->dest_cpu;18701790 struct rq *rq = this_rq();17911791+ bool complete = false;18711792 struct rq_flags rf;1872179318731794 /*18741795 * The original target CPU might have gone down and we might18751796 * be on another CPU but it doesn't matter.18761797 */18771877- local_irq_disable();17981798+ local_irq_save(rf.flags);18781799 /*18791800 * We need to explicitly wake pending tasks before running18801801 * __migrate_task() such that we will not miss enforcing cpus_ptr···1887180218881803 raw_spin_lock(&p->pi_lock);18891804 rq_lock(rq, &rf);18051805+18061806+ pending = p->migration_pending;18901807 /*18911808 * If task_rq(p) != rq, it cannot be migrated here, because we're18921809 * holding rq->lock, 
if p->on_rq == 0 it cannot get enqueued because18931810 * we're holding p->pi_lock.18941811 */18951812 if (task_rq(p) == rq) {18961896- if (task_on_rq_queued(p))18971897- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);18981898- else18991899- p->wake_cpu = arg->dest_cpu;19001900- }19011901- rq_unlock(rq, &rf);19021902- raw_spin_unlock(&p->pi_lock);18131813+ if (is_migration_disabled(p))18141814+ goto out;1903181519041904- local_irq_enable();18161816+ if (pending) {18171817+ p->migration_pending = NULL;18181818+ complete = true;18191819+ }18201820+18211821+ /* migrate_enable() -- we must not race against SCA */18221822+ if (dest_cpu < 0) {18231823+ /*18241824+ * When this was migrate_enable() but we no longer18251825+ * have a @pending, a concurrent SCA 'fixed' things18261826+ * and we should be valid again. Nothing to do.18271827+ */18281828+ if (!pending) {18291829+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));18301830+ goto out;18311831+ }18321832+18331833+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);18341834+ }18351835+18361836+ if (task_on_rq_queued(p))18371837+ rq = __migrate_task(rq, &rf, p, dest_cpu);18381838+ else18391839+ p->wake_cpu = dest_cpu;18401840+18411841+ } else if (dest_cpu < 0) {18421842+ /*18431843+ * This happens when we get migrated between migrate_enable()'s18441844+ * preempt_enable() and scheduling the stopper task. At that18451845+ * point we're a regular task again and not current anymore.18461846+ *18471847+ * A !PREEMPT kernel has a giant hole here, which makes it far18481848+ * more likely.18491849+ */18501850+18511851+ /*18521852+ * When this was migrate_enable() but we no longer have an18531853+ * @pending, a concurrent SCA 'fixed' things and we should be18541854+ * valid again. 
Nothing to do.18551855+ */18561856+ if (!pending) {18571857+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));18581858+ goto out;18591859+ }18601860+18611861+ /*18621862+ * When migrate_enable() hits a rq mis-match we can't reliably18631863+ * determine is_migration_disabled() and so have to chase after18641864+ * it.18651865+ */18661866+ task_rq_unlock(rq, p, &rf);18671867+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,18681868+ &pending->arg, &pending->stop_work);18691869+ return 0;18701870+ }18711871+out:18721872+ task_rq_unlock(rq, p, &rf);18731873+18741874+ if (complete)18751875+ complete_all(&pending->done);18761876+18771877+ /* For pending->{arg,stop_work} */18781878+ pending = arg->pending;18791879+ if (pending && refcount_dec_and_test(&pending->refs))18801880+ wake_up_var(&pending->refs);18811881+18821882+ return 0;18831883+}18841884+18851885+int push_cpu_stop(void *arg)18861886+{18871887+ struct rq *lowest_rq = NULL, *rq = this_rq();18881888+ struct task_struct *p = arg;18891889+18901890+ raw_spin_lock_irq(&p->pi_lock);18911891+ raw_spin_lock(&rq->lock);18921892+18931893+ if (task_rq(p) != rq)18941894+ goto out_unlock;18951895+18961896+ if (is_migration_disabled(p)) {18971897+ p->migration_flags |= MDF_PUSH;18981898+ goto out_unlock;18991899+ }19001900+19011901+ p->migration_flags &= ~MDF_PUSH;19021902+19031903+ if (p->sched_class->find_lock_rq)19041904+ lowest_rq = p->sched_class->find_lock_rq(p, rq);19051905+19061906+ if (!lowest_rq)19071907+ goto out_unlock;19081908+19091909+ // XXX validate p is still the highest prio task19101910+ if (task_rq(p) == rq) {19111911+ deactivate_task(rq, p, 0);19121912+ set_task_cpu(p, lowest_rq->cpu);19131913+ activate_task(lowest_rq, p, 0);19141914+ resched_curr(lowest_rq);19151915+ }19161916+19171917+ double_unlock_balance(rq, lowest_rq);19181918+19191919+out_unlock:19201920+ rq->push_busy = false;19211921+ raw_spin_unlock(&rq->lock);19221922+ raw_spin_unlock_irq(&p->pi_lock);19231923+19241924+ 
put_task_struct(p);19051925 return 0;19061926}19071927···20141824 * sched_class::set_cpus_allowed must do the below, but is not required to20151825 * actually call this function.20161826 */20172017-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)18271827+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)20181828{18291829+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {18301830+ p->cpus_ptr = new_mask;18311831+ return;18321832+ }18331833+20191834 cpumask_copy(&p->cpus_mask, new_mask);20201835 p->nr_cpus_allowed = cpumask_weight(new_mask);20211836}2022183720232023-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)18381838+static void18391839+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)20241840{20251841 struct rq *rq = task_rq(p);20261842 bool queued, running;2027184320282028- lockdep_assert_held(&p->pi_lock);18441844+ /*18451845+ * This here violates the locking rules for affinity, since we're only18461846+ * supposed to change these variables while holding both rq->lock and18471847+ * p->pi_lock.18481848+ *18491849+ * HOWEVER, it magically works, because ttwu() is the only code that18501850+ * accesses these variables under p->pi_lock and only does so after18511851+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()18521852+ * before finish_task().18531853+ *18541854+ * XXX do further audits, this smells like something putrid.18551855+ */18561856+ if (flags & SCA_MIGRATE_DISABLE)18571857+ SCHED_WARN_ON(!p->on_cpu);18581858+ else18591859+ lockdep_assert_held(&p->pi_lock);2029186020301861 queued = task_on_rq_queued(p);20311862 running = task_current(rq, p);···20621851 if (running)20631852 put_prev_task(rq, p);2064185320652065- p->sched_class->set_cpus_allowed(p, new_mask);18541854+ p->sched_class->set_cpus_allowed(p, new_mask, flags);2066185520671856 if (queued)20681857 enqueue_task(rq, p, 
ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);20691858 if (running)20701859 set_next_task(rq, p);18601860+}18611861+18621862+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)18631863+{18641864+ __do_set_cpus_allowed(p, new_mask, 0);18651865+}18661866+18671867+/*18681868+ * This function is wildly self concurrent; here be dragons.18691869+ *18701870+ *18711871+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the18721872+ * designated task is enqueued on an allowed CPU. If that task is currently18731873+ * running, we have to kick it out using the CPU stopper.18741874+ *18751875+ * Migrate-Disable comes along and tramples all over our nice sandcastle.18761876+ * Consider:18771877+ *18781878+ * Initial conditions: P0->cpus_mask = [0, 1]18791879+ *18801880+ * P0@CPU0 P118811881+ *18821882+ * migrate_disable();18831883+ * <preempted>18841884+ * set_cpus_allowed_ptr(P0, [1]);18851885+ *18861886+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes18871887+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).18881888+ * This means we need the following scheme:18891889+ *18901890+ * P0@CPU0 P118911891+ *18921892+ * migrate_disable();18931893+ * <preempted>18941894+ * set_cpus_allowed_ptr(P0, [1]);18951895+ * <blocks>18961896+ * <resumes>18971897+ * migrate_enable();18981898+ * __set_cpus_allowed_ptr();18991899+ * <wakes local stopper>19001900+ * `--> <woken on migration completion>19011901+ *19021902+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple19031903+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any19041904+ * task p are serialized by p->pi_lock, which we can leverage: the one that19051905+ * should come into effect at the end of the Migrate-Disable region is the last19061906+ * one. This means we only need to track a single cpumask (i.e. 
p->cpus_mask),19071907+ * but we still need to properly signal those waiting tasks at the appropriate19081908+ * moment.19091909+ *19101910+ * This is implemented using struct set_affinity_pending. The first19111911+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will19121912+ * setup an instance of that struct and install it on the targeted task_struct.19131913+ * Any and all further callers will reuse that instance. Those then wait for19141914+ * a completion signaled at the tail of the CPU stopper callback (1), triggered19151915+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).19161916+ *19171917+ *19181918+ * (1) In the cases covered above. There is one more where the completion is19191919+ * signaled within affine_move_task() itself: when a subsequent affinity request19201920+ * cancels the need for an active migration. Consider:19211921+ *19221922+ * Initial conditions: P0->cpus_mask = [0, 1]19231923+ *19241924+ * P0@CPU0 P1 P219251925+ *19261926+ * migrate_disable();19271927+ * <preempted>19281928+ * set_cpus_allowed_ptr(P0, [1]);19291929+ * <blocks>19301930+ * set_cpus_allowed_ptr(P0, [0, 1]);19311931+ * <signal completion>19321932+ * <awakes>19331933+ *19341934+ * Note that the above is safe vs a concurrent migrate_enable(), as any19351935+ * pending affinity completion is preceded by an uninstallation of19361936+ * p->migration_pending done with p->pi_lock held.19371937+ */19381938+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,19391939+ int dest_cpu, unsigned int flags)19401940+{19411941+ struct set_affinity_pending my_pending = { }, *pending = NULL;19421942+ struct migration_arg arg = {19431943+ .task = p,19441944+ .dest_cpu = dest_cpu,19451945+ };19461946+ bool complete = false;19471947+19481948+ /* Can the task run on the task's current CPU? 
If so, we're done */19491949+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {19501950+ struct task_struct *push_task = NULL;19511951+19521952+ if ((flags & SCA_MIGRATE_ENABLE) &&19531953+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {19541954+ rq->push_busy = true;19551955+ push_task = get_task_struct(p);19561956+ }19571957+19581958+ pending = p->migration_pending;19591959+ if (pending) {19601960+ refcount_inc(&pending->refs);19611961+ p->migration_pending = NULL;19621962+ complete = true;19631963+ }19641964+ task_rq_unlock(rq, p, rf);19651965+19661966+ if (push_task) {19671967+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,19681968+ p, &rq->push_work);19691969+ }19701970+19711971+ if (complete)19721972+ goto do_complete;19731973+19741974+ return 0;19751975+ }19761976+19771977+ if (!(flags & SCA_MIGRATE_ENABLE)) {19781978+ /* serialized by p->pi_lock */19791979+ if (!p->migration_pending) {19801980+ /* Install the request */19811981+ refcount_set(&my_pending.refs, 1);19821982+ init_completion(&my_pending.done);19831983+ p->migration_pending = &my_pending;19841984+ } else {19851985+ pending = p->migration_pending;19861986+ refcount_inc(&pending->refs);19871987+ }19881988+ }19891989+ pending = p->migration_pending;19901990+ /*19911991+ * - !MIGRATE_ENABLE:19921992+ * we'll have installed a pending if there wasn't one already.19931993+ *19941994+ * - MIGRATE_ENABLE:19951995+ * we're here because the current CPU isn't matching anymore,19961996+ * the only way that can happen is because of a concurrent19971997+ * set_cpus_allowed_ptr() call, which should then still be19981998+ * pending completion.19991999+ *20002000+ * Either way, we really should have a @pending here.20012001+ */20022002+ if (WARN_ON_ONCE(!pending)) {20032003+ task_rq_unlock(rq, p, rf);20042004+ return -EINVAL;20052005+ }20062006+20072007+ if (flags & SCA_MIGRATE_ENABLE) {20082008+20092009+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */20102010+ p->migration_flags &= 
~MDF_PUSH;20112011+ task_rq_unlock(rq, p, rf);20122012+20132013+ pending->arg = (struct migration_arg) {20142014+ .task = p,20152015+ .dest_cpu = -1,20162016+ .pending = pending,20172017+ };20182018+20192019+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,20202020+ &pending->arg, &pending->stop_work);20212021+20222022+ return 0;20232023+ }20242024+20252025+ if (task_running(rq, p) || p->state == TASK_WAKING) {20262026+ /*20272027+ * Lessen races (and headaches) by delegating20282028+ * is_migration_disabled(p) checks to the stopper, which will20292029+ * run on the same CPU as said p.20302030+ */20312031+ task_rq_unlock(rq, p, rf);20322032+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);20332033+20342034+ } else {20352035+20362036+ if (!is_migration_disabled(p)) {20372037+ if (task_on_rq_queued(p))20382038+ rq = move_queued_task(rq, rf, p, dest_cpu);20392039+20402040+ p->migration_pending = NULL;20412041+ complete = true;20422042+ }20432043+ task_rq_unlock(rq, p, rf);20442044+20452045+do_complete:20462046+ if (complete)20472047+ complete_all(&pending->done);20482048+ }20492049+20502050+ wait_for_completion(&pending->done);20512051+20522052+ if (refcount_dec_and_test(&pending->refs))20532053+ wake_up_var(&pending->refs);20542054+20552055+ /*20562056+ * Block the original owner of &pending until all subsequent callers20572057+ * have seen the completion and decremented the refcount20582058+ */20592059+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));20602060+20612061+ return 0;20712062}2072206320732064/*···22821869 * call is not atomic; no spinlocks may be held.22831870 */22841871static int __set_cpus_allowed_ptr(struct task_struct *p,22852285- const struct cpumask *new_mask, bool check)18721872+ const struct cpumask *new_mask,18731873+ u32 flags)22861874{22871875 const struct cpumask *cpu_valid_mask = cpu_active_mask;22881876 unsigned int dest_cpu;···22941880 rq = task_rq_lock(p, &rf);22951881 update_rq_clock(rq);2296188222972297- if 
(p->flags & PF_KTHREAD) {18831883+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {22981884 /*22992299- * Kernel threads are allowed on online && !active CPUs18851885+ * Kernel threads are allowed on online && !active CPUs.18861886+ *18871887+ * Specifically, migration_disabled() tasks must not fail the18881888+ * cpumask_any_and_distribute() pick below, esp. so on18891889+ * SCA_MIGRATE_ENABLE, otherwise we'll not call18901890+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.23001891 */23011892 cpu_valid_mask = cpu_online_mask;23021893 }···23101891 * Must re-check here, to close a race against __kthread_bind(),23111892 * sched_setaffinity() is not guaranteed to observe the flag.23121893 */23132313- if (check && (p->flags & PF_NO_SETAFFINITY)) {18941894+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {23141895 ret = -EINVAL;23151896 goto out;23161897 }2317189823182318- if (cpumask_equal(&p->cpus_mask, new_mask))23192319- goto out;18991899+ if (!(flags & SCA_MIGRATE_ENABLE)) {19001900+ if (cpumask_equal(&p->cpus_mask, new_mask))19011901+ goto out;19021902+19031903+ if (WARN_ON_ONCE(p == current &&19041904+ is_migration_disabled(p) &&19051905+ !cpumask_test_cpu(task_cpu(p), new_mask))) {19061906+ ret = -EBUSY;19071907+ goto out;19081908+ }19091909+ }2320191023211911 /*23221912 * Picking a ~random cpu helps in cases where we are changing affinity···23381910 goto out;23391911 }2340191223412341- do_set_cpus_allowed(p, new_mask);19131913+ __do_set_cpus_allowed(p, new_mask, flags);2342191423431915 if (p->flags & PF_KTHREAD) {23441916 /*···23501922 p->nr_cpus_allowed != 1);23511923 }2352192423532353- /* Can the task run on the task's current CPU? 
If so, we're done */23542354- if (cpumask_test_cpu(task_cpu(p), new_mask))23552355- goto out;19251925+ return affine_move_task(rq, p, &rf, dest_cpu, flags);2356192623572357- if (task_running(rq, p) || p->state == TASK_WAKING) {23582358- struct migration_arg arg = { p, dest_cpu };23592359- /* Need help from migration thread: drop lock and wait. */23602360- task_rq_unlock(rq, p, &rf);23612361- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);23622362- return 0;23632363- } else if (task_on_rq_queued(p)) {23642364- /*23652365- * OK, since we're going to drop the lock immediately23662366- * afterwards anyway.23672367- */23682368- rq = move_queued_task(rq, &rf, p, dest_cpu);23692369- }23701927out:23711928 task_rq_unlock(rq, p, &rf);23721929···2360194723611948int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)23621949{23632363- return __set_cpus_allowed_ptr(p, new_mask, false);19501950+ return __set_cpus_allowed_ptr(p, new_mask, 0);23641951}23651952EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);23661953···24011988 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.24021989 */24031990 WARN_ON_ONCE(!cpu_online(new_cpu));19911991+19921992+ WARN_ON_ONCE(is_migration_disabled(p));24041993#endif2405199424061995 trace_sched_migrate_task(p, new_cpu);···27332318 }27342319 fallthrough;27352320 case possible:23212321+ /*23222322+ * XXX When called from select_task_rq() we only23232323+ * hold p->pi_lock and again violate locking order.23242324+ *23252325+ * More yuck to audit.23262326+ */27362327 do_set_cpus_allowed(p, cpu_possible_mask);27372328 state = fail;27382329 break;···27732352{27742353 lockdep_assert_held(&p->pi_lock);2775235427762776- if (p->nr_cpus_allowed > 1)23552355+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))27772356 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);27782357 else27792358 cpu = cpumask_any(p->cpus_ptr);···2796237527972376void sched_set_stop_task(int cpu, struct task_struct 
*stop)27982377{23782378+ static struct lock_class_key stop_pi_lock;27992379 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };28002380 struct task_struct *old_stop = cpu_rq(cpu)->stop;28012381···28122390 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);2813239128142392 stop->sched_class = &stop_sched_class;23932393+23942394+ /*23952395+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to23962396+ * adjust the effective priority of a task. As a result,23972397+ * rt_mutex_setprio() can trigger (RT) balancing operations,23982398+ * which can then trigger wakeups of the stop thread to push23992399+ * around the current task.24002400+ *24012401+ * The stop task itself will never be part of the PI-chain, it24022402+ * never blocks, therefore that ->pi_lock recursion is safe.24032403+ * Tell lockdep about this by placing the stop->pi_lock in its24042404+ * own class.24052405+ */24062406+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);28152407 }2816240828172409 cpu_rq(cpu)->stop = stop;···28422406#else2843240728442408static inline int __set_cpus_allowed_ptr(struct task_struct *p,28452845- const struct cpumask *new_mask, bool check)24092409+ const struct cpumask *new_mask,24102410+ u32 flags)28462411{28472412 return set_cpus_allowed_ptr(p, new_mask);28482413}2849241428502415#endif /* CONFIG_SMP */24162416+24172417+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)24182418+24192419+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }24202420+24212421+static inline bool rq_has_pinned_tasks(struct rq *rq)24222422+{24232423+ return false;24242424+}24252425+24262426+#endif2851242728522428static void28532429ttwu_stat(struct task_struct *p, int cpu, int wake_flags)···35463098 init_numa_balancing(clone_flags, p);35473099#ifdef CONFIG_SMP35483100 p->wake_entry.u_flags = CSD_TYPE_TTWU;31013101+ p->migration_pending = NULL;35493102#endif35503103}35513104···39343485#endif39353486}3936348734883488+#ifdef 
CONFIG_SMP34893489+34903490+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)34913491+{34923492+ void (*func)(struct rq *rq);34933493+ struct callback_head *next;34943494+34953495+ lockdep_assert_held(&rq->lock);34963496+34973497+ while (head) {34983498+ func = (void (*)(struct rq *))head->func;34993499+ next = head->next;35003500+ head->next = NULL;35013501+ head = next;35023502+35033503+ func(rq);35043504+ }35053505+}35063506+35073507+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)35083508+{35093509+ struct callback_head *head = rq->balance_callback;35103510+35113511+ lockdep_assert_held(&rq->lock);35123512+ if (head) {35133513+ rq->balance_callback = NULL;35143514+ rq->balance_flags &= ~BALANCE_WORK;35153515+ }35163516+35173517+ return head;35183518+}35193519+35203520+static void __balance_callbacks(struct rq *rq)35213521+{35223522+ do_balance_callbacks(rq, splice_balance_callbacks(rq));35233523+}35243524+35253525+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)35263526+{35273527+ unsigned long flags;35283528+35293529+ if (unlikely(head)) {35303530+ raw_spin_lock_irqsave(&rq->lock, flags);35313531+ do_balance_callbacks(rq, head);35323532+ raw_spin_unlock_irqrestore(&rq->lock, flags);35333533+ }35343534+}35353535+35363536+static void balance_push(struct rq *rq);35373537+35383538+static inline void balance_switch(struct rq *rq)35393539+{35403540+ if (likely(!rq->balance_flags))35413541+ return;35423542+35433543+ if (rq->balance_flags & BALANCE_PUSH) {35443544+ balance_push(rq);35453545+ return;35463546+ }35473547+35483548+ __balance_callbacks(rq);35493549+}35503550+35513551+#else35523552+35533553+static inline void __balance_callbacks(struct rq *rq)35543554+{35553555+}35563556+35573557+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)35583558+{35593559+ return NULL;35603560+}35613561+35623562+static inline void balance_callbacks(struct rq *rq, struct 
callback_head *head)35633563+{35643564+}35653565+35663566+static inline void balance_switch(struct rq *rq)35673567+{35683568+}35693569+35703570+#endif35713571+39373572static inline void39383573prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)39393574{···40433510 * prev into current:40443511 */40453512 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);35133513+ balance_switch(rq);40463514 raw_spin_unlock_irq(&rq->lock);40473515}40483516···41853651 return rq;41863652}4187365341884188-#ifdef CONFIG_SMP41894189-41904190-/* rq->lock is NOT held, but preemption is disabled */41914191-static void __balance_callback(struct rq *rq)41924192-{41934193- struct callback_head *head, *next;41944194- void (*func)(struct rq *rq);41954195- unsigned long flags;41964196-41974197- raw_spin_lock_irqsave(&rq->lock, flags);41984198- head = rq->balance_callback;41994199- rq->balance_callback = NULL;42004200- while (head) {42014201- func = (void (*)(struct rq *))head->func;42024202- next = head->next;42034203- head->next = NULL;42044204- head = next;42054205-42064206- func(rq);42074207- }42084208- raw_spin_unlock_irqrestore(&rq->lock, flags);42094209-}42104210-42114211-static inline void balance_callback(struct rq *rq)42124212-{42134213- if (unlikely(rq->balance_callback))42144214- __balance_callback(rq);42154215-}42164216-42174217-#else42184218-42194219-static inline void balance_callback(struct rq *rq)42204220-{42214221-}42224222-42234223-#endif42244224-42253654/**42263655 * schedule_tail - first thing a freshly forked thread must call.42273656 * @prev: the thread we just switched away from.···42043707 */4205370842063709 rq = finish_task_switch(prev);42074207- balance_callback(rq);42083710 preempt_enable();4209371142103712 if (current->set_child_tid)···50114515 */50124516 ++*switch_count;5013451745184518+ migrate_disable_switch(rq, prev);50144519 psi_sched_switch(prev, next, !task_on_rq_queued(prev));5015452050164521 trace_sched_switch(preempt, prev, 
next);···50204523 rq = context_switch(rq, prev, next, &rf);50214524 } else {50224525 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);50235023- rq_unlock_irq(rq, &rf);50245024- }5025452650265026- balance_callback(rq);45274527+ rq_unpin_lock(rq, &rf);45284528+ __balance_callbacks(rq);45294529+ raw_spin_unlock_irq(&rq->lock);45304530+ }50274531}5028453250294533void __noreturn do_task_dead(void)···54354937out_unlock:54364938 /* Avoid rq from going away on us: */54374939 preempt_disable();54385438- __task_rq_unlock(rq, &rf);5439494054405440- balance_callback(rq);49414941+ rq_unpin_lock(rq, &rf);49424942+ __balance_callbacks(rq);49434943+ raw_spin_unlock(&rq->lock);49444944+54414945 preempt_enable();54424946}54434947#else···57135213 int retval, oldprio, oldpolicy = -1, queued, running;57145214 int new_effective_prio, policy = attr->sched_policy;57155215 const struct sched_class *prev_class;52165216+ struct callback_head *head;57165217 struct rq_flags rf;57175218 int reset_on_fork;57185219 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;···5952545159535452 /* Avoid rq from going away on us: */59545453 preempt_disable();54545454+ head = splice_balance_callbacks(rq);59555455 task_rq_unlock(rq, p, &rf);5956545659575457 if (pi) {···59615459 }5962546059635461 /* Run balance callbacks after we've adjusted the PI chain: */59645964- balance_callback(rq);54625462+ balance_callbacks(rq, head);59655463 preempt_enable();5966546459675465 return 0;···64565954 }64575955#endif64585956again:64596459- retval = __set_cpus_allowed_ptr(p, new_mask, true);59575957+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);6460595864615959 if (!retval) {64625960 cpuset_cpus_allowed(p, cpus_allowed);···69456443 (unsigned long)task_thread_info(p)->flags);6946644469476445 print_worker_info(KERN_INFO, p);64466446+ print_stop_info(KERN_INFO, p);69486447 show_stack(p, NULL, KERN_INFO);69496448 put_task_stack(p);69506449}···70366533 *70376534 * And since this is boot we can 
forgo the serialization.70386535 */70397039- set_cpus_allowed_common(idle, cpumask_of(cpu));65366536+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);70406537#endif70416538 /*70426539 * We're having a chicken and egg problem, even though we are···71876684 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */71886685}7189668671907190-/*71917191- * Since this CPU is going 'away' for a while, fold any nr_active delta71927192- * we might have. Assumes we're called after migrate_tasks() so that the71937193- * nr_active count is stable. We need to take the teardown thread which71947194- * is calling this into account, so we hand in adjust = 1 to the load71957195- * calculation.71967196- *71977197- * Also see the comment "Global load-average calculations".71987198- */71997199-static void calc_load_migrate(struct rq *rq)66876687+static int __balance_push_cpu_stop(void *arg)72006688{72017201- long delta = calc_load_fold_active(rq, 1);72027202- if (delta)72037203- atomic_long_add(delta, &calc_load_tasks);72047204-}66896689+ struct task_struct *p = arg;66906690+ struct rq *rq = this_rq();66916691+ struct rq_flags rf;66926692+ int cpu;7205669372067206-static struct task_struct *__pick_migrate_task(struct rq *rq)72077207-{72087208- const struct sched_class *class;72097209- struct task_struct *next;66946694+ raw_spin_lock_irq(&p->pi_lock);66956695+ rq_lock(rq, &rf);7210669672117211- for_each_class(class) {72127212- next = class->pick_next_task(rq);72137213- if (next) {72147214- next->sched_class->put_prev_task(rq, next);72157215- return next;72167216- }72177217- }72187218-72197219- /* The idle class should always have a runnable task */72207220- BUG();72217221-}72227222-72237223-/*72247224- * Migrate all tasks from the rq, sleeping tasks will be migrated by72257225- * try_to_wake_up()->select_task_rq().72267226- *72277227- * Called with rq->lock held even though we'er in stop_machine() and72287228- * there's no concurrency possible, we hold the required locks 
anyway72297229- * because of lock validation efforts.72307230- */72317231-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)72327232-{72337233- struct rq *rq = dead_rq;72347234- struct task_struct *next, *stop = rq->stop;72357235- struct rq_flags orf = *rf;72367236- int dest_cpu;72377237-72387238- /*72397239- * Fudge the rq selection such that the below task selection loop72407240- * doesn't get stuck on the currently eligible stop task.72417241- *72427242- * We're currently inside stop_machine() and the rq is either stuck72437243- * in the stop_machine_cpu_stop() loop, or we're executing this code,72447244- * either way we should never end up calling schedule() until we're72457245- * done here.72467246- */72477247- rq->stop = NULL;72487248-72497249- /*72507250- * put_prev_task() and pick_next_task() sched72517251- * class method both need to have an up-to-date72527252- * value of rq->clock[_task]72537253- */72546697 update_rq_clock(rq);7255669872567256- for (;;) {72577257- /*72587258- * There's this thread running, bail when that's the only72597259- * remaining thread:72607260- */72617261- if (rq->nr_running == 1)72627262- break;72637263-72647264- next = __pick_migrate_task(rq);72657265-72667266- /*72677267- * Rules for changing task_struct::cpus_mask are holding72687268- * both pi_lock and rq->lock, such that holding either72697269- * stabilizes the mask.72707270- *72717271- * Drop rq->lock is not quite as disastrous as it usually is72727272- * because !cpu_active at this point, which means load-balance72737273- * will not interfere. 
Also, stop-machine.72747274- */72757275- rq_unlock(rq, rf);72767276- raw_spin_lock(&next->pi_lock);72777277- rq_relock(rq, rf);72787278-72797279- /*72807280- * Since we're inside stop-machine, _nothing_ should have72817281- * changed the task, WARN if weird stuff happened, because in72827282- * that case the above rq->lock drop is a fail too.72837283- */72847284- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {72857285- raw_spin_unlock(&next->pi_lock);72867286- continue;72877287- }72887288-72897289- /* Find suitable destination for @next, with force if needed. */72907290- dest_cpu = select_fallback_rq(dead_rq->cpu, next);72917291- rq = __migrate_task(rq, rf, next, dest_cpu);72927292- if (rq != dead_rq) {72937293- rq_unlock(rq, rf);72947294- rq = dead_rq;72957295- *rf = orf;72967296- rq_relock(rq, rf);72977297- }72987298- raw_spin_unlock(&next->pi_lock);66996699+ if (task_rq(p) == rq && task_on_rq_queued(p)) {67006700+ cpu = select_fallback_rq(rq->cpu, p);67016701+ rq = __migrate_task(rq, &rf, p, cpu);72996702 }7300670373017301- rq->stop = stop;67046704+ rq_unlock(rq, &rf);67056705+ raw_spin_unlock_irq(&p->pi_lock);67066706+67076707+ put_task_struct(p);67086708+67096709+ return 0;73026710}67116711+67126712+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);67136713+67146714+/*67156715+ * Ensure we only run per-cpu kthreads once the CPU goes !active.67166716+ */67176717+static void balance_push(struct rq *rq)67186718+{67196719+ struct task_struct *push_task = rq->curr;67206720+67216721+ lockdep_assert_held(&rq->lock);67226722+ SCHED_WARN_ON(rq->cpu != smp_processor_id());67236723+67246724+ /*67256725+ * Both the cpu-hotplug and stop task are in this case and are67266726+ * required to complete the hotplug process.67276727+ */67286728+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {67296729+ /*67306730+ * If this is the idle task on the outgoing CPU try to wake67316731+ * up the hotplug control thread which might wait for 
the67326732+ * last task to vanish. The rcuwait_active() check is67336733+ * accurate here because the waiter is pinned on this CPU67346734+ * and can't obviously be running in parallel.67356735+ *67366736+ * On RT kernels this also has to check whether there are67376737+ * pinned and scheduled out tasks on the runqueue. They67386738+ * need to leave the migrate disabled section first.67396739+ */67406740+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&67416741+ rcuwait_active(&rq->hotplug_wait)) {67426742+ raw_spin_unlock(&rq->lock);67436743+ rcuwait_wake_up(&rq->hotplug_wait);67446744+ raw_spin_lock(&rq->lock);67456745+ }67466746+ return;67476747+ }67486748+67496749+ get_task_struct(push_task);67506750+ /*67516751+ * Temporarily drop rq->lock such that we can wake-up the stop task.67526752+ * Both preemption and IRQs are still disabled.67536753+ */67546754+ raw_spin_unlock(&rq->lock);67556755+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,67566756+ this_cpu_ptr(&push_work));67576757+ /*67586758+ * At this point need_resched() is true and we'll take the loop in67596759+ * schedule(). The next pick is obviously going to be the stop task67606760+ * which is_per_cpu_kthread() and will push this task away.67616761+ */67626762+ raw_spin_lock(&rq->lock);67636763+}67646764+67656765+static void balance_push_set(int cpu, bool on)67666766+{67676767+ struct rq *rq = cpu_rq(cpu);67686768+ struct rq_flags rf;67696769+67706770+ rq_lock_irqsave(rq, &rf);67716771+ if (on)67726772+ rq->balance_flags |= BALANCE_PUSH;67736773+ else67746774+ rq->balance_flags &= ~BALANCE_PUSH;67756775+ rq_unlock_irqrestore(rq, &rf);67766776+}67776777+67786778+/*67796779+ * Invoked from a CPUs hotplug control thread after the CPU has been marked67806780+ * inactive. All tasks which are not per CPU kernel threads are either67816781+ * pushed off this CPU now via balance_push() or placed on a different CPU67826782+ * during wakeup. 
Wait until the CPU is quiescent.67836783+ */67846784+static void balance_hotplug_wait(void)67856785+{67866786+ struct rq *rq = this_rq();67876787+67886788+ rcuwait_wait_event(&rq->hotplug_wait,67896789+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),67906790+ TASK_UNINTERRUPTIBLE);67916791+}67926792+67936793+#else67946794+67956795+static inline void balance_push(struct rq *rq)67966796+{67976797+}67986798+67996799+static inline void balance_push_set(int cpu, bool on)68006800+{68016801+}68026802+68036803+static inline void balance_hotplug_wait(void)68046804+{68056805+}68066806+73036807#endif /* CONFIG_HOTPLUG_CPU */7304680873056809void set_rq_online(struct rq *rq)···73926882 struct rq *rq = cpu_rq(cpu);73936883 struct rq_flags rf;7394688468856885+ balance_push_set(cpu, false);68866886+73956887#ifdef CONFIG_SCHED_SMT73966888 /*73976889 * When going up, increment the number of cores with SMT present.···7429691774306918int sched_cpu_deactivate(unsigned int cpu)74316919{69206920+ struct rq *rq = cpu_rq(cpu);69216921+ struct rq_flags rf;74326922 int ret;7433692374346924 set_cpu_active(cpu, false);···74426928 * Do sync before park smpboot threads to take care the rcu boost case.74436929 */74446930 synchronize_rcu();69316931+69326932+ balance_push_set(cpu, true);69336933+69346934+ rq_lock_irqsave(rq, &rf);69356935+ if (rq->rd) {69366936+ update_rq_clock(rq);69376937+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));69386938+ set_rq_offline(rq);69396939+ }69406940+ rq_unlock_irqrestore(rq, &rf);7445694174466942#ifdef CONFIG_SCHED_SMT74476943 /*···7466694274676943 ret = cpuset_cpu_inactive(cpu);74686944 if (ret) {69456945+ balance_push_set(cpu, false);74696946 set_cpu_active(cpu, true);74706947 return ret;74716948 }···74906965}7491696674926967#ifdef CONFIG_HOTPLUG_CPU69686968+69696969+/*69706970+ * Invoked immediately before the stopper thread is invoked to bring the69716971+ * CPU down completely. 
At this point all per CPU kthreads except the69726972+ * hotplug thread (current) and the stopper thread (inactive) have been69736973+ * either parked or have been unbound from the outgoing CPU. Ensure that69746974+ * any of those which might be on the way out are gone.69756975+ *69766976+ * If after this point a bound task is being woken on this CPU then the69776977+ * responsible hotplug callback has failed to do it's job.69786978+ * sched_cpu_dying() will catch it with the appropriate fireworks.69796979+ */69806980+int sched_cpu_wait_empty(unsigned int cpu)69816981+{69826982+ balance_hotplug_wait();69836983+ return 0;69846984+}69856985+69866986+/*69876987+ * Since this CPU is going 'away' for a while, fold any nr_active delta we69886988+ * might have. Called from the CPU stopper task after ensuring that the69896989+ * stopper is the last running task on the CPU, so nr_active count is69906990+ * stable. We need to take the teardown thread which is calling this into69916991+ * account, so we hand in adjust = 1 to the load calculation.69926992+ *69936993+ * Also see the comment "Global load-average calculations".69946994+ */69956995+static void calc_load_migrate(struct rq *rq)69966996+{69976997+ long delta = calc_load_fold_active(rq, 1);69986998+69996999+ if (delta)70007000+ atomic_long_add(delta, &calc_load_tasks);70017001+}70027002+74937003int sched_cpu_dying(unsigned int cpu)74947004{74957005 struct rq *rq = cpu_rq(cpu);···75346974 sched_tick_stop(cpu);7535697575366976 rq_lock_irqsave(rq, &rf);75377537- if (rq->rd) {75387538- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));75397539- set_rq_offline(rq);75407540- }75417541- migrate_tasks(rq, &rf);75427542- BUG_ON(rq->nr_running != 1);69776977+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));75436978 rq_unlock_irqrestore(rq, &rf);7544697975456980 calc_load_migrate(rq);···77407185 atomic_set(&rq->nohz_flags, 0);7741718677427187 rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);71887188+#endif71897189+#ifdef 
CONFIG_HOTPLUG_CPU71907190+ rcuwait_init(&rq->hotplug_wait);77437191#endif77447192#endif /* CONFIG_SMP */77457193 hrtick_rq_init(rq);
+2-2
kernel/sched/cpudeadline.c
···120120 const struct sched_dl_entity *dl_se = &p->dl;121121122122 if (later_mask &&123123- cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {123123+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {124124 unsigned long cap, max_cap = 0;125125 int cpu, max_cpu = -1;126126···151151152152 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));153153154154- if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&154154+ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&155155 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {156156 if (later_mask)157157 cpumask_set_cpu(best_cpu, later_mask);
+2-2
kernel/sched/cpupri.c
···9797 if (skip)9898 return 0;9999100100- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)100100+ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)101101 return 0;102102103103 if (lowest_mask) {104104- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);104104+ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);105105106106 /*107107 * We have to ensure that we have at least one bit
+31-15
kernel/sched/deadline.c
···559559560560static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)561561{562562- return dl_task(prev);562562+ return rq->online && dl_task(prev);563563}564564565565static DEFINE_PER_CPU(struct callback_head, dl_push_head);···19311931static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)19321932{19331933 if (!task_running(rq, p) &&19341934- cpumask_test_cpu(cpu, p->cpus_ptr))19341934+ cpumask_test_cpu(cpu, &p->cpus_mask))19351935 return 1;19361936 return 0;19371937}···20212021 return this_cpu;20222022 }2023202320242024- best_cpu = cpumask_first_and(later_mask,20252025- sched_domain_span(sd));20242024+ best_cpu = cpumask_any_and_distribute(later_mask,20252025+ sched_domain_span(sd));20262026 /*20272027 * Last chance: if a CPU being in both later_mask20282028 * and current sd span is valid, that becomes our···20442044 if (this_cpu != -1)20452045 return this_cpu;2046204620472047- cpu = cpumask_any(later_mask);20472047+ cpu = cpumask_any_distribute(later_mask);20482048 if (cpu < nr_cpu_ids)20492049 return cpu;20502050···20812081 /* Retry if something changed. 
*/20822082 if (double_lock_balance(rq, later_rq)) {20832083 if (unlikely(task_rq(task) != rq ||20842084- !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||20842084+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||20852085 task_running(rq, task) ||20862086 !dl_task(task) ||20872087 !task_on_rq_queued(task))) {···21482148 return 0;2149214921502150retry:21512151+ if (is_migration_disabled(next_task))21522152+ return 0;21532153+21512154 if (WARN_ON(next_task == rq->curr))21522155 return 0;21532156···22282225static void pull_dl_task(struct rq *this_rq)22292226{22302227 int this_cpu = this_rq->cpu, cpu;22312231- struct task_struct *p;22282228+ struct task_struct *p, *push_task;22322229 bool resched = false;22332230 struct rq *src_rq;22342231 u64 dmin = LONG_MAX;···22582255 continue;2259225622602257 /* Might drop this_rq->lock */22582258+ push_task = NULL;22612259 double_lock_balance(this_rq, src_rq);2262226022632261 /*···22902286 src_rq->curr->dl.deadline))22912287 goto skip;2292228822932293- resched = true;22942294-22952295- deactivate_task(src_rq, p, 0);22962296- set_task_cpu(p, this_cpu);22972297- activate_task(this_rq, p, 0);22982298- dmin = p->dl.deadline;22892289+ if (is_migration_disabled(p)) {22902290+ push_task = get_push_task(src_rq);22912291+ } else {22922292+ deactivate_task(src_rq, p, 0);22932293+ set_task_cpu(p, this_cpu);22942294+ activate_task(this_rq, p, 0);22952295+ dmin = p->dl.deadline;22962296+ resched = true;22972297+ }2299229823002299 /* Is there any other task even earlier? 
*/23012300 }23022301skip:23032302 double_unlock_balance(this_rq, src_rq);23032303+23042304+ if (push_task) {23052305+ raw_spin_unlock(&this_rq->lock);23062306+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,23072307+ push_task, &src_rq->push_work);23082308+ raw_spin_lock(&this_rq->lock);23092309+ }23042310 }2305231123062312 if (resched)···23342320}2335232123362322static void set_cpus_allowed_dl(struct task_struct *p,23372337- const struct cpumask *new_mask)23232323+ const struct cpumask *new_mask,23242324+ u32 flags)23382325{23392326 struct root_domain *src_rd;23402327 struct rq *rq;···23642349 raw_spin_unlock(&src_dl_b->lock);23652350 }2366235123672367- set_cpus_allowed_common(p, new_mask);23522352+ set_cpus_allowed_common(p, new_mask, flags);23682353}2369235423702355/* Assumes rq->lock is held */···25572542 .rq_online = rq_online_dl,25582543 .rq_offline = rq_offline_dl,25592544 .task_woken = task_woken_dl,25452545+ .find_lock_rq = find_lock_later_rq,25602546#endif2561254725622548 .task_tick = task_tick_dl,
+57-18
kernel/sched/rt.c
···265265static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)266266{267267 /* Try to pull RT tasks here if we lower this rq's prio */268268- return rq->rt.highest_prio.curr > prev->prio;268268+ return rq->online && rq->rt.highest_prio.curr > prev->prio;269269}270270271271static inline int rt_overloaded(struct rq *rq)···16601660static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)16611661{16621662 if (!task_running(rq, p) &&16631663- cpumask_test_cpu(cpu, p->cpus_ptr))16631663+ cpumask_test_cpu(cpu, &p->cpus_mask))16641664 return 1;1665166516661666 return 0;···17541754 return this_cpu;17551755 }1756175617571757- best_cpu = cpumask_first_and(lowest_mask,17581758- sched_domain_span(sd));17571757+ best_cpu = cpumask_any_and_distribute(lowest_mask,17581758+ sched_domain_span(sd));17591759 if (best_cpu < nr_cpu_ids) {17601760 rcu_read_unlock();17611761 return best_cpu;···17721772 if (this_cpu != -1)17731773 return this_cpu;1774177417751775- cpu = cpumask_any(lowest_mask);17751775+ cpu = cpumask_any_distribute(lowest_mask);17761776 if (cpu < nr_cpu_ids)17771777 return cpu;17781778···18131813 * Also make sure that it wasn't scheduled on its rq.18141814 */18151815 if (unlikely(task_rq(task) != rq ||18161816- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||18161816+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||18171817 task_running(rq, task) ||18181818 !rt_task(task) ||18191819 !task_on_rq_queued(task))) {···18611861 * running task can migrate over to a CPU that is running a task18621862 * of lesser priority.18631863 */18641864-static int push_rt_task(struct rq *rq)18641864+static int push_rt_task(struct rq *rq, bool pull)18651865{18661866 struct task_struct *next_task;18671867 struct rq *lowest_rq;···18751875 return 0;1876187618771877retry:18781878+ if (is_migration_disabled(next_task)) {18791879+ struct task_struct *push_task = NULL;18801880+ int cpu;18811881+18821882+ if (!pull || rq->push_busy)18831883+ return 
0;18841884+18851885+ cpu = find_lowest_rq(rq->curr);18861886+ if (cpu == -1 || cpu == rq->cpu)18871887+ return 0;18881888+18891889+ /*18901890+ * Given we found a CPU with lower priority than @next_task,18911891+ * therefore it should be running. However we cannot migrate it18921892+ * to this other CPU, instead attempt to push the current18931893+ * running task on this CPU away.18941894+ */18951895+ push_task = get_push_task(rq);18961896+ if (push_task) {18971897+ raw_spin_unlock(&rq->lock);18981898+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,18991899+ push_task, &rq->push_work);19001900+ raw_spin_lock(&rq->lock);19011901+ }19021902+19031903+ return 0;19041904+ }19051905+18781906 if (WARN_ON(next_task == rq->curr))18791907 return 0;18801908···19571929 deactivate_task(rq, next_task, 0);19581930 set_task_cpu(next_task, lowest_rq->cpu);19591931 activate_task(lowest_rq, next_task, 0);19321932+ resched_curr(lowest_rq);19601933 ret = 1;1961193419621962- resched_curr(lowest_rq);19631963-19641935 double_unlock_balance(rq, lowest_rq);19651965-19661936out:19671937 put_task_struct(next_task);19681938···19701944static void push_rt_tasks(struct rq *rq)19711945{19721946 /* push_rt_task will return true if it moved an RT */19731973- while (push_rt_task(rq))19471947+ while (push_rt_task(rq, false))19741948 ;19751949}19761950···21232097 */21242098 if (has_pushable_tasks(rq)) {21252099 raw_spin_lock(&rq->lock);21262126- push_rt_tasks(rq);21002100+ while (push_rt_task(rq, true))21012101+ ;21272102 raw_spin_unlock(&rq->lock);21282103 }21292104···21492122{21502123 int this_cpu = this_rq->cpu, cpu;21512124 bool resched = false;21522152- struct task_struct *p;21252125+ struct task_struct *p, *push_task;21532126 struct rq *src_rq;21542127 int rt_overload_count = rt_overloaded(this_rq);21552128···21962169 * double_lock_balance, and another CPU could21972170 * alter this_rq21982171 */21722172+ push_task = NULL;21992173 double_lock_balance(this_rq, src_rq);2200217422012175 /*···22242196 if 
(p->prio < src_rq->curr->prio)22252197 goto skip;2226219822272227- resched = true;22282228-22292229- deactivate_task(src_rq, p, 0);22302230- set_task_cpu(p, this_cpu);22312231- activate_task(this_rq, p, 0);21992199+ if (is_migration_disabled(p)) {22002200+ push_task = get_push_task(src_rq);22012201+ } else {22022202+ deactivate_task(src_rq, p, 0);22032203+ set_task_cpu(p, this_cpu);22042204+ activate_task(this_rq, p, 0);22052205+ resched = true;22062206+ }22322207 /*22332208 * We continue with the search, just in22342209 * case there's an even higher prio task···22412210 }22422211skip:22432212 double_unlock_balance(this_rq, src_rq);22132213+22142214+ if (push_task) {22152215+ raw_spin_unlock(&this_rq->lock);22162216+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,22172217+ push_task, &src_rq->push_work);22182218+ raw_spin_lock(&this_rq->lock);22192219+ }22442220 }2245222122462222 if (resched)···24892451 .rq_offline = rq_offline_rt,24902452 .task_woken = task_woken_rt,24912453 .switched_from = switched_from_rt,24542454+ .find_lock_rq = find_lock_lowest_rq,24922455#endif2493245624942457 .task_tick = task_tick_rt,
···2626 if (current->nr_cpus_allowed == 1)2727 goto out;28282929+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)3030+ if (current->migration_disabled)3131+ goto out;3232+#endif3333+2934 /*3035 * It is valid to assume CPU-locality during early bootup:3136 */