Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched/migrate-disable'

+1056 -245
+2 -2
fs/proc/array.c
··· 382 382 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 383 383 { 384 384 seq_printf(m, "Cpus_allowed:\t%*pb\n", 385 - cpumask_pr_args(task->cpus_ptr)); 385 + cpumask_pr_args(&task->cpus_mask)); 386 386 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", 387 - cpumask_pr_args(task->cpus_ptr)); 387 + cpumask_pr_args(&task->cpus_mask)); 388 388 } 389 389 390 390 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+1
include/linux/cpuhotplug.h
··· 152 152 CPUHP_AP_ONLINE, 153 153 CPUHP_TEARDOWN_CPU, 154 154 CPUHP_AP_ONLINE_IDLE, 155 + CPUHP_AP_SCHED_WAIT_EMPTY, 155 156 CPUHP_AP_SMPBOOT_THREADS, 156 157 CPUHP_AP_X86_VDSO_VMA_ONLINE, 157 158 CPUHP_AP_IRQ_AFFINITY_ONLINE,
+6
include/linux/cpumask.h
··· 199 199 return cpumask_next_and(-1, src1p, src2p); 200 200 } 201 201 202 + static inline int cpumask_any_distribute(const struct cpumask *srcp) 203 + { 204 + return cpumask_first(srcp); 205 + } 206 + 202 207 #define for_each_cpu(cpu, mask) \ 203 208 for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) 204 209 #define for_each_cpu_not(cpu, mask) \ ··· 257 252 unsigned int cpumask_local_spread(unsigned int i, int node); 258 253 int cpumask_any_and_distribute(const struct cpumask *src1p, 259 254 const struct cpumask *src2p); 255 + int cpumask_any_distribute(const struct cpumask *srcp); 260 256 261 257 /** 262 258 * for_each_cpu - iterate over every cpu in a mask
+69
include/linux/preempt.h
··· 322 322 323 323 #endif 324 324 325 + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) 326 + 327 + /* 328 + * Migrate-Disable and why it is undesired. 329 + * 330 + * When a preempted task becomes eligible to run under the ideal model (IOW it 331 + * becomes one of the M highest priority tasks), it might still have to wait 332 + * for the preemptee's migrate_disable() section to complete. Thereby suffering 333 + * a reduction in bandwidth in the exact duration of the migrate_disable() 334 + * section. 335 + * 336 + * Per this argument, the change from preempt_disable() to migrate_disable() 337 + * gets us: 338 + * 339 + * - a higher priority task gains reduced wake-up latency; with preempt_disable() 340 + * it would have had to wait for the lower priority task. 341 + * 342 + * - a lower priority task; which under preempt_disable() could've instantly 343 + * migrated away when another CPU becomes available, is now constrained 344 + * by the ability to push the higher priority task away, which might itself be 345 + * in a migrate_disable() section, reducing its available bandwidth. 346 + * 347 + * IOW it trades latency / moves the interference term, but it stays in the 348 + * system, and as long as it remains unbounded, the system is not fully 349 + * deterministic. 350 + * 351 + * 352 + * The reason we have it anyway. 353 + * 354 + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a 355 + * number of primitives into becoming preemptible, they would also allow 356 + * migration. This turns out to break a bunch of per-cpu usage. To this end, 357 + * all these primitives employ migrate_disable() to restore this implicit 358 + * assumption. 359 + * 360 + * This is a 'temporary' work-around at best. The correct solution is getting 361 + * rid of the above assumptions and reworking the code to employ explicit 362 + * per-cpu locking or short preempt-disable regions. 
363 + * 364 + * The end goal must be to get rid of migrate_disable(), alternatively we need 365 + * a schedulability theory that does not depend on arbitrary migration. 366 + * 367 + * 368 + * Notes on the implementation. 369 + * 370 + * The implementation is particularly tricky since existing code patterns 371 + * dictate neither migrate_disable() nor migrate_enable() is allowed to block. 372 + * This means that it cannot use cpus_read_lock() to serialize against hotplug, 373 + * nor can it easily migrate itself into a pending affinity mask change on 374 + * migrate_enable(). 375 + * 376 + * 377 + * Note: even non-work-conserving schedulers like semi-partitioned depend on 378 + * migration, so migrate_disable() is not only a problem for 379 + * work-conserving schedulers. 380 + * 381 + */ 382 + extern void migrate_disable(void); 383 + extern void migrate_enable(void); 384 + 385 + #elif defined(CONFIG_PREEMPT_RT) 386 + 387 + static inline void migrate_disable(void) { } 388 + static inline void migrate_enable(void) { } 389 + 390 + #else /* !CONFIG_PREEMPT_RT */ 391 + 325 392 /** 326 393 * migrate_disable - Prevent migration of the current task 327 394 * ··· 418 351 { 419 352 preempt_enable(); 420 353 } 354 + 355 + #endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ 421 356 422 357 #endif /* __LINUX_PREEMPT_H */
+5
include/linux/sched.h
··· 714 714 int nr_cpus_allowed; 715 715 const cpumask_t *cpus_ptr; 716 716 cpumask_t cpus_mask; 717 + void *migration_pending; 718 + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) 719 + unsigned short migration_disabled; 720 + #endif 721 + unsigned short migration_flags; 717 722 718 723 #ifdef CONFIG_PREEMPT_RCU 719 724 int rcu_read_lock_nesting;
+2
include/linux/sched/hotplug.h
··· 11 11 extern int sched_cpu_deactivate(unsigned int cpu); 12 12 13 13 #ifdef CONFIG_HOTPLUG_CPU 14 + extern int sched_cpu_wait_empty(unsigned int cpu); 14 15 extern int sched_cpu_dying(unsigned int cpu); 15 16 #else 17 + # define sched_cpu_wait_empty NULL 16 18 # define sched_cpu_dying NULL 17 19 #endif 18 20
+5
include/linux/stop_machine.h
··· 24 24 struct cpu_stop_work { 25 25 struct list_head list; /* cpu_stopper->works */ 26 26 cpu_stop_fn_t fn; 27 + unsigned long caller; 27 28 void *arg; 28 29 struct cpu_stop_done *done; 29 30 }; ··· 36 35 void stop_machine_park(int cpu); 37 36 void stop_machine_unpark(int cpu); 38 37 void stop_machine_yield(const struct cpumask *cpumask); 38 + 39 + extern void print_stop_info(const char *log_lvl, struct task_struct *task); 39 40 40 41 #else /* CONFIG_SMP */ 41 42 ··· 82 79 83 80 return false; 84 81 } 82 + 83 + static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } 85 84 86 85 #endif /* CONFIG_SMP */ 87 86
+8 -1
kernel/cpu.c
··· 1602 1602 .name = "ap:online", 1603 1603 }, 1604 1604 /* 1605 - * Handled on controll processor until the plugged processor manages 1605 + * Handled on control processor until the plugged processor manages 1606 1606 * this itself. 1607 1607 */ 1608 1608 [CPUHP_TEARDOWN_CPU] = { ··· 1611 1611 .teardown.single = takedown_cpu, 1612 1612 .cant_stop = true, 1613 1613 }, 1614 + 1615 + [CPUHP_AP_SCHED_WAIT_EMPTY] = { 1616 + .name = "sched:waitempty", 1617 + .startup.single = NULL, 1618 + .teardown.single = sched_cpu_wait_empty, 1619 + }, 1620 + 1614 1621 /* Handle smpboot threads park/unpark */ 1615 1622 [CPUHP_AP_SMPBOOT_THREADS] = { 1616 1623 .name = "smpboot/threads:online",
+757 -199
kernel/sched/core.c
··· 1696 1696 1697 1697 #ifdef CONFIG_SMP 1698 1698 1699 + #ifdef CONFIG_PREEMPT_RT 1700 + 1701 + static void 1702 + __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); 1703 + 1704 + static int __set_cpus_allowed_ptr(struct task_struct *p, 1705 + const struct cpumask *new_mask, 1706 + u32 flags); 1707 + 1708 + static void migrate_disable_switch(struct rq *rq, struct task_struct *p) 1709 + { 1710 + if (likely(!p->migration_disabled)) 1711 + return; 1712 + 1713 + if (p->cpus_ptr != &p->cpus_mask) 1714 + return; 1715 + 1716 + /* 1717 + * Violates locking rules! see comment in __do_set_cpus_allowed(). 1718 + */ 1719 + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); 1720 + } 1721 + 1722 + void migrate_disable(void) 1723 + { 1724 + struct task_struct *p = current; 1725 + 1726 + if (p->migration_disabled) { 1727 + p->migration_disabled++; 1728 + return; 1729 + } 1730 + 1731 + preempt_disable(); 1732 + this_rq()->nr_pinned++; 1733 + p->migration_disabled = 1; 1734 + preempt_enable(); 1735 + } 1736 + EXPORT_SYMBOL_GPL(migrate_disable); 1737 + 1738 + void migrate_enable(void) 1739 + { 1740 + struct task_struct *p = current; 1741 + 1742 + if (p->migration_disabled > 1) { 1743 + p->migration_disabled--; 1744 + return; 1745 + } 1746 + 1747 + /* 1748 + * Ensure stop_task runs either before or after this, and that 1749 + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). 1750 + */ 1751 + preempt_disable(); 1752 + if (p->cpus_ptr != &p->cpus_mask) 1753 + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); 1754 + /* 1755 + * Mustn't clear migration_disabled() until cpus_ptr points back at the 1756 + * regular cpus_mask, otherwise things that race (eg. 1757 + * select_fallback_rq) get confused. 
1758 + */ 1759 + barrier(); 1760 + p->migration_disabled = 0; 1761 + this_rq()->nr_pinned--; 1762 + preempt_enable(); 1763 + } 1764 + EXPORT_SYMBOL_GPL(migrate_enable); 1765 + 1766 + static inline bool rq_has_pinned_tasks(struct rq *rq) 1767 + { 1768 + return rq->nr_pinned; 1769 + } 1770 + 1771 + #endif 1772 + 1699 1773 /* 1700 1774 * Per-CPU kthreads are allowed to run on !active && online CPUs, see 1701 1775 * __set_cpus_allowed_ptr() and select_fallback_rq(). ··· 1779 1705 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1780 1706 return false; 1781 1707 1782 - if (is_per_cpu_kthread(p)) 1708 + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) 1783 1709 return cpu_online(cpu); 1784 1710 1785 1711 return cpu_active(cpu); ··· 1824 1750 } 1825 1751 1826 1752 struct migration_arg { 1827 - struct task_struct *task; 1828 - int dest_cpu; 1753 + struct task_struct *task; 1754 + int dest_cpu; 1755 + struct set_affinity_pending *pending; 1756 + }; 1757 + 1758 + struct set_affinity_pending { 1759 + refcount_t refs; 1760 + struct completion done; 1761 + struct cpu_stop_work stop_work; 1762 + struct migration_arg arg; 1829 1763 }; 1830 1764 1831 1765 /* ··· 1865 1783 */ 1866 1784 static int migration_cpu_stop(void *data) 1867 1785 { 1786 + struct set_affinity_pending *pending; 1868 1787 struct migration_arg *arg = data; 1869 1788 struct task_struct *p = arg->task; 1789 + int dest_cpu = arg->dest_cpu; 1870 1790 struct rq *rq = this_rq(); 1791 + bool complete = false; 1871 1792 struct rq_flags rf; 1872 1793 1873 1794 /* 1874 1795 * The original target CPU might have gone down and we might 1875 1796 * be on another CPU but it doesn't matter. 
1876 1797 */ 1877 - local_irq_disable(); 1798 + local_irq_save(rf.flags); 1878 1799 /* 1879 1800 * We need to explicitly wake pending tasks before running 1880 1801 * __migrate_task() such that we will not miss enforcing cpus_ptr ··· 1887 1802 1888 1803 raw_spin_lock(&p->pi_lock); 1889 1804 rq_lock(rq, &rf); 1805 + 1806 + pending = p->migration_pending; 1890 1807 /* 1891 1808 * If task_rq(p) != rq, it cannot be migrated here, because we're 1892 1809 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1893 1810 * we're holding p->pi_lock. 1894 1811 */ 1895 1812 if (task_rq(p) == rq) { 1896 - if (task_on_rq_queued(p)) 1897 - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); 1898 - else 1899 - p->wake_cpu = arg->dest_cpu; 1900 - } 1901 - rq_unlock(rq, &rf); 1902 - raw_spin_unlock(&p->pi_lock); 1813 + if (is_migration_disabled(p)) 1814 + goto out; 1903 1815 1904 - local_irq_enable(); 1816 + if (pending) { 1817 + p->migration_pending = NULL; 1818 + complete = true; 1819 + } 1820 + 1821 + /* migrate_enable() -- we must not race against SCA */ 1822 + if (dest_cpu < 0) { 1823 + /* 1824 + * When this was migrate_enable() but we no longer 1825 + * have a @pending, a concurrent SCA 'fixed' things 1826 + * and we should be valid again. Nothing to do. 1827 + */ 1828 + if (!pending) { 1829 + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); 1830 + goto out; 1831 + } 1832 + 1833 + dest_cpu = cpumask_any_distribute(&p->cpus_mask); 1834 + } 1835 + 1836 + if (task_on_rq_queued(p)) 1837 + rq = __migrate_task(rq, &rf, p, dest_cpu); 1838 + else 1839 + p->wake_cpu = dest_cpu; 1840 + 1841 + } else if (dest_cpu < 0) { 1842 + /* 1843 + * This happens when we get migrated between migrate_enable()'s 1844 + * preempt_enable() and scheduling the stopper task. At that 1845 + * point we're a regular task again and not current anymore. 1846 + * 1847 + * A !PREEMPT kernel has a giant hole here, which makes it far 1848 + * more likely. 
1849 + */ 1850 + 1851 + /* 1852 + * When this was migrate_enable() but we no longer have an 1853 + * @pending, a concurrent SCA 'fixed' things and we should be 1854 + * valid again. Nothing to do. 1855 + */ 1856 + if (!pending) { 1857 + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); 1858 + goto out; 1859 + } 1860 + 1861 + /* 1862 + * When migrate_enable() hits a rq mis-match we can't reliably 1863 + * determine is_migration_disabled() and so have to chase after 1864 + * it. 1865 + */ 1866 + task_rq_unlock(rq, p, &rf); 1867 + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, 1868 + &pending->arg, &pending->stop_work); 1869 + return 0; 1870 + } 1871 + out: 1872 + task_rq_unlock(rq, p, &rf); 1873 + 1874 + if (complete) 1875 + complete_all(&pending->done); 1876 + 1877 + /* For pending->{arg,stop_work} */ 1878 + pending = arg->pending; 1879 + if (pending && refcount_dec_and_test(&pending->refs)) 1880 + wake_up_var(&pending->refs); 1881 + 1882 + return 0; 1883 + } 1884 + 1885 + int push_cpu_stop(void *arg) 1886 + { 1887 + struct rq *lowest_rq = NULL, *rq = this_rq(); 1888 + struct task_struct *p = arg; 1889 + 1890 + raw_spin_lock_irq(&p->pi_lock); 1891 + raw_spin_lock(&rq->lock); 1892 + 1893 + if (task_rq(p) != rq) 1894 + goto out_unlock; 1895 + 1896 + if (is_migration_disabled(p)) { 1897 + p->migration_flags |= MDF_PUSH; 1898 + goto out_unlock; 1899 + } 1900 + 1901 + p->migration_flags &= ~MDF_PUSH; 1902 + 1903 + if (p->sched_class->find_lock_rq) 1904 + lowest_rq = p->sched_class->find_lock_rq(p, rq); 1905 + 1906 + if (!lowest_rq) 1907 + goto out_unlock; 1908 + 1909 + // XXX validate p is still the highest prio task 1910 + if (task_rq(p) == rq) { 1911 + deactivate_task(rq, p, 0); 1912 + set_task_cpu(p, lowest_rq->cpu); 1913 + activate_task(lowest_rq, p, 0); 1914 + resched_curr(lowest_rq); 1915 + } 1916 + 1917 + double_unlock_balance(rq, lowest_rq); 1918 + 1919 + out_unlock: 1920 + rq->push_busy = false; 1921 + raw_spin_unlock(&rq->lock); 1922 + 
raw_spin_unlock_irq(&p->pi_lock); 1923 + 1924 + put_task_struct(p); 1905 1925 return 0; 1906 1926 } 1907 1927 ··· 2014 1824 * sched_class::set_cpus_allowed must do the below, but is not required to 2015 1825 * actually call this function. 2016 1826 */ 2017 - void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1827 + void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) 2018 1828 { 1829 + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { 1830 + p->cpus_ptr = new_mask; 1831 + return; 1832 + } 1833 + 2019 1834 cpumask_copy(&p->cpus_mask, new_mask); 2020 1835 p->nr_cpus_allowed = cpumask_weight(new_mask); 2021 1836 } 2022 1837 2023 - void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1838 + static void 1839 + __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) 2024 1840 { 2025 1841 struct rq *rq = task_rq(p); 2026 1842 bool queued, running; 2027 1843 2028 - lockdep_assert_held(&p->pi_lock); 1844 + /* 1845 + * This here violates the locking rules for affinity, since we're only 1846 + * supposed to change these variables while holding both rq->lock and 1847 + * p->pi_lock. 1848 + * 1849 + * HOWEVER, it magically works, because ttwu() is the only code that 1850 + * accesses these variables under p->pi_lock and only does so after 1851 + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() 1852 + * before finish_task(). 1853 + * 1854 + * XXX do further audits, this smells like something putrid. 
1855 + */ 1856 + if (flags & SCA_MIGRATE_DISABLE) 1857 + SCHED_WARN_ON(!p->on_cpu); 1858 + else 1859 + lockdep_assert_held(&p->pi_lock); 2029 1860 2030 1861 queued = task_on_rq_queued(p); 2031 1862 running = task_current(rq, p); ··· 2062 1851 if (running) 2063 1852 put_prev_task(rq, p); 2064 1853 2065 - p->sched_class->set_cpus_allowed(p, new_mask); 1854 + p->sched_class->set_cpus_allowed(p, new_mask, flags); 2066 1855 2067 1856 if (queued) 2068 1857 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 2069 1858 if (running) 2070 1859 set_next_task(rq, p); 1860 + } 1861 + 1862 + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1863 + { 1864 + __do_set_cpus_allowed(p, new_mask, 0); 1865 + } 1866 + 1867 + /* 1868 + * This function is wildly self concurrent; here be dragons. 1869 + * 1870 + * 1871 + * When given a valid mask, __set_cpus_allowed_ptr() must block until the 1872 + * designated task is enqueued on an allowed CPU. If that task is currently 1873 + * running, we have to kick it out using the CPU stopper. 1874 + * 1875 + * Migrate-Disable comes along and tramples all over our nice sandcastle. 1876 + * Consider: 1877 + * 1878 + * Initial conditions: P0->cpus_mask = [0, 1] 1879 + * 1880 + * P0@CPU0 P1 1881 + * 1882 + * migrate_disable(); 1883 + * <preempted> 1884 + * set_cpus_allowed_ptr(P0, [1]); 1885 + * 1886 + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes 1887 + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). 1888 + * This means we need the following scheme: 1889 + * 1890 + * P0@CPU0 P1 1891 + * 1892 + * migrate_disable(); 1893 + * <preempted> 1894 + * set_cpus_allowed_ptr(P0, [1]); 1895 + * <blocks> 1896 + * <resumes> 1897 + * migrate_enable(); 1898 + * __set_cpus_allowed_ptr(); 1899 + * <wakes local stopper> 1900 + * `--> <woken on migration completion> 1901 + * 1902 + * Now the fun stuff: there may be several P1-like tasks, i.e. 
multiple 1903 + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any 1904 + * task p are serialized by p->pi_lock, which we can leverage: the one that 1905 + * should come into effect at the end of the Migrate-Disable region is the last 1906 + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), 1907 + * but we still need to properly signal those waiting tasks at the appropriate 1908 + * moment. 1909 + * 1910 + * This is implemented using struct set_affinity_pending. The first 1911 + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will 1912 + * setup an instance of that struct and install it on the targeted task_struct. 1913 + * Any and all further callers will reuse that instance. Those then wait for 1914 + * a completion signaled at the tail of the CPU stopper callback (1), triggered 1915 + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). 1916 + * 1917 + * 1918 + * (1) In the cases covered above. There is one more where the completion is 1919 + * signaled within affine_move_task() itself: when a subsequent affinity request 1920 + * cancels the need for an active migration. Consider: 1921 + * 1922 + * Initial conditions: P0->cpus_mask = [0, 1] 1923 + * 1924 + * P0@CPU0 P1 P2 1925 + * 1926 + * migrate_disable(); 1927 + * <preempted> 1928 + * set_cpus_allowed_ptr(P0, [1]); 1929 + * <blocks> 1930 + * set_cpus_allowed_ptr(P0, [0, 1]); 1931 + * <signal completion> 1932 + * <awakes> 1933 + * 1934 + * Note that the above is safe vs a concurrent migrate_enable(), as any 1935 + * pending affinity completion is preceded by an uninstallation of 1936 + * p->migration_pending done with p->pi_lock held. 
1937 + */ 1938 + static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, 1939 + int dest_cpu, unsigned int flags) 1940 + { 1941 + struct set_affinity_pending my_pending = { }, *pending = NULL; 1942 + struct migration_arg arg = { 1943 + .task = p, 1944 + .dest_cpu = dest_cpu, 1945 + }; 1946 + bool complete = false; 1947 + 1948 + /* Can the task run on the task's current CPU? If so, we're done */ 1949 + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { 1950 + struct task_struct *push_task = NULL; 1951 + 1952 + if ((flags & SCA_MIGRATE_ENABLE) && 1953 + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { 1954 + rq->push_busy = true; 1955 + push_task = get_task_struct(p); 1956 + } 1957 + 1958 + pending = p->migration_pending; 1959 + if (pending) { 1960 + refcount_inc(&pending->refs); 1961 + p->migration_pending = NULL; 1962 + complete = true; 1963 + } 1964 + task_rq_unlock(rq, p, rf); 1965 + 1966 + if (push_task) { 1967 + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 1968 + p, &rq->push_work); 1969 + } 1970 + 1971 + if (complete) 1972 + goto do_complete; 1973 + 1974 + return 0; 1975 + } 1976 + 1977 + if (!(flags & SCA_MIGRATE_ENABLE)) { 1978 + /* serialized by p->pi_lock */ 1979 + if (!p->migration_pending) { 1980 + /* Install the request */ 1981 + refcount_set(&my_pending.refs, 1); 1982 + init_completion(&my_pending.done); 1983 + p->migration_pending = &my_pending; 1984 + } else { 1985 + pending = p->migration_pending; 1986 + refcount_inc(&pending->refs); 1987 + } 1988 + } 1989 + pending = p->migration_pending; 1990 + /* 1991 + * - !MIGRATE_ENABLE: 1992 + * we'll have installed a pending if there wasn't one already. 1993 + * 1994 + * - MIGRATE_ENABLE: 1995 + * we're here because the current CPU isn't matching anymore, 1996 + * the only way that can happen is because of a concurrent 1997 + * set_cpus_allowed_ptr() call, which should then still be 1998 + * pending completion. 
1999 + * 2000 + * Either way, we really should have a @pending here. 2001 + */ 2002 + if (WARN_ON_ONCE(!pending)) { 2003 + task_rq_unlock(rq, p, rf); 2004 + return -EINVAL; 2005 + } 2006 + 2007 + if (flags & SCA_MIGRATE_ENABLE) { 2008 + 2009 + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ 2010 + p->migration_flags &= ~MDF_PUSH; 2011 + task_rq_unlock(rq, p, rf); 2012 + 2013 + pending->arg = (struct migration_arg) { 2014 + .task = p, 2015 + .dest_cpu = -1, 2016 + .pending = pending, 2017 + }; 2018 + 2019 + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, 2020 + &pending->arg, &pending->stop_work); 2021 + 2022 + return 0; 2023 + } 2024 + 2025 + if (task_running(rq, p) || p->state == TASK_WAKING) { 2026 + /* 2027 + * Lessen races (and headaches) by delegating 2028 + * is_migration_disabled(p) checks to the stopper, which will 2029 + * run on the same CPU as said p. 2030 + */ 2031 + task_rq_unlock(rq, p, rf); 2032 + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 2033 + 2034 + } else { 2035 + 2036 + if (!is_migration_disabled(p)) { 2037 + if (task_on_rq_queued(p)) 2038 + rq = move_queued_task(rq, rf, p, dest_cpu); 2039 + 2040 + p->migration_pending = NULL; 2041 + complete = true; 2042 + } 2043 + task_rq_unlock(rq, p, rf); 2044 + 2045 + do_complete: 2046 + if (complete) 2047 + complete_all(&pending->done); 2048 + } 2049 + 2050 + wait_for_completion(&pending->done); 2051 + 2052 + if (refcount_dec_and_test(&pending->refs)) 2053 + wake_up_var(&pending->refs); 2054 + 2055 + /* 2056 + * Block the original owner of &pending until all subsequent callers 2057 + * have seen the completion and decremented the refcount 2058 + */ 2059 + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); 2060 + 2061 + return 0; 2071 2062 } 2072 2063 2073 2064 /* ··· 2282 1869 * call is not atomic; no spinlocks may be held. 
2283 1870 */ 2284 1871 static int __set_cpus_allowed_ptr(struct task_struct *p, 2285 - const struct cpumask *new_mask, bool check) 1872 + const struct cpumask *new_mask, 1873 + u32 flags) 2286 1874 { 2287 1875 const struct cpumask *cpu_valid_mask = cpu_active_mask; 2288 1876 unsigned int dest_cpu; ··· 2294 1880 rq = task_rq_lock(p, &rf); 2295 1881 update_rq_clock(rq); 2296 1882 2297 - if (p->flags & PF_KTHREAD) { 1883 + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { 2298 1884 /* 2299 - * Kernel threads are allowed on online && !active CPUs 1885 + * Kernel threads are allowed on online && !active CPUs. 1886 + * 1887 + * Specifically, migration_disabled() tasks must not fail the 1888 + * cpumask_any_and_distribute() pick below, esp. so on 1889 + * SCA_MIGRATE_ENABLE, otherwise we'll not call 1890 + * set_cpus_allowed_common() and actually reset p->cpus_ptr. 2300 1891 */ 2301 1892 cpu_valid_mask = cpu_online_mask; 2302 1893 } ··· 2310 1891 * Must re-check here, to close a race against __kthread_bind(), 2311 1892 * sched_setaffinity() is not guaranteed to observe the flag. 
2312 1893 */ 2313 - if (check && (p->flags & PF_NO_SETAFFINITY)) { 1894 + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { 2314 1895 ret = -EINVAL; 2315 1896 goto out; 2316 1897 } 2317 1898 2318 - if (cpumask_equal(&p->cpus_mask, new_mask)) 2319 - goto out; 1899 + if (!(flags & SCA_MIGRATE_ENABLE)) { 1900 + if (cpumask_equal(&p->cpus_mask, new_mask)) 1901 + goto out; 1902 + 1903 + if (WARN_ON_ONCE(p == current && 1904 + is_migration_disabled(p) && 1905 + !cpumask_test_cpu(task_cpu(p), new_mask))) { 1906 + ret = -EBUSY; 1907 + goto out; 1908 + } 1909 + } 2320 1910 2321 1911 /* 2322 1912 * Picking a ~random cpu helps in cases where we are changing affinity ··· 2338 1910 goto out; 2339 1911 } 2340 1912 2341 - do_set_cpus_allowed(p, new_mask); 1913 + __do_set_cpus_allowed(p, new_mask, flags); 2342 1914 2343 1915 if (p->flags & PF_KTHREAD) { 2344 1916 /* ··· 2350 1922 p->nr_cpus_allowed != 1); 2351 1923 } 2352 1924 2353 - /* Can the task run on the task's current CPU? If so, we're done */ 2354 - if (cpumask_test_cpu(task_cpu(p), new_mask)) 2355 - goto out; 1925 + return affine_move_task(rq, p, &rf, dest_cpu, flags); 2356 1926 2357 - if (task_running(rq, p) || p->state == TASK_WAKING) { 2358 - struct migration_arg arg = { p, dest_cpu }; 2359 - /* Need help from migration thread: drop lock and wait. */ 2360 - task_rq_unlock(rq, p, &rf); 2361 - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 2362 - return 0; 2363 - } else if (task_on_rq_queued(p)) { 2364 - /* 2365 - * OK, since we're going to drop the lock immediately 2366 - * afterwards anyway. 
2367 - */ 2368 - rq = move_queued_task(rq, &rf, p, dest_cpu); 2369 - } 2370 1927 out: 2371 1928 task_rq_unlock(rq, p, &rf); 2372 1929 ··· 2360 1947 2361 1948 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 2362 1949 { 2363 - return __set_cpus_allowed_ptr(p, new_mask, false); 1950 + return __set_cpus_allowed_ptr(p, new_mask, 0); 2364 1951 } 2365 1952 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 2366 1953 ··· 2401 1988 * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 2402 1989 */ 2403 1990 WARN_ON_ONCE(!cpu_online(new_cpu)); 1991 + 1992 + WARN_ON_ONCE(is_migration_disabled(p)); 2404 1993 #endif 2405 1994 2406 1995 trace_sched_migrate_task(p, new_cpu); ··· 2733 2318 } 2734 2319 fallthrough; 2735 2320 case possible: 2321 + /* 2322 + * XXX When called from select_task_rq() we only 2323 + * hold p->pi_lock and again violate locking order. 2324 + * 2325 + * More yuck to audit. 2326 + */ 2736 2327 do_set_cpus_allowed(p, cpu_possible_mask); 2737 2328 state = fail; 2738 2329 break; ··· 2773 2352 { 2774 2353 lockdep_assert_held(&p->pi_lock); 2775 2354 2776 - if (p->nr_cpus_allowed > 1) 2355 + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) 2777 2356 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 2778 2357 else 2779 2358 cpu = cpumask_any(p->cpus_ptr); ··· 2796 2375 2797 2376 void sched_set_stop_task(int cpu, struct task_struct *stop) 2798 2377 { 2378 + static struct lock_class_key stop_pi_lock; 2799 2379 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2800 2380 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2801 2381 ··· 2812 2390 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 2813 2391 2814 2392 stop->sched_class = &stop_sched_class; 2393 + 2394 + /* 2395 + * The PI code calls rt_mutex_setprio() with ->pi_lock held to 2396 + * adjust the effective priority of a task. 
As a result, 2397 + * rt_mutex_setprio() can trigger (RT) balancing operations, 2398 + * which can then trigger wakeups of the stop thread to push 2399 + * around the current task. 2400 + * 2401 + * The stop task itself will never be part of the PI-chain, it 2402 + * never blocks, therefore that ->pi_lock recursion is safe. 2403 + * Tell lockdep about this by placing the stop->pi_lock in its 2404 + * own class. 2405 + */ 2406 + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); 2815 2407 } 2816 2408 2817 2409 cpu_rq(cpu)->stop = stop; ··· 2842 2406 #else 2843 2407 2844 2408 static inline int __set_cpus_allowed_ptr(struct task_struct *p, 2845 - const struct cpumask *new_mask, bool check) 2409 + const struct cpumask *new_mask, 2410 + u32 flags) 2846 2411 { 2847 2412 return set_cpus_allowed_ptr(p, new_mask); 2848 2413 } 2849 2414 2850 2415 #endif /* CONFIG_SMP */ 2416 + 2417 + #if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) 2418 + 2419 + static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } 2420 + 2421 + static inline bool rq_has_pinned_tasks(struct rq *rq) 2422 + { 2423 + return false; 2424 + } 2425 + 2426 + #endif 2851 2427 2852 2428 static void 2853 2429 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ··· 3546 3098 init_numa_balancing(clone_flags, p); 3547 3099 #ifdef CONFIG_SMP 3548 3100 p->wake_entry.u_flags = CSD_TYPE_TTWU; 3101 + p->migration_pending = NULL; 3549 3102 #endif 3550 3103 } 3551 3104 ··· 3934 3485 #endif 3935 3486 } 3936 3487 3488 + #ifdef CONFIG_SMP 3489 + 3490 + static void do_balance_callbacks(struct rq *rq, struct callback_head *head) 3491 + { 3492 + void (*func)(struct rq *rq); 3493 + struct callback_head *next; 3494 + 3495 + lockdep_assert_held(&rq->lock); 3496 + 3497 + while (head) { 3498 + func = (void (*)(struct rq *))head->func; 3499 + next = head->next; 3500 + head->next = NULL; 3501 + head = next; 3502 + 3503 + func(rq); 3504 + } 3505 + } 3506 + 3507 + static inline struct callback_head 
*splice_balance_callbacks(struct rq *rq) 3508 + { 3509 + struct callback_head *head = rq->balance_callback; 3510 + 3511 + lockdep_assert_held(&rq->lock); 3512 + if (head) { 3513 + rq->balance_callback = NULL; 3514 + rq->balance_flags &= ~BALANCE_WORK; 3515 + } 3516 + 3517 + return head; 3518 + } 3519 + 3520 + static void __balance_callbacks(struct rq *rq) 3521 + { 3522 + do_balance_callbacks(rq, splice_balance_callbacks(rq)); 3523 + } 3524 + 3525 + static inline void balance_callbacks(struct rq *rq, struct callback_head *head) 3526 + { 3527 + unsigned long flags; 3528 + 3529 + if (unlikely(head)) { 3530 + raw_spin_lock_irqsave(&rq->lock, flags); 3531 + do_balance_callbacks(rq, head); 3532 + raw_spin_unlock_irqrestore(&rq->lock, flags); 3533 + } 3534 + } 3535 + 3536 + static void balance_push(struct rq *rq); 3537 + 3538 + static inline void balance_switch(struct rq *rq) 3539 + { 3540 + if (likely(!rq->balance_flags)) 3541 + return; 3542 + 3543 + if (rq->balance_flags & BALANCE_PUSH) { 3544 + balance_push(rq); 3545 + return; 3546 + } 3547 + 3548 + __balance_callbacks(rq); 3549 + } 3550 + 3551 + #else 3552 + 3553 + static inline void __balance_callbacks(struct rq *rq) 3554 + { 3555 + } 3556 + 3557 + static inline struct callback_head *splice_balance_callbacks(struct rq *rq) 3558 + { 3559 + return NULL; 3560 + } 3561 + 3562 + static inline void balance_callbacks(struct rq *rq, struct callback_head *head) 3563 + { 3564 + } 3565 + 3566 + static inline void balance_switch(struct rq *rq) 3567 + { 3568 + } 3569 + 3570 + #endif 3571 + 3937 3572 static inline void 3938 3573 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) 3939 3574 { ··· 4043 3510 * prev into current: 4044 3511 */ 4045 3512 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 3513 + balance_switch(rq); 4046 3514 raw_spin_unlock_irq(&rq->lock); 4047 3515 } 4048 3516 ··· 4185 3651 return rq; 4186 3652 } 4187 3653 4188 - #ifdef CONFIG_SMP 4189 - 4190 - /* rq->lock is NOT held, but 
preemption is disabled */ 4191 - static void __balance_callback(struct rq *rq) 4192 - { 4193 - struct callback_head *head, *next; 4194 - void (*func)(struct rq *rq); 4195 - unsigned long flags; 4196 - 4197 - raw_spin_lock_irqsave(&rq->lock, flags); 4198 - head = rq->balance_callback; 4199 - rq->balance_callback = NULL; 4200 - while (head) { 4201 - func = (void (*)(struct rq *))head->func; 4202 - next = head->next; 4203 - head->next = NULL; 4204 - head = next; 4205 - 4206 - func(rq); 4207 - } 4208 - raw_spin_unlock_irqrestore(&rq->lock, flags); 4209 - } 4210 - 4211 - static inline void balance_callback(struct rq *rq) 4212 - { 4213 - if (unlikely(rq->balance_callback)) 4214 - __balance_callback(rq); 4215 - } 4216 - 4217 - #else 4218 - 4219 - static inline void balance_callback(struct rq *rq) 4220 - { 4221 - } 4222 - 4223 - #endif 4224 - 4225 3654 /** 4226 3655 * schedule_tail - first thing a freshly forked thread must call. 4227 3656 * @prev: the thread we just switched away from. ··· 4204 3707 */ 4205 3708 4206 3709 rq = finish_task_switch(prev); 4207 - balance_callback(rq); 4208 3710 preempt_enable(); 4209 3711 4210 3712 if (current->set_child_tid) ··· 5011 4515 */ 5012 4516 ++*switch_count; 5013 4517 4518 + migrate_disable_switch(rq, prev); 5014 4519 psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 5015 4520 5016 4521 trace_sched_switch(preempt, prev, next); ··· 5020 4523 rq = context_switch(rq, prev, next, &rf); 5021 4524 } else { 5022 4525 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 5023 - rq_unlock_irq(rq, &rf); 5024 - } 5025 4526 5026 - balance_callback(rq); 4527 + rq_unpin_lock(rq, &rf); 4528 + __balance_callbacks(rq); 4529 + raw_spin_unlock_irq(&rq->lock); 4530 + } 5027 4531 } 5028 4532 5029 4533 void __noreturn do_task_dead(void) ··· 5435 4937 out_unlock: 5436 4938 /* Avoid rq from going away on us: */ 5437 4939 preempt_disable(); 5438 - __task_rq_unlock(rq, &rf); 5439 4940 5440 - balance_callback(rq); 4941 + rq_unpin_lock(rq, &rf); 
4942 + __balance_callbacks(rq); 4943 + raw_spin_unlock(&rq->lock); 4944 + 5441 4945 preempt_enable(); 5442 4946 } 5443 4947 #else ··· 5713 5213 int retval, oldprio, oldpolicy = -1, queued, running; 5714 5214 int new_effective_prio, policy = attr->sched_policy; 5715 5215 const struct sched_class *prev_class; 5216 + struct callback_head *head; 5716 5217 struct rq_flags rf; 5717 5218 int reset_on_fork; 5718 5219 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; ··· 5952 5451 5953 5452 /* Avoid rq from going away on us: */ 5954 5453 preempt_disable(); 5454 + head = splice_balance_callbacks(rq); 5955 5455 task_rq_unlock(rq, p, &rf); 5956 5456 5957 5457 if (pi) { ··· 5961 5459 } 5962 5460 5963 5461 /* Run balance callbacks after we've adjusted the PI chain: */ 5964 - balance_callback(rq); 5462 + balance_callbacks(rq, head); 5965 5463 preempt_enable(); 5966 5464 5967 5465 return 0; ··· 6456 5954 } 6457 5955 #endif 6458 5956 again: 6459 - retval = __set_cpus_allowed_ptr(p, new_mask, true); 5957 + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); 6460 5958 6461 5959 if (!retval) { 6462 5960 cpuset_cpus_allowed(p, cpus_allowed); ··· 6945 6443 (unsigned long)task_thread_info(p)->flags); 6946 6444 6947 6445 print_worker_info(KERN_INFO, p); 6446 + print_stop_info(KERN_INFO, p); 6948 6447 show_stack(p, NULL, KERN_INFO); 6949 6448 put_task_stack(p); 6950 6449 } ··· 7036 6533 * 7037 6534 * And since this is boot we can forgo the serialization. 7038 6535 */ 7039 - set_cpus_allowed_common(idle, cpumask_of(cpu)); 6536 + set_cpus_allowed_common(idle, cpumask_of(cpu), 0); 7040 6537 #endif 7041 6538 /* 7042 6539 * We're having a chicken and egg problem, even though we are ··· 7187 6684 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ 7188 6685 } 7189 6686 7190 - /* 7191 - * Since this CPU is going 'away' for a while, fold any nr_active delta 7192 - * we might have. 
Assumes we're called after migrate_tasks() so that the 7193 - * nr_active count is stable. We need to take the teardown thread which 7194 - * is calling this into account, so we hand in adjust = 1 to the load 7195 - * calculation. 7196 - * 7197 - * Also see the comment "Global load-average calculations". 7198 - */ 7199 - static void calc_load_migrate(struct rq *rq) 6687 + static int __balance_push_cpu_stop(void *arg) 7200 6688 { 7201 - long delta = calc_load_fold_active(rq, 1); 7202 - if (delta) 7203 - atomic_long_add(delta, &calc_load_tasks); 7204 - } 6689 + struct task_struct *p = arg; 6690 + struct rq *rq = this_rq(); 6691 + struct rq_flags rf; 6692 + int cpu; 7205 6693 7206 - static struct task_struct *__pick_migrate_task(struct rq *rq) 7207 - { 7208 - const struct sched_class *class; 7209 - struct task_struct *next; 6694 + raw_spin_lock_irq(&p->pi_lock); 6695 + rq_lock(rq, &rf); 7210 6696 7211 - for_each_class(class) { 7212 - next = class->pick_next_task(rq); 7213 - if (next) { 7214 - next->sched_class->put_prev_task(rq, next); 7215 - return next; 7216 - } 7217 - } 7218 - 7219 - /* The idle class should always have a runnable task */ 7220 - BUG(); 7221 - } 7222 - 7223 - /* 7224 - * Migrate all tasks from the rq, sleeping tasks will be migrated by 7225 - * try_to_wake_up()->select_task_rq(). 7226 - * 7227 - * Called with rq->lock held even though we'er in stop_machine() and 7228 - * there's no concurrency possible, we hold the required locks anyway 7229 - * because of lock validation efforts. 7230 - */ 7231 - static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) 7232 - { 7233 - struct rq *rq = dead_rq; 7234 - struct task_struct *next, *stop = rq->stop; 7235 - struct rq_flags orf = *rf; 7236 - int dest_cpu; 7237 - 7238 - /* 7239 - * Fudge the rq selection such that the below task selection loop 7240 - * doesn't get stuck on the currently eligible stop task. 
7241 - * 7242 - * We're currently inside stop_machine() and the rq is either stuck 7243 - * in the stop_machine_cpu_stop() loop, or we're executing this code, 7244 - * either way we should never end up calling schedule() until we're 7245 - * done here. 7246 - */ 7247 - rq->stop = NULL; 7248 - 7249 - /* 7250 - * put_prev_task() and pick_next_task() sched 7251 - * class method both need to have an up-to-date 7252 - * value of rq->clock[_task] 7253 - */ 7254 6697 update_rq_clock(rq); 7255 6698 7256 - for (;;) { 7257 - /* 7258 - * There's this thread running, bail when that's the only 7259 - * remaining thread: 7260 - */ 7261 - if (rq->nr_running == 1) 7262 - break; 7263 - 7264 - next = __pick_migrate_task(rq); 7265 - 7266 - /* 7267 - * Rules for changing task_struct::cpus_mask are holding 7268 - * both pi_lock and rq->lock, such that holding either 7269 - * stabilizes the mask. 7270 - * 7271 - * Drop rq->lock is not quite as disastrous as it usually is 7272 - * because !cpu_active at this point, which means load-balance 7273 - * will not interfere. Also, stop-machine. 7274 - */ 7275 - rq_unlock(rq, rf); 7276 - raw_spin_lock(&next->pi_lock); 7277 - rq_relock(rq, rf); 7278 - 7279 - /* 7280 - * Since we're inside stop-machine, _nothing_ should have 7281 - * changed the task, WARN if weird stuff happened, because in 7282 - * that case the above rq->lock drop is a fail too. 7283 - */ 7284 - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { 7285 - raw_spin_unlock(&next->pi_lock); 7286 - continue; 7287 - } 7288 - 7289 - /* Find suitable destination for @next, with force if needed. 
*/ 7290 - dest_cpu = select_fallback_rq(dead_rq->cpu, next); 7291 - rq = __migrate_task(rq, rf, next, dest_cpu); 7292 - if (rq != dead_rq) { 7293 - rq_unlock(rq, rf); 7294 - rq = dead_rq; 7295 - *rf = orf; 7296 - rq_relock(rq, rf); 7297 - } 7298 - raw_spin_unlock(&next->pi_lock); 6699 + if (task_rq(p) == rq && task_on_rq_queued(p)) { 6700 + cpu = select_fallback_rq(rq->cpu, p); 6701 + rq = __migrate_task(rq, &rf, p, cpu); 7299 6702 } 7300 6703 7301 - rq->stop = stop; 6704 + rq_unlock(rq, &rf); 6705 + raw_spin_unlock_irq(&p->pi_lock); 6706 + 6707 + put_task_struct(p); 6708 + 6709 + return 0; 7302 6710 } 6711 + 6712 + static DEFINE_PER_CPU(struct cpu_stop_work, push_work); 6713 + 6714 + /* 6715 + * Ensure we only run per-cpu kthreads once the CPU goes !active. 6716 + */ 6717 + static void balance_push(struct rq *rq) 6718 + { 6719 + struct task_struct *push_task = rq->curr; 6720 + 6721 + lockdep_assert_held(&rq->lock); 6722 + SCHED_WARN_ON(rq->cpu != smp_processor_id()); 6723 + 6724 + /* 6725 + * Both the cpu-hotplug and stop task are in this case and are 6726 + * required to complete the hotplug process. 6727 + */ 6728 + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { 6729 + /* 6730 + * If this is the idle task on the outgoing CPU try to wake 6731 + * up the hotplug control thread which might wait for the 6732 + * last task to vanish. The rcuwait_active() check is 6733 + * accurate here because the waiter is pinned on this CPU 6734 + * and can't obviously be running in parallel. 6735 + * 6736 + * On RT kernels this also has to check whether there are 6737 + * pinned and scheduled out tasks on the runqueue. They 6738 + * need to leave the migrate disabled section first. 
6739 + */ 6740 + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && 6741 + rcuwait_active(&rq->hotplug_wait)) { 6742 + raw_spin_unlock(&rq->lock); 6743 + rcuwait_wake_up(&rq->hotplug_wait); 6744 + raw_spin_lock(&rq->lock); 6745 + } 6746 + return; 6747 + } 6748 + 6749 + get_task_struct(push_task); 6750 + /* 6751 + * Temporarily drop rq->lock such that we can wake-up the stop task. 6752 + * Both preemption and IRQs are still disabled. 6753 + */ 6754 + raw_spin_unlock(&rq->lock); 6755 + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, 6756 + this_cpu_ptr(&push_work)); 6757 + /* 6758 + * At this point need_resched() is true and we'll take the loop in 6759 + * schedule(). The next pick is obviously going to be the stop task 6760 + * which is_per_cpu_kthread() and will push this task away. 6761 + */ 6762 + raw_spin_lock(&rq->lock); 6763 + } 6764 + 6765 + static void balance_push_set(int cpu, bool on) 6766 + { 6767 + struct rq *rq = cpu_rq(cpu); 6768 + struct rq_flags rf; 6769 + 6770 + rq_lock_irqsave(rq, &rf); 6771 + if (on) 6772 + rq->balance_flags |= BALANCE_PUSH; 6773 + else 6774 + rq->balance_flags &= ~BALANCE_PUSH; 6775 + rq_unlock_irqrestore(rq, &rf); 6776 + } 6777 + 6778 + /* 6779 + * Invoked from a CPUs hotplug control thread after the CPU has been marked 6780 + * inactive. All tasks which are not per CPU kernel threads are either 6781 + * pushed off this CPU now via balance_push() or placed on a different CPU 6782 + * during wakeup. Wait until the CPU is quiescent. 
6783 + */ 6784 + static void balance_hotplug_wait(void) 6785 + { 6786 + struct rq *rq = this_rq(); 6787 + 6788 + rcuwait_wait_event(&rq->hotplug_wait, 6789 + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), 6790 + TASK_UNINTERRUPTIBLE); 6791 + } 6792 + 6793 + #else 6794 + 6795 + static inline void balance_push(struct rq *rq) 6796 + { 6797 + } 6798 + 6799 + static inline void balance_push_set(int cpu, bool on) 6800 + { 6801 + } 6802 + 6803 + static inline void balance_hotplug_wait(void) 6804 + { 6805 + } 6806 + 7303 6807 #endif /* CONFIG_HOTPLUG_CPU */ 7304 6808 7305 6809 void set_rq_online(struct rq *rq) ··· 7392 6882 struct rq *rq = cpu_rq(cpu); 7393 6883 struct rq_flags rf; 7394 6884 6885 + balance_push_set(cpu, false); 6886 + 7395 6887 #ifdef CONFIG_SCHED_SMT 7396 6888 /* 7397 6889 * When going up, increment the number of cores with SMT present. ··· 7429 6917 7430 6918 int sched_cpu_deactivate(unsigned int cpu) 7431 6919 { 6920 + struct rq *rq = cpu_rq(cpu); 6921 + struct rq_flags rf; 7432 6922 int ret; 7433 6923 7434 6924 set_cpu_active(cpu, false); ··· 7442 6928 * Do sync before park smpboot threads to take care the rcu boost case. 7443 6929 */ 7444 6930 synchronize_rcu(); 6931 + 6932 + balance_push_set(cpu, true); 6933 + 6934 + rq_lock_irqsave(rq, &rf); 6935 + if (rq->rd) { 6936 + update_rq_clock(rq); 6937 + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6938 + set_rq_offline(rq); 6939 + } 6940 + rq_unlock_irqrestore(rq, &rf); 7445 6941 7446 6942 #ifdef CONFIG_SCHED_SMT 7447 6943 /* ··· 7466 6942 7467 6943 ret = cpuset_cpu_inactive(cpu); 7468 6944 if (ret) { 6945 + balance_push_set(cpu, false); 7469 6946 set_cpu_active(cpu, true); 7470 6947 return ret; 7471 6948 } ··· 7490 6965 } 7491 6966 7492 6967 #ifdef CONFIG_HOTPLUG_CPU 6968 + 6969 + /* 6970 + * Invoked immediately before the stopper thread is invoked to bring the 6971 + * CPU down completely. 
At this point all per CPU kthreads except the 6972 + * hotplug thread (current) and the stopper thread (inactive) have been 6973 + * either parked or have been unbound from the outgoing CPU. Ensure that 6974 + * any of those which might be on the way out are gone. 6975 + * 6976 + * If after this point a bound task is being woken on this CPU then the 6977 + * responsible hotplug callback has failed to do it's job. 6978 + * sched_cpu_dying() will catch it with the appropriate fireworks. 6979 + */ 6980 + int sched_cpu_wait_empty(unsigned int cpu) 6981 + { 6982 + balance_hotplug_wait(); 6983 + return 0; 6984 + } 6985 + 6986 + /* 6987 + * Since this CPU is going 'away' for a while, fold any nr_active delta we 6988 + * might have. Called from the CPU stopper task after ensuring that the 6989 + * stopper is the last running task on the CPU, so nr_active count is 6990 + * stable. We need to take the teardown thread which is calling this into 6991 + * account, so we hand in adjust = 1 to the load calculation. 6992 + * 6993 + * Also see the comment "Global load-average calculations". 
6994 + */ 6995 + static void calc_load_migrate(struct rq *rq) 6996 + { 6997 + long delta = calc_load_fold_active(rq, 1); 6998 + 6999 + if (delta) 7000 + atomic_long_add(delta, &calc_load_tasks); 7001 + } 7002 + 7493 7003 int sched_cpu_dying(unsigned int cpu) 7494 7004 { 7495 7005 struct rq *rq = cpu_rq(cpu); ··· 7534 6974 sched_tick_stop(cpu); 7535 6975 7536 6976 rq_lock_irqsave(rq, &rf); 7537 - if (rq->rd) { 7538 - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7539 - set_rq_offline(rq); 7540 - } 7541 - migrate_tasks(rq, &rf); 7542 - BUG_ON(rq->nr_running != 1); 6977 + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); 7543 6978 rq_unlock_irqrestore(rq, &rf); 7544 6979 7545 6980 calc_load_migrate(rq); ··· 7740 7185 atomic_set(&rq->nohz_flags, 0); 7741 7186 7742 7187 rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); 7188 + #endif 7189 + #ifdef CONFIG_HOTPLUG_CPU 7190 + rcuwait_init(&rq->hotplug_wait); 7743 7191 #endif 7744 7192 #endif /* CONFIG_SMP */ 7745 7193 hrtick_rq_init(rq);
+2 -2
kernel/sched/cpudeadline.c
··· 120 120 const struct sched_dl_entity *dl_se = &p->dl; 121 121 122 122 if (later_mask && 123 - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { 123 + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { 124 124 unsigned long cap, max_cap = 0; 125 125 int cpu, max_cpu = -1; 126 126 ··· 151 151 152 152 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 153 153 154 - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && 154 + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && 155 155 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 156 156 if (later_mask) 157 157 cpumask_set_cpu(best_cpu, later_mask);
+2 -2
kernel/sched/cpupri.c
··· 97 97 if (skip) 98 98 return 0; 99 99 100 - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) 100 + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) 101 101 return 0; 102 102 103 103 if (lowest_mask) { 104 - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); 104 + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); 105 105 106 106 /* 107 107 * We have to ensure that we have at least one bit
+31 -15
kernel/sched/deadline.c
··· 559 559 560 560 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) 561 561 { 562 - return dl_task(prev); 562 + return rq->online && dl_task(prev); 563 563 } 564 564 565 565 static DEFINE_PER_CPU(struct callback_head, dl_push_head); ··· 1931 1931 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1932 1932 { 1933 1933 if (!task_running(rq, p) && 1934 - cpumask_test_cpu(cpu, p->cpus_ptr)) 1934 + cpumask_test_cpu(cpu, &p->cpus_mask)) 1935 1935 return 1; 1936 1936 return 0; 1937 1937 } ··· 2021 2021 return this_cpu; 2022 2022 } 2023 2023 2024 - best_cpu = cpumask_first_and(later_mask, 2025 - sched_domain_span(sd)); 2024 + best_cpu = cpumask_any_and_distribute(later_mask, 2025 + sched_domain_span(sd)); 2026 2026 /* 2027 2027 * Last chance: if a CPU being in both later_mask 2028 2028 * and current sd span is valid, that becomes our ··· 2044 2044 if (this_cpu != -1) 2045 2045 return this_cpu; 2046 2046 2047 - cpu = cpumask_any(later_mask); 2047 + cpu = cpumask_any_distribute(later_mask); 2048 2048 if (cpu < nr_cpu_ids) 2049 2049 return cpu; 2050 2050 ··· 2081 2081 /* Retry if something changed. 
*/ 2082 2082 if (double_lock_balance(rq, later_rq)) { 2083 2083 if (unlikely(task_rq(task) != rq || 2084 - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || 2084 + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || 2085 2085 task_running(rq, task) || 2086 2086 !dl_task(task) || 2087 2087 !task_on_rq_queued(task))) { ··· 2148 2148 return 0; 2149 2149 2150 2150 retry: 2151 + if (is_migration_disabled(next_task)) 2152 + return 0; 2153 + 2151 2154 if (WARN_ON(next_task == rq->curr)) 2152 2155 return 0; 2153 2156 ··· 2228 2225 static void pull_dl_task(struct rq *this_rq) 2229 2226 { 2230 2227 int this_cpu = this_rq->cpu, cpu; 2231 - struct task_struct *p; 2228 + struct task_struct *p, *push_task; 2232 2229 bool resched = false; 2233 2230 struct rq *src_rq; 2234 2231 u64 dmin = LONG_MAX; ··· 2258 2255 continue; 2259 2256 2260 2257 /* Might drop this_rq->lock */ 2258 + push_task = NULL; 2261 2259 double_lock_balance(this_rq, src_rq); 2262 2260 2263 2261 /* ··· 2290 2286 src_rq->curr->dl.deadline)) 2291 2287 goto skip; 2292 2288 2293 - resched = true; 2294 - 2295 - deactivate_task(src_rq, p, 0); 2296 - set_task_cpu(p, this_cpu); 2297 - activate_task(this_rq, p, 0); 2298 - dmin = p->dl.deadline; 2289 + if (is_migration_disabled(p)) { 2290 + push_task = get_push_task(src_rq); 2291 + } else { 2292 + deactivate_task(src_rq, p, 0); 2293 + set_task_cpu(p, this_cpu); 2294 + activate_task(this_rq, p, 0); 2295 + dmin = p->dl.deadline; 2296 + resched = true; 2297 + } 2299 2298 2300 2299 /* Is there any other task even earlier? 
*/ 2301 2300 } 2302 2301 skip: 2303 2302 double_unlock_balance(this_rq, src_rq); 2303 + 2304 + if (push_task) { 2305 + raw_spin_unlock(&this_rq->lock); 2306 + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, 2307 + push_task, &src_rq->push_work); 2308 + raw_spin_lock(&this_rq->lock); 2309 + } 2304 2310 } 2305 2311 2306 2312 if (resched) ··· 2334 2320 } 2335 2321 2336 2322 static void set_cpus_allowed_dl(struct task_struct *p, 2337 - const struct cpumask *new_mask) 2323 + const struct cpumask *new_mask, 2324 + u32 flags) 2338 2325 { 2339 2326 struct root_domain *src_rd; 2340 2327 struct rq *rq; ··· 2364 2349 raw_spin_unlock(&src_dl_b->lock); 2365 2350 } 2366 2351 2367 - set_cpus_allowed_common(p, new_mask); 2352 + set_cpus_allowed_common(p, new_mask, flags); 2368 2353 } 2369 2354 2370 2355 /* Assumes rq->lock is held */ ··· 2557 2542 .rq_online = rq_online_dl, 2558 2543 .rq_offline = rq_offline_dl, 2559 2544 .task_woken = task_woken_dl, 2545 + .find_lock_rq = find_lock_later_rq, 2560 2546 #endif 2561 2547 2562 2548 .task_tick = task_tick_dl,
+57 -18
kernel/sched/rt.c
··· 265 265 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 266 266 { 267 267 /* Try to pull RT tasks here if we lower this rq's prio */ 268 - return rq->rt.highest_prio.curr > prev->prio; 268 + return rq->online && rq->rt.highest_prio.curr > prev->prio; 269 269 } 270 270 271 271 static inline int rt_overloaded(struct rq *rq) ··· 1660 1660 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1661 1661 { 1662 1662 if (!task_running(rq, p) && 1663 - cpumask_test_cpu(cpu, p->cpus_ptr)) 1663 + cpumask_test_cpu(cpu, &p->cpus_mask)) 1664 1664 return 1; 1665 1665 1666 1666 return 0; ··· 1754 1754 return this_cpu; 1755 1755 } 1756 1756 1757 - best_cpu = cpumask_first_and(lowest_mask, 1758 - sched_domain_span(sd)); 1757 + best_cpu = cpumask_any_and_distribute(lowest_mask, 1758 + sched_domain_span(sd)); 1759 1759 if (best_cpu < nr_cpu_ids) { 1760 1760 rcu_read_unlock(); 1761 1761 return best_cpu; ··· 1772 1772 if (this_cpu != -1) 1773 1773 return this_cpu; 1774 1774 1775 - cpu = cpumask_any(lowest_mask); 1775 + cpu = cpumask_any_distribute(lowest_mask); 1776 1776 if (cpu < nr_cpu_ids) 1777 1777 return cpu; 1778 1778 ··· 1813 1813 * Also make sure that it wasn't scheduled on its rq. 1814 1814 */ 1815 1815 if (unlikely(task_rq(task) != rq || 1816 - !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || 1816 + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 1817 1817 task_running(rq, task) || 1818 1818 !rt_task(task) || 1819 1819 !task_on_rq_queued(task))) { ··· 1861 1861 * running task can migrate over to a CPU that is running a task 1862 1862 * of lesser priority. 
1863 1863 */ 1864 - static int push_rt_task(struct rq *rq) 1864 + static int push_rt_task(struct rq *rq, bool pull) 1865 1865 { 1866 1866 struct task_struct *next_task; 1867 1867 struct rq *lowest_rq; ··· 1875 1875 return 0; 1876 1876 1877 1877 retry: 1878 + if (is_migration_disabled(next_task)) { 1879 + struct task_struct *push_task = NULL; 1880 + int cpu; 1881 + 1882 + if (!pull || rq->push_busy) 1883 + return 0; 1884 + 1885 + cpu = find_lowest_rq(rq->curr); 1886 + if (cpu == -1 || cpu == rq->cpu) 1887 + return 0; 1888 + 1889 + /* 1890 + * Given we found a CPU with lower priority than @next_task, 1891 + * therefore it should be running. However we cannot migrate it 1892 + * to this other CPU, instead attempt to push the current 1893 + * running task on this CPU away. 1894 + */ 1895 + push_task = get_push_task(rq); 1896 + if (push_task) { 1897 + raw_spin_unlock(&rq->lock); 1898 + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 1899 + push_task, &rq->push_work); 1900 + raw_spin_lock(&rq->lock); 1901 + } 1902 + 1903 + return 0; 1904 + } 1905 + 1878 1906 if (WARN_ON(next_task == rq->curr)) 1879 1907 return 0; 1880 1908 ··· 1957 1929 deactivate_task(rq, next_task, 0); 1958 1930 set_task_cpu(next_task, lowest_rq->cpu); 1959 1931 activate_task(lowest_rq, next_task, 0); 1932 + resched_curr(lowest_rq); 1960 1933 ret = 1; 1961 1934 1962 - resched_curr(lowest_rq); 1963 - 1964 1935 double_unlock_balance(rq, lowest_rq); 1965 - 1966 1936 out: 1967 1937 put_task_struct(next_task); 1968 1938 ··· 1970 1944 static void push_rt_tasks(struct rq *rq) 1971 1945 { 1972 1946 /* push_rt_task will return true if it moved an RT */ 1973 - while (push_rt_task(rq)) 1947 + while (push_rt_task(rq, false)) 1974 1948 ; 1975 1949 } 1976 1950 ··· 2123 2097 */ 2124 2098 if (has_pushable_tasks(rq)) { 2125 2099 raw_spin_lock(&rq->lock); 2126 - push_rt_tasks(rq); 2100 + while (push_rt_task(rq, true)) 2101 + ; 2127 2102 raw_spin_unlock(&rq->lock); 2128 2103 } 2129 2104 ··· 2149 2122 { 2150 2123 int this_cpu 
= this_rq->cpu, cpu; 2151 2124 bool resched = false; 2152 - struct task_struct *p; 2125 + struct task_struct *p, *push_task; 2153 2126 struct rq *src_rq; 2154 2127 int rt_overload_count = rt_overloaded(this_rq); 2155 2128 ··· 2196 2169 * double_lock_balance, and another CPU could 2197 2170 * alter this_rq 2198 2171 */ 2172 + push_task = NULL; 2199 2173 double_lock_balance(this_rq, src_rq); 2200 2174 2201 2175 /* ··· 2224 2196 if (p->prio < src_rq->curr->prio) 2225 2197 goto skip; 2226 2198 2227 - resched = true; 2228 - 2229 - deactivate_task(src_rq, p, 0); 2230 - set_task_cpu(p, this_cpu); 2231 - activate_task(this_rq, p, 0); 2199 + if (is_migration_disabled(p)) { 2200 + push_task = get_push_task(src_rq); 2201 + } else { 2202 + deactivate_task(src_rq, p, 0); 2203 + set_task_cpu(p, this_cpu); 2204 + activate_task(this_rq, p, 0); 2205 + resched = true; 2206 + } 2232 2207 /* 2233 2208 * We continue with the search, just in 2234 2209 * case there's an even higher prio task ··· 2241 2210 } 2242 2211 skip: 2243 2212 double_unlock_balance(this_rq, src_rq); 2213 + 2214 + if (push_task) { 2215 + raw_spin_unlock(&this_rq->lock); 2216 + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, 2217 + push_task, &src_rq->push_work); 2218 + raw_spin_lock(&this_rq->lock); 2219 + } 2244 2220 } 2245 2221 2246 2222 if (resched) ··· 2489 2451 .rq_offline = rq_offline_rt, 2490 2452 .task_woken = task_woken_rt, 2491 2453 .switched_from = switched_from_rt, 2454 + .find_lock_rq = find_lock_lowest_rq, 2492 2455 #endif 2493 2456 2494 2457 .task_tick = task_tick_rt,
+56 -3
kernel/sched/sched.h
··· 975 975 unsigned long cpu_capacity_orig; 976 976 977 977 struct callback_head *balance_callback; 978 + unsigned char balance_flags; 978 979 979 980 unsigned char nohz_idle_balance; 980 981 unsigned char idle_balance; ··· 1006 1005 1007 1006 /* This is used to determine avg_idle's max value */ 1008 1007 u64 max_idle_balance_cost; 1008 + 1009 + #ifdef CONFIG_HOTPLUG_CPU 1010 + struct rcuwait hotplug_wait; 1011 + #endif 1009 1012 #endif /* CONFIG_SMP */ 1010 1013 1011 1014 #ifdef CONFIG_IRQ_TIME_ACCOUNTING ··· 1055 1050 /* Must be inspected within a rcu lock section */ 1056 1051 struct cpuidle_state *idle_state; 1057 1052 #endif 1053 + 1054 + #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) 1055 + unsigned int nr_pinned; 1056 + #endif 1057 + unsigned int push_busy; 1058 + struct cpu_stop_work push_work; 1058 1059 }; 1059 1060 1060 1061 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 1088 1077 #endif 1089 1078 } 1090 1079 1080 + #define MDF_PUSH 0x01 1081 + 1082 + static inline bool is_migration_disabled(struct task_struct *p) 1083 + { 1084 + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) 1085 + return p->migration_disabled; 1086 + #else 1087 + return false; 1088 + #endif 1089 + } 1091 1090 1092 1091 #ifdef CONFIG_SCHED_SMT 1093 1092 extern void __update_idle_core(struct rq *rq); ··· 1243 1222 #ifdef CONFIG_SCHED_DEBUG 1244 1223 rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 1245 1224 rf->clock_update_flags = 0; 1225 + #endif 1226 + #ifdef CONFIG_SMP 1227 + SCHED_WARN_ON(rq->balance_callback); 1246 1228 #endif 1247 1229 } 1248 1230 ··· 1408 1384 1409 1385 #ifdef CONFIG_SMP 1410 1386 1387 + #define BALANCE_WORK 0x01 1388 + #define BALANCE_PUSH 0x02 1389 + 1411 1390 static inline void 1412 1391 queue_balance_callback(struct rq *rq, 1413 1392 struct callback_head *head, ··· 1418 1391 { 1419 1392 lockdep_assert_held(&rq->lock); 1420 1393 1421 - if (unlikely(head->next)) 1394 + if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) 1422 1395 return; 
1423 1396 1424 1397 head->func = (void (*)(struct callback_head *))func; 1425 1398 head->next = rq->balance_callback; 1426 1399 rq->balance_callback = head; 1400 + rq->balance_flags |= BALANCE_WORK; 1427 1401 } 1428 1402 1429 1403 #define rcu_dereference_check_sched_domain(p) \ ··· 1832 1804 void (*task_woken)(struct rq *this_rq, struct task_struct *task); 1833 1805 1834 1806 void (*set_cpus_allowed)(struct task_struct *p, 1835 - const struct cpumask *newmask); 1807 + const struct cpumask *newmask, 1808 + u32 flags); 1836 1809 1837 1810 void (*rq_online)(struct rq *rq); 1838 1811 void (*rq_offline)(struct rq *rq); 1812 + 1813 + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); 1839 1814 #endif 1840 1815 1841 1816 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); ··· 1936 1905 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 1937 1906 extern struct task_struct *pick_next_task_idle(struct rq *rq); 1938 1907 1908 + #define SCA_CHECK 0x01 1909 + #define SCA_MIGRATE_DISABLE 0x02 1910 + #define SCA_MIGRATE_ENABLE 0x04 1911 + 1939 1912 #ifdef CONFIG_SMP 1940 1913 1941 1914 extern void update_group_capacity(struct sched_domain *sd, int cpu); 1942 1915 1943 1916 extern void trigger_load_balance(struct rq *rq); 1944 1917 1945 - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); 1918 + extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); 1919 + 1920 + static inline struct task_struct *get_push_task(struct rq *rq) 1921 + { 1922 + struct task_struct *p = rq->curr; 1923 + 1924 + lockdep_assert_held(&rq->lock); 1925 + 1926 + if (rq->push_busy) 1927 + return NULL; 1928 + 1929 + if (p->nr_cpus_allowed == 1) 1930 + return NULL; 1931 + 1932 + rq->push_busy = true; 1933 + return get_task_struct(p); 1934 + } 1935 + 1936 + extern int push_cpu_stop(void *arg); 1946 1937 1947 1938 #endif 1948 1939
+24 -3
kernel/stop_machine.c
··· 42 42 struct list_head works; /* list of pending works */ 43 43 44 44 struct cpu_stop_work stop_work; /* for stop_cpus */ 45 + unsigned long caller; 46 + cpu_stop_fn_t fn; 45 47 }; 46 48 47 49 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 48 50 static bool stop_machine_initialized = false; 51 + 52 + void print_stop_info(const char *log_lvl, struct task_struct *task) 53 + { 54 + /* 55 + * If @task is a stopper task, it cannot migrate and task_cpu() is 56 + * stable. 57 + */ 58 + struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); 59 + 60 + if (task != stopper->thread) 61 + return; 62 + 63 + printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); 64 + } 49 65 50 66 /* static data for stop_cpus */ 51 67 static DEFINE_MUTEX(stop_cpus_mutex); ··· 139 123 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) 140 124 { 141 125 struct cpu_stop_done done; 142 - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 126 + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; 143 127 144 128 cpu_stop_init_done(&done, 1); 145 129 if (!cpu_stop_queue_work(cpu, &work)) ··· 347 331 work1 = work2 = (struct cpu_stop_work){ 348 332 .fn = multi_cpu_stop, 349 333 .arg = &msdata, 350 - .done = &done 334 + .done = &done, 335 + .caller = _RET_IP_, 351 336 }; 352 337 353 338 cpu_stop_init_done(&done, 2); ··· 384 367 bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, 385 368 struct cpu_stop_work *work_buf) 386 369 { 387 - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 370 + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; 388 371 return cpu_stop_queue_work(cpu, work_buf); 389 372 } 390 373 ··· 504 487 int ret; 505 488 506 489 /* cpu stop callbacks must not sleep, make in_atomic() == T */ 490 + stopper->caller = work->caller; 491 + stopper->fn = fn; 507 492 preempt_count_inc(); 508 493 ret = fn(arg); 509 
494 if (done) { ··· 514 495 cpu_stop_signal_done(done); 515 496 } 516 497 preempt_count_dec(); 498 + stopper->fn = NULL; 499 + stopper->caller = 0; 517 500 WARN_ONCE(preempt_count(), 518 501 "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); 519 502 goto repeat;
+4
kernel/workqueue.c
··· 4908 4908 pool->flags |= POOL_DISASSOCIATED; 4909 4909 4910 4910 raw_spin_unlock_irq(&pool->lock); 4911 + 4912 + for_each_pool_worker(worker, pool) 4913 + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); 4914 + 4911 4915 mutex_unlock(&wq_pool_attach_mutex); 4912 4916 4913 4917 /*
+18
lib/cpumask.c
··· 267 267 return next; 268 268 } 269 269 EXPORT_SYMBOL(cpumask_any_and_distribute); 270 + 271 + int cpumask_any_distribute(const struct cpumask *srcp) 272 + { 273 + int next, prev; 274 + 275 + /* NOTE: our first selection will skip 0. */ 276 + prev = __this_cpu_read(distribute_cpu_mask_prev); 277 + 278 + next = cpumask_next(prev, srcp); 279 + if (next >= nr_cpu_ids) 280 + next = cpumask_first(srcp); 281 + 282 + if (next < nr_cpu_ids) 283 + __this_cpu_write(distribute_cpu_mask_prev, next); 284 + 285 + return next; 286 + } 287 + EXPORT_SYMBOL(cpumask_any_distribute);
+2
lib/dump_stack.c
··· 12 12 #include <linux/atomic.h> 13 13 #include <linux/kexec.h> 14 14 #include <linux/utsname.h> 15 + #include <linux/stop_machine.h> 15 16 16 17 static char dump_stack_arch_desc_str[128]; 17 18 ··· 58 57 log_lvl, dump_stack_arch_desc_str); 59 58 60 59 print_worker_info(log_lvl, current); 60 + print_stop_info(log_lvl, current); 61 61 } 62 62 63 63 /**
+5
lib/smp_processor_id.c
··· 26 26 if (current->nr_cpus_allowed == 1) 27 27 goto out; 28 28 29 + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) 30 + if (current->migration_disabled) 31 + goto out; 32 + #endif 33 + 29 34 /* 30 35 * It is valid to assume CPU-locality during early bootup: 31 36 */