sched: Fix Core-wide rq->lock for uninitialized CPUs

Eugene tripped over the case where rq_lock(), as called in a
for_each_possible_cpu() loop, came apart because rq->core hadn't been
set up yet.

This is a somewhat unusual, but valid case.
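
As a minimal sketch, the failing pattern looks roughly like this (the
loop body is hypothetical and only stands in for whatever per-rq state
the real call site touches; the rq_lock()-inside-for_each_possible_cpu()
shape is what the report describes). With core scheduling enabled, the
rq lock is resolved through rq->core, which used to be NULL for CPUs
that had never been brought online:

  int cpu;

  for_each_possible_cpu(cpu) {
          struct rq *rq = cpu_rq(cpu);
          struct rq_flags rf;

          /* the rq_lock*() variants resolve the lock via rq_lockp(rq),
           * i.e. &rq->core->__lock once core scheduling is enabled */
          rq_lock_irqsave(rq, &rf);
          /* ... inspect or update per-rq state ... */
          rq_unlock_irqrestore(rq, &rf);
  }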

Rework things such that rq->core is initialized to point at itself;
IOW, initialize each CPU as a single-threaded Core. CPU online will
then join the new CPU (thread) to an existing Core where needed.
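
For reference, this is roughly how the rq lock pointer is picked
(condensed from rq_lockp() in kernel/sched/sched.h, which is not part
of this patch); with rq->core always pointing at a valid rq, the
dereference below is safe for every possible CPU, onlined or not:

  static inline raw_spinlock_t *rq_lockp(struct rq *rq)
  {
          /* SMT siblings share the Core leader's lock */
          if (sched_core_enabled(rq))
                  return &rq->core->__lock;

          return &rq->__lock;
  }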

For completeness' sake, have CPU offline fully undo the state so as not
to presume the topology will match the next time it comes online.
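
Concretely, the last step of offlining (sched_core_cpu_dying(), quoted
from the diff below) detaches the CPU from whatever Core it belonged to,
returning it to the same single-threaded state that sched_init() now
establishes at boot:

  static inline void sched_core_cpu_dying(unsigned int cpu)
  {
          struct rq *rq = cpu_rq(cpu);

          /* become a single-threaded Core again */
          if (rq->core != rq)
                  rq->core = rq;
  }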

Fixes: 9edeaea1bc45 ("sched: Core-wide rq->lock")
Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josh Don <joshdon@google.com>
Tested-by: Eugene Syromiatnikov <esyr@redhat.com>
Link: https://lkml.kernel.org/r/YR473ZGeKqMs6kw+@hirez.programming.kicks-ass.net

Changed files: +120 -29

kernel/sched/core.c (+119 -28)
···
 static atomic_t sched_core_count;
 static struct cpumask sched_core_mask;

+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+        int t, i = 0;
+
+        local_irq_save(*flags);
+        for_each_cpu(t, smt_mask)
+                raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+        int t;
+
+        for_each_cpu(t, smt_mask)
+                raw_spin_unlock(&cpu_rq(t)->__lock);
+        local_irq_restore(*flags);
+}
+
 static void __sched_core_flip(bool enabled)
 {
-        int cpu, t, i;
+        unsigned long flags;
+        int cpu, t;

         cpus_read_lock();

···
         for_each_cpu(cpu, &sched_core_mask) {
                 const struct cpumask *smt_mask = cpu_smt_mask(cpu);

-                i = 0;
-                local_irq_disable();
-                for_each_cpu(t, smt_mask) {
-                        /* supports up to SMT8 */
-                        raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
-                }
+                sched_core_lock(cpu, &flags);

                 for_each_cpu(t, smt_mask)
                         cpu_rq(t)->core_enabled = enabled;

-                for_each_cpu(t, smt_mask)
-                        raw_spin_unlock(&cpu_rq(t)->__lock);
-                local_irq_enable();
+                sched_core_unlock(cpu, &flags);

                 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
         }
···
         queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
 }

-static inline void sched_core_cpu_starting(unsigned int cpu)
+static void sched_core_cpu_starting(unsigned int cpu)
 {
         const struct cpumask *smt_mask = cpu_smt_mask(cpu);
-        struct rq *rq, *core_rq = NULL;
-        int i;
+        struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+        unsigned long flags;
+        int t;

-        core_rq = cpu_rq(cpu)->core;
+        sched_core_lock(cpu, &flags);

-        if (!core_rq) {
-                for_each_cpu(i, smt_mask) {
-                        rq = cpu_rq(i);
-                        if (rq->core && rq->core == rq)
-                                core_rq = rq;
-                }
+        WARN_ON_ONCE(rq->core != rq);

-                if (!core_rq)
-                        core_rq = cpu_rq(cpu);
+        /* if we're the first, we'll be our own leader */
+        if (cpumask_weight(smt_mask) == 1)
+                goto unlock;

-                for_each_cpu(i, smt_mask) {
-                        rq = cpu_rq(i);
-
-                        WARN_ON_ONCE(rq->core && rq->core != core_rq);
-                        rq->core = core_rq;
+        /* find the leader */
+        for_each_cpu(t, smt_mask) {
+                if (t == cpu)
+                        continue;
+                rq = cpu_rq(t);
+                if (rq->core == rq) {
+                        core_rq = rq;
+                        break;
                 }
         }
+
+        if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+                goto unlock;
+
+        /* install and validate core_rq */
+        for_each_cpu(t, smt_mask) {
+                rq = cpu_rq(t);
+
+                if (t == cpu)
+                        rq->core = core_rq;
+
+                WARN_ON_ONCE(rq->core != core_rq);
+        }
+
+unlock:
+        sched_core_unlock(cpu, &flags);
 }
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+        struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+        unsigned long flags;
+        int t;
+
+        sched_core_lock(cpu, &flags);
+
+        /* if we're the last man standing, nothing to do */
+        if (cpumask_weight(smt_mask) == 1) {
+                WARN_ON_ONCE(rq->core != rq);
+                goto unlock;
+        }
+
+        /* if we're not the leader, nothing to do */
+        if (rq->core != rq)
+                goto unlock;
+
+        /* find a new leader */
+        for_each_cpu(t, smt_mask) {
+                if (t == cpu)
+                        continue;
+                core_rq = cpu_rq(t);
+                break;
+        }
+
+        if (WARN_ON_ONCE(!core_rq)) /* impossible */
+                goto unlock;
+
+        /* copy the shared state to the new leader */
+        core_rq->core_task_seq = rq->core_task_seq;
+        core_rq->core_pick_seq = rq->core_pick_seq;
+        core_rq->core_cookie = rq->core_cookie;
+        core_rq->core_forceidle = rq->core_forceidle;
+        core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+
+        /* install new leader */
+        for_each_cpu(t, smt_mask) {
+                rq = cpu_rq(t);
+                rq->core = core_rq;
+        }
+
+unlock:
+        sched_core_unlock(cpu, &flags);
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+
+        if (rq->core != rq)
+                rq->core = rq;
+}
+
 #else /* !CONFIG_SCHED_CORE */

 static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}

 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
···
          */
         if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
                 static_branch_dec_cpuslocked(&sched_smt_present);
+
+        sched_core_cpu_deactivate(cpu);
 #endif

         if (!sched_smp_initialized)
···
         calc_load_migrate(rq);
         update_max_interval();
         hrtick_clear(rq);
+        sched_core_cpu_dying(cpu);
         return 0;
 }
 #endif
···
         atomic_set(&rq->nr_iowait, 0);

 #ifdef CONFIG_SCHED_CORE
-        rq->core = NULL;
+        rq->core = rq;
         rq->core_pick = NULL;
         rq->core_enabled = 0;
         rq->core_tree = RB_ROOT;

kernel/sched/sched.h (+1 -1)
···
         unsigned int            core_sched_seq;
         struct rb_root          core_tree;

-        /* shared state */
+        /* shared state -- careful with sched_core_cpu_deactivate() */
         unsigned int            core_task_seq;
         unsigned int            core_pick_seq;
         unsigned long           core_cookie;