sched: Optimize finish_lock_switch()

The kernel test robot measured a -1.6% performance regression on
will-it-scale/sched_yield due to commit:

2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug")

The regression appeared even though we were careful to replace a
single load with another single load from the same cacheline.

Restore finish_lock_switch() to the exact state before the offending
patch and solve the problem differently: replace the rq->balance_flags
byte with a persistent balance_push_callback installed on
rq->balance_callback while a CPU is going down, so the existing
balance-callback path handles the hotplug push without adding work to
the context-switch fast path.

Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201210161408.GX3021@hirez.programming.kicks-ass.net
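
For reference, below is a minimal, self-contained userspace sketch of the
pattern the patch moves to. The names (struct rq, balance_push_callback,
queue_balance_callback, run_balance_callbacks) mirror the kernel ones, but
this is an illustration under simplified assumptions (no locking, no per-CPU
data), not the kernel implementation. The idea: instead of a separate
rq->balance_flags byte, a statically allocated sentinel callback is installed
on rq->balance_callback itself, so "is push mode active?" and "is there
balance work?" are both answered by the single pointer load the hot path
already performs.

/*
 * Standalone illustration only -- simplified, no locking, not kernel code.
 */
#include <stdio.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

struct rq {
        struct callback_head *balance_callback;
        int cpu;
};

/* Tentative definition so balance_push() can re-install the sentinel. */
static struct callback_head balance_push_callback;

static void balance_push(struct rq *rq)
{
        /* Stay sticky until explicitly cleared, as in the patch. */
        rq->balance_callback = &balance_push_callback;
        printf("cpu%d: push mode, migrating non-pcpu tasks away\n", rq->cpu);
}

/*
 * Persistent sentinel: its mere presence marks "push mode". The cast is the
 * same function-pointer trick the kernel uses; it relies on the ABI passing
 * the single pointer argument uniformly.
 */
static struct callback_head balance_push_callback = {
        .next = NULL,
        .func = (void (*)(struct callback_head *))balance_push,
};

/* Refuse normal balance work while the sentinel is installed. */
static void queue_balance_callback(struct rq *rq, struct callback_head *head,
                                   void (*func)(struct rq *))
{
        if (head->next || rq->balance_callback == &balance_push_callback)
                return;

        head->func = (void (*)(struct callback_head *))func;
        head->next = rq->balance_callback;
        rq->balance_callback = head;
}

/* Splice the list and run it; one pointer load decides everything. */
static void run_balance_callbacks(struct rq *rq)
{
        struct callback_head *head = rq->balance_callback;

        rq->balance_callback = NULL;
        while (head) {
                void (*func)(struct rq *) = (void (*)(struct rq *))head->func;
                struct callback_head *next = head->next;

                head->next = NULL;
                head = next;
                func(rq);
        }
}

static void normal_balance(struct rq *rq)
{
        printf("cpu%d: regular balance callback\n", rq->cpu);
}

int main(void)
{
        struct rq rq = { .balance_callback = NULL, .cpu = 1 };
        struct callback_head work = { NULL, NULL };

        queue_balance_callback(&rq, &work, normal_balance);
        run_balance_callbacks(&rq);             /* runs normal_balance() */

        rq.balance_callback = &balance_push_callback;   /* push_set(on)  */
        queue_balance_callback(&rq, &work, normal_balance);     /* refused */
        run_balance_callbacks(&rq);             /* runs balance_push()   */

        rq.balance_callback = NULL;             /* push_set(off) */
        return 0;
}

Because the sentinel's func is balance_push() cast to the callback type, the
ordinary callback runner invokes the hotplug push with no extra branches, and
balance_push() re-installs the sentinel so it stays in effect until it is
explicitly cleared (balance_push_set(.on = false) in the kernel).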

2 files changed, 20 insertions(+), 33 deletions(-)

kernel/sched/core.c | +15 -25
···
        }
 }
 
+static void balance_push(struct rq *rq);
+
+struct callback_head balance_push_callback = {
+       .next = NULL,
+       .func = (void (*)(struct callback_head *))balance_push,
+};
+
 static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
 {
        struct callback_head *head = rq->balance_callback;
 
        lockdep_assert_held(&rq->lock);
-       if (head) {
+       if (head)
                rq->balance_callback = NULL;
-               rq->balance_flags &= ~BALANCE_WORK;
-       }
 
        return head;
 }
···
        }
 }
 
-static void balance_push(struct rq *rq);
-
-static inline void balance_switch(struct rq *rq)
-{
-       if (likely(!rq->balance_flags))
-               return;
-
-       if (rq->balance_flags & BALANCE_PUSH) {
-               balance_push(rq);
-               return;
-       }
-
-       __balance_callbacks(rq);
-}
-
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
···
 }
 
 static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
-{
-}
-
-static inline void balance_switch(struct rq *rq)
 {
 }
···
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-       balance_switch(rq);
+       __balance_callbacks(rq);
        raw_spin_unlock_irq(&rq->lock);
 }
···
 
        lockdep_assert_held(&rq->lock);
        SCHED_WARN_ON(rq->cpu != smp_processor_id());
+       /*
+        * Ensure the thing is persistent until balance_push_set(.on = false);
+        */
+       rq->balance_callback = &balance_push_callback;
 
        /*
         * Both the cpu-hotplug and stop task are in this case and are
···
 
        rq_lock_irqsave(rq, &rf);
        if (on)
-               rq->balance_flags |= BALANCE_PUSH;
+               rq->balance_callback = &balance_push_callback;
        else
-               rq->balance_flags &= ~BALANCE_PUSH;
+               rq->balance_callback = NULL;
        rq_unlock_irqrestore(rq, &rf);
 }
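A note on the cast in balance_push_callback above: the balance-callback
runner (do_balance_callbacks() in kernel/sched/core.c, not visible in these
hunks) casts head->func back to a function taking struct rq * before calling
it, so once the sentinel is spliced off the list it simply invokes
balance_push(rq) through the ordinary callback path; no separate BALANCE_PUSH
test is needed, and the assignment added in balance_push() keeps the sentinel
installed for subsequent switches until balance_push_set() clears it.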
kernel/sched/sched.h | +5 -8
···
        unsigned long           cpu_capacity_orig;
 
        struct callback_head    *balance_callback;
-       unsigned char           balance_flags;
 
        unsigned char           nohz_idle_balance;
        unsigned char           idle_balance;
···
 #endif
 };
 
+extern struct callback_head balance_push_callback;
+
 /*
  * Lockdep annotation that avoids accidental unlocks; it's like a
  * sticky/continuous lockdep_assert_held().
···
 #ifdef CONFIG_SCHED_DEBUG
        rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
        rf->clock_update_flags = 0;
-#endif
 #ifdef CONFIG_SMP
-       SCHED_WARN_ON(rq->balance_callback);
+       SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+#endif
 #endif
 }
···
 
 #ifdef CONFIG_SMP
 
-#define BALANCE_WORK   0x01
-#define BALANCE_PUSH   0x02
-
 static inline void
 queue_balance_callback(struct rq *rq,
                        struct callback_head *head,
···
 {
        lockdep_assert_held(&rq->lock);
 
-       if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
+       if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
                return;
 
        head->func = (void (*)(struct callback_head *))func;
        head->next = rq->balance_callback;
        rq->balance_callback = head;
-       rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \