Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

locking/pvqspinlock: Queue node adaptive spinning

In an overcommitted guest where some vCPUs have to be halted to make
forward progress in other areas, it is highly likely that a vCPU later
in the spinlock queue will be spinning while the ones earlier in the
queue would have been halted. The spinning in the later vCPUs is then
just a waste of precious CPU cycles because they are not going to
get the lock soon as the earlier ones have to be woken up and take
their turn to get the lock.

This patch implements an adaptive spinning mechanism where the vCPU
will call pv_wait() if the previous vCPU is not running.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                      Westmere                  Haswell
  Patch           32 vCPUs  48 vCPUs      32 vCPUs  48 vCPUs
  -----           --------  --------      --------  --------
  Before patch     3m02.3s   5m00.2s       1m43.7s   3m03.5s
  After patch      3m03.0s   4m37.5s       1m43.0s   2m47.2s

For 32 vCPUs, this patch doesn't cause any noticeable change in
performance. For 48 vCPUs (over-committed), there is about 8%
performance improvement.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-8-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Waiman Long and committed by Ingo Molnar.
Commit: cd0272fa (parent: 1c4941fd)

+50 -4
+3 -2
kernel/locking/qspinlock.c
··· 248 248 */ 249 249 250 250 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } 251 - static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } 251 + static __always_inline void __pv_wait_node(struct mcs_spinlock *node, 252 + struct mcs_spinlock *prev) { } 252 253 static __always_inline void __pv_kick_node(struct qspinlock *lock, 253 254 struct mcs_spinlock *node) { } 254 255 static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, ··· 408 407 prev = decode_tail(old); 409 408 WRITE_ONCE(prev->next, node); 410 409 411 - pv_wait_node(node); 410 + pv_wait_node(node, prev); 412 411 arch_mcs_spin_lock_contended(&node->locked); 413 412 414 413 /*
+44 -2
kernel/locking/qspinlock_paravirt.h
··· 23 23 #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) 24 24 25 25 /* 26 + * Queue Node Adaptive Spinning 27 + * 28 + * A queue node vCPU will stop spinning if the vCPU in the previous node is 29 + * not running. The one lock stealing attempt allowed at slowpath entry 30 + * mitigates the slight slowdown for non-overcommitted guest with this 31 + * aggressive wait-early mechanism. 32 + * 33 + * The status of the previous node will be checked at fixed interval 34 + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't 35 + * pound on the cacheline of the previous node too heavily. 36 + */ 37 + #define PV_PREV_CHECK_MASK 0xff 38 + 39 + /* 26 40 * Queue node uses: vcpu_running & vcpu_halted. 27 41 * Queue head uses: vcpu_running & vcpu_hashed. 28 42 */ ··· 249 235 } 250 236 251 237 /* 238 + * Return true if when it is time to check the previous node which is not 239 + * in a running state. 240 + */ 241 + static inline bool 242 + pv_wait_early(struct pv_node *prev, int loop) 243 + { 244 + 245 + if ((loop & PV_PREV_CHECK_MASK) != 0) 246 + return false; 247 + 248 + return READ_ONCE(prev->state) != vcpu_running; 249 + } 250 + 251 + /* 252 252 * Initialize the PV part of the mcs_spinlock node. 253 253 */ 254 254 static void pv_init_node(struct mcs_spinlock *node) ··· 280 252 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its 281 253 * behalf. 
282 254 */ 283 - static void pv_wait_node(struct mcs_spinlock *node) 255 + static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) 284 256 { 285 257 struct pv_node *pn = (struct pv_node *)node; 258 + struct pv_node *pp = (struct pv_node *)prev; 286 259 int waitcnt = 0; 287 260 int loop; 261 + bool wait_early; 288 262 289 263 /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ 290 264 for (;; waitcnt++) { 291 - for (loop = SPIN_THRESHOLD; loop; loop--) { 265 + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { 292 266 if (READ_ONCE(node->locked)) 293 267 return; 268 + if (pv_wait_early(pp, loop)) { 269 + wait_early = true; 270 + break; 271 + } 294 272 cpu_relax(); 295 273 } 296 274 ··· 314 280 if (!READ_ONCE(node->locked)) { 315 281 qstat_inc(qstat_pv_wait_node, true); 316 282 qstat_inc(qstat_pv_wait_again, waitcnt); 283 + qstat_inc(qstat_pv_wait_early, wait_early); 317 284 pv_wait(&pn->state, vcpu_halted); 318 285 } 319 286 ··· 400 365 401 366 for (;; waitcnt++) { 402 367 /* 368 + * Set correct vCPU state to be used by queue node wait-early 369 + * mechanism. 370 + */ 371 + WRITE_ONCE(pn->state, vcpu_running); 372 + 373 + /* 403 374 * Set the pending bit in the active lock spinning loop to 404 375 * disable lock stealing before attempting to acquire the lock. 405 376 */ ··· 443 402 goto gotlock; 444 403 } 445 404 } 405 + WRITE_ONCE(pn->state, vcpu_halted); 446 406 qstat_inc(qstat_pv_wait_head, true); 447 407 qstat_inc(qstat_pv_wait_again, waitcnt); 448 408 pv_wait(&l->locked, _Q_SLOW_VAL);
+3
kernel/locking/qspinlock_stat.h
··· 25 25 * pv_lock_stealing - # of lock stealing operations 26 26 * pv_spurious_wakeup - # of spurious wakeups 27 27 * pv_wait_again - # of vCPU wait's that happened after a vCPU kick 28 + * pv_wait_early - # of early vCPU wait's 28 29 * pv_wait_head - # of vCPU wait's at the queue head 29 30 * pv_wait_node - # of vCPU wait's at a non-head queue node 30 31 * ··· 48 47 qstat_pv_lock_stealing, 49 48 qstat_pv_spurious_wakeup, 50 49 qstat_pv_wait_again, 50 + qstat_pv_wait_early, 51 51 qstat_pv_wait_head, 52 52 qstat_pv_wait_node, 53 53 qstat_num, /* Total number of statistical counters */ ··· 72 70 [qstat_pv_latency_wake] = "pv_latency_wake", 73 71 [qstat_pv_lock_stealing] = "pv_lock_stealing", 74 72 [qstat_pv_wait_again] = "pv_wait_again", 73 + [qstat_pv_wait_early] = "pv_wait_early", 75 74 [qstat_pv_wait_head] = "pv_wait_head", 76 75 [qstat_pv_wait_node] = "pv_wait_node", 77 76 [qstat_reset_cnts] = "reset_counters",