Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
"So we have a laundry list of locking subsystem changes:

- continuing barrier API and code improvements

- futex enhancements

- atomics API improvements

- pvqspinlock enhancements: in particular lock stealing and adaptive
spinning

- qspinlock micro-enhancements"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
futex: Cleanup the goto confusion in requeue_pi()
futex: Remove pointless put_pi_state calls in requeue()
futex: Document pi_state refcounting in requeue code
futex: Rename free_pi_state() to put_pi_state()
futex: Drop refcount if requeue_pi() acquired the rtmutex
locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
locking/barriers, arch: Use smp barriers in smp_store_release()
locking/cmpxchg, arch: Remove tas() definitions
locking/pvqspinlock: Queue node adaptive spinning
locking/pvqspinlock: Allow limited lock stealing
locking/pvqspinlock: Collect slowpath lock statistics
sched/core, locking: Document Program-Order guarantees
locking, sched: Introduce smp_cond_acquire() and use it
locking/pvqspinlock, x86: Optimize the PV unlock code path
locking/qspinlock: Avoid redundant read of next pointer
locking/qspinlock: Prefetch the next node cacheline
locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
atomics: Add test for atomic operations with _relaxed variants

+904 -146
+2 -2
Documentation/memory-barriers.txt
··· 1673 1673 (*) smp_store_mb(var, value) 1674 1674 1675 1675 This assigns the value to the variable and then inserts a full memory 1676 - barrier after it, depending on the function. It isn't guaranteed to 1677 - insert anything more than a compiler barrier in a UP compilation. 1676 + barrier after it. It isn't guaranteed to insert anything more than a 1677 + compiler barrier in a UP compilation. 1678 1678 1679 1679 1680 1680 (*) smp_mb__before_atomic();
-1
arch/blackfin/include/asm/cmpxchg.h
··· 128 128 #endif /* !CONFIG_SMP */ 129 129 130 130 #define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) 131 - #define tas(ptr) ((void)xchg((ptr), 1)) 132 131 133 132 #endif /* __ARCH_BLACKFIN_CMPXCHG__ */
-2
arch/c6x/include/asm/cmpxchg.h
··· 47 47 #define xchg(ptr, x) \ 48 48 ((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \ 49 49 sizeof(*(ptr)))) 50 - #define tas(ptr) xchg((ptr), 1) 51 - 52 50 53 51 #include <asm-generic/cmpxchg-local.h> 54 52
-2
arch/frv/include/asm/cmpxchg.h
··· 69 69 70 70 #endif 71 71 72 - #define tas(ptr) (xchg((ptr), 1)) 73 - 74 72 /*****************************************************************************/ 75 73 /* 76 74 * compare and conditionally exchange value with memory
+1 -1
arch/ia64/include/asm/barrier.h
··· 77 77 ___p1; \ 78 78 }) 79 79 80 - #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 80 + #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) 81 81 82 82 /* 83 83 * The group barrier in front of the rsm & ssm are necessary to ensure
+1 -1
arch/powerpc/include/asm/barrier.h
··· 34 34 #define rmb() __asm__ __volatile__ ("sync" : : : "memory") 35 35 #define wmb() __asm__ __volatile__ ("sync" : : : "memory") 36 36 37 - #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 37 + #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) 38 38 39 39 #ifdef __SUBARCH_HAS_LWSYNC 40 40 # define SMPWMB LWSYNC
+1 -1
arch/s390/include/asm/barrier.h
··· 36 36 #define smp_mb__before_atomic() smp_mb() 37 37 #define smp_mb__after_atomic() smp_mb() 38 38 39 - #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 39 + #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) 40 40 41 41 #define smp_store_release(p, v) \ 42 42 do { \
-2
arch/tile/include/asm/cmpxchg.h
··· 127 127 128 128 #endif 129 129 130 - #define tas(ptr) xchg((ptr), 1) 131 - 132 130 #endif /* __ASSEMBLY__ */ 133 131 134 132 #endif /* _ASM_TILE_CMPXCHG_H */
+8
arch/x86/Kconfig
··· 687 687 688 688 If you are unsure how to answer this question, answer Y. 689 689 690 + config QUEUED_LOCK_STAT 691 + bool "Paravirt queued spinlock statistics" 692 + depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS 693 + ---help--- 694 + Enable the collection of statistical data on the slowpath 695 + behavior of paravirtualized queued spinlocks and report 696 + them on debugfs. 697 + 690 698 source "arch/x86/xen/Kconfig" 691 699 692 700 config KVM_GUEST
+59
arch/x86/include/asm/qspinlock_paravirt.h
··· 1 1 #ifndef __ASM_QSPINLOCK_PARAVIRT_H 2 2 #define __ASM_QSPINLOCK_PARAVIRT_H 3 3 4 + /* 5 + * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit 6 + * registers. For i386, however, only 1 32-bit register needs to be saved 7 + * and restored. So an optimized version of __pv_queued_spin_unlock() is 8 + * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit. 9 + */ 10 + #ifdef CONFIG_64BIT 11 + 12 + PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); 13 + #define __pv_queued_spin_unlock __pv_queued_spin_unlock 14 + #define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" 15 + #define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" 16 + 17 + /* 18 + * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock 19 + * which combines the registers saving trunk and the body of the following 20 + * C code: 21 + * 22 + * void __pv_queued_spin_unlock(struct qspinlock *lock) 23 + * { 24 + * struct __qspinlock *l = (void *)lock; 25 + * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); 26 + * 27 + * if (likely(lockval == _Q_LOCKED_VAL)) 28 + * return; 29 + * pv_queued_spin_unlock_slowpath(lock, lockval); 30 + * } 31 + * 32 + * For x86-64, 33 + * rdi = lock (first argument) 34 + * rsi = lockval (second argument) 35 + * rdx = internal variable (set to 0) 36 + */ 37 + asm (".pushsection .text;" 38 + ".globl " PV_UNLOCK ";" 39 + ".align 4,0x90;" 40 + PV_UNLOCK ": " 41 + "push %rdx;" 42 + "mov $0x1,%eax;" 43 + "xor %edx,%edx;" 44 + "lock cmpxchg %dl,(%rdi);" 45 + "cmp $0x1,%al;" 46 + "jne .slowpath;" 47 + "pop %rdx;" 48 + "ret;" 49 + ".slowpath: " 50 + "push %rsi;" 51 + "movzbl %al,%esi;" 52 + "call " PV_UNLOCK_SLOWPATH ";" 53 + "pop %rsi;" 54 + "pop %rdx;" 55 + "ret;" 56 + ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" 57 + ".popsection"); 58 + 59 + #else /* CONFIG_64BIT */ 60 + 61 + extern void __pv_queued_spin_unlock(struct qspinlock *lock); 4 62 PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); 5 
63 64 + #endif /* CONFIG_64BIT */ 6 65 #endif
+1 -1
include/asm-generic/barrier.h
··· 93 93 #endif /* CONFIG_SMP */ 94 94 95 95 #ifndef smp_store_mb 96 - #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 96 + #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) 97 97 #endif 98 98 99 99 #ifndef smp_mb__before_atomic
+5 -4
include/asm-generic/qspinlock.h
··· 12 12 * GNU General Public License for more details. 13 13 * 14 14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 15 + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP 15 16 * 16 - * Authors: Waiman Long <waiman.long@hp.com> 17 + * Authors: Waiman Long <waiman.long@hpe.com> 17 18 */ 18 19 #ifndef __ASM_GENERIC_QSPINLOCK_H 19 20 #define __ASM_GENERIC_QSPINLOCK_H ··· 63 62 static __always_inline int queued_spin_trylock(struct qspinlock *lock) 64 63 { 65 64 if (!atomic_read(&lock->val) && 66 - (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0)) 65 + (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0)) 67 66 return 1; 68 67 return 0; 69 68 } ··· 78 77 { 79 78 u32 val; 80 79 81 - val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); 80 + val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL); 82 81 if (likely(val == 0)) 83 82 return; 84 83 queued_spin_lock_slowpath(lock, val); ··· 94 93 /* 95 94 * smp_mb__before_atomic() in order to guarantee release semantics 96 95 */ 97 - smp_mb__before_atomic_dec(); 96 + smp_mb__before_atomic(); 98 97 atomic_sub(_Q_LOCKED_VAL, &lock->val); 99 98 } 100 99 #endif
+17
include/linux/compiler.h
··· 299 299 __u.__val; \ 300 300 }) 301 301 302 + /** 303 + * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering 304 + * @cond: boolean expression to wait for 305 + * 306 + * Equivalent to using smp_load_acquire() on the condition variable but employs 307 + * the control dependency of the wait to reduce the barrier on many platforms. 308 + * 309 + * The control dependency provides a LOAD->STORE order, the additional RMB 310 + * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, 311 + * aka. ACQUIRE. 312 + */ 313 + #define smp_cond_acquire(cond) do { \ 314 + while (!(cond)) \ 315 + cpu_relax(); \ 316 + smp_rmb(); /* ctrl + rmb := acquire */ \ 317 + } while (0) 318 + 302 319 #endif /* __KERNEL__ */ 303 320 304 321 #endif /* __ASSEMBLY__ */
+61 -22
kernel/futex.c
··· 725 725 } 726 726 727 727 /* 728 + * Drops a reference to the pi_state object and frees or caches it 729 + * when the last reference is gone. 730 + * 728 731 * Must be called with the hb lock held. 729 732 */ 730 - static void free_pi_state(struct futex_pi_state *pi_state) 733 + static void put_pi_state(struct futex_pi_state *pi_state) 731 734 { 732 735 if (!pi_state) 733 736 return; ··· 1709 1706 * exist yet, look it up one more time to ensure we have a 1710 1707 * reference to it. If the lock was taken, ret contains the 1711 1708 * vpid of the top waiter task. 1709 + * If the lock was not taken, we have pi_state and an initial 1710 + * refcount on it. In case of an error we have nothing. 1712 1711 */ 1713 1712 if (ret > 0) { 1714 1713 WARN_ON(pi_state); 1715 1714 drop_count++; 1716 1715 task_count++; 1717 1716 /* 1718 - * If we acquired the lock, then the user 1719 - * space value of uaddr2 should be vpid. It 1720 - * cannot be changed by the top waiter as it 1721 - * is blocked on hb2 lock if it tries to do 1722 - * so. If something fiddled with it behind our 1723 - * back the pi state lookup might unearth 1724 - * it. So we rather use the known value than 1725 - * rereading and handing potential crap to 1726 - * lookup_pi_state. 1717 + * If we acquired the lock, then the user space value 1718 + * of uaddr2 should be vpid. It cannot be changed by 1719 + * the top waiter as it is blocked on hb2 lock if it 1720 + * tries to do so. If something fiddled with it behind 1721 + * our back the pi state lookup might unearth it. So 1722 + * we rather use the known value than rereading and 1723 + * handing potential crap to lookup_pi_state. 1724 + * 1725 + * If that call succeeds then we have pi_state and an 1726 + * initial refcount on it. 1727 1727 */ 1728 1728 ret = lookup_pi_state(ret, hb2, &key2, &pi_state); 1729 1729 } 1730 1730 1731 1731 switch (ret) { 1732 1732 case 0: 1733 + /* We hold a reference on the pi state. 
*/ 1733 1734 break; 1735 + 1736 + /* If the above failed, then pi_state is NULL */ 1734 1737 case -EFAULT: 1735 - free_pi_state(pi_state); 1736 - pi_state = NULL; 1737 1738 double_unlock_hb(hb1, hb2); 1738 1739 hb_waiters_dec(hb2); 1739 1740 put_futex_key(&key2); ··· 1753 1746 * exit to complete. 1754 1747 * - The user space value changed. 1755 1748 */ 1756 - free_pi_state(pi_state); 1757 - pi_state = NULL; 1758 1749 double_unlock_hb(hb1, hb2); 1759 1750 hb_waiters_dec(hb2); 1760 1751 put_futex_key(&key2); ··· 1806 1801 * of requeue_pi if we couldn't acquire the lock atomically. 1807 1802 */ 1808 1803 if (requeue_pi) { 1809 - /* Prepare the waiter to take the rt_mutex. */ 1804 + /* 1805 + * Prepare the waiter to take the rt_mutex. Take a 1806 + * refcount on the pi_state and store the pointer in 1807 + * the futex_q object of the waiter. 1808 + */ 1810 1809 atomic_inc(&pi_state->refcount); 1811 1810 this->pi_state = pi_state; 1812 1811 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1813 1812 this->rt_waiter, 1814 1813 this->task); 1815 1814 if (ret == 1) { 1816 - /* We got the lock. */ 1815 + /* 1816 + * We got the lock. We do neither drop the 1817 + * refcount on pi_state nor clear 1818 + * this->pi_state because the waiter needs the 1819 + * pi_state for cleaning up the user space 1820 + * value. It will drop the refcount after 1821 + * doing so. 1822 + */ 1817 1823 requeue_pi_wake_futex(this, &key2, hb2); 1818 1824 drop_count++; 1819 1825 continue; 1820 1826 } else if (ret) { 1821 - /* -EDEADLK */ 1827 + /* 1828 + * rt_mutex_start_proxy_lock() detected a 1829 + * potential deadlock when we tried to queue 1830 + * that waiter. Drop the pi_state reference 1831 + * which we took above and remove the pointer 1832 + * to the state from the waiters futex_q 1833 + * object. 
1834 + */ 1822 1835 this->pi_state = NULL; 1823 - free_pi_state(pi_state); 1824 - goto out_unlock; 1836 + put_pi_state(pi_state); 1837 + /* 1838 + * We stop queueing more waiters and let user 1839 + * space deal with the mess. 1840 + */ 1841 + break; 1825 1842 } 1826 1843 } 1827 1844 requeue_futex(this, hb1, hb2, &key2); 1828 1845 drop_count++; 1829 1846 } 1830 1847 1848 + /* 1849 + * We took an extra initial reference to the pi_state either 1850 + * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We 1851 + * need to drop it here again. 1852 + */ 1853 + put_pi_state(pi_state); 1854 + 1831 1855 out_unlock: 1832 - free_pi_state(pi_state); 1833 1856 double_unlock_hb(hb1, hb2); 1834 1857 wake_up_q(&wake_q); 1835 1858 hb_waiters_dec(hb2); ··· 2006 1973 __unqueue_futex(q); 2007 1974 2008 1975 BUG_ON(!q->pi_state); 2009 - free_pi_state(q->pi_state); 1976 + put_pi_state(q->pi_state); 2010 1977 q->pi_state = NULL; 2011 1978 2012 1979 spin_unlock(q->lock_ptr); ··· 2788 2755 if (q.pi_state && (q.pi_state->owner != current)) { 2789 2756 spin_lock(q.lock_ptr); 2790 2757 ret = fixup_pi_state_owner(uaddr2, &q, current); 2758 + /* 2759 + * Drop the reference to the pi state which 2760 + * the requeue_pi() code acquired for us. 2761 + */ 2762 + put_pi_state(q.pi_state); 2791 2763 spin_unlock(q.lock_ptr); 2792 2764 } 2793 2765 } else { ··· 3084 3046 3085 3047 if (op & FUTEX_CLOCK_REALTIME) { 3086 3048 flags |= FLAGS_CLOCKRT; 3087 - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 3049 + if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ 3050 + cmd != FUTEX_WAIT_REQUEUE_PI) 3088 3051 return -ENOSYS; 3089 3052 } 3090 3053
+64 -18
kernel/locking/qspinlock.c
··· 14 14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 15 15 * (C) Copyright 2013-2014 Red Hat, Inc. 16 16 * (C) Copyright 2015 Intel Corp. 17 + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP 17 18 * 18 - * Authors: Waiman Long <waiman.long@hp.com> 19 + * Authors: Waiman Long <waiman.long@hpe.com> 19 20 * Peter Zijlstra <peterz@infradead.org> 20 21 */ 21 22 ··· 177 176 { 178 177 struct __qspinlock *l = (void *)lock; 179 178 180 - return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 179 + /* 180 + * Use release semantics to make sure that the MCS node is properly 181 + * initialized before changing the tail code. 182 + */ 183 + return (u32)xchg_release(&l->tail, 184 + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 181 185 } 182 186 183 187 #else /* _Q_PENDING_BITS == 8 */ ··· 214 208 215 209 for (;;) { 216 210 new = (val & _Q_LOCKED_PENDING_MASK) | tail; 217 - old = atomic_cmpxchg(&lock->val, val, new); 211 + /* 212 + * Use release semantics to make sure that the MCS node is 213 + * properly initialized before changing the tail code. 
214 + */ 215 + old = atomic_cmpxchg_release(&lock->val, val, new); 218 216 if (old == val) 219 217 break; 220 218 ··· 248 238 */ 249 239 250 240 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } 251 - static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } 241 + static __always_inline void __pv_wait_node(struct mcs_spinlock *node, 242 + struct mcs_spinlock *prev) { } 252 243 static __always_inline void __pv_kick_node(struct qspinlock *lock, 253 244 struct mcs_spinlock *node) { } 254 - static __always_inline void __pv_wait_head(struct qspinlock *lock, 255 - struct mcs_spinlock *node) { } 245 + static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, 246 + struct mcs_spinlock *node) 247 + { return 0; } 256 248 257 249 #define pv_enabled() false 258 250 259 251 #define pv_init_node __pv_init_node 260 252 #define pv_wait_node __pv_wait_node 261 253 #define pv_kick_node __pv_kick_node 262 - #define pv_wait_head __pv_wait_head 254 + #define pv_wait_head_or_lock __pv_wait_head_or_lock 263 255 264 256 #ifdef CONFIG_PARAVIRT_SPINLOCKS 265 257 #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath ··· 331 319 if (val == new) 332 320 new |= _Q_PENDING_VAL; 333 321 334 - old = atomic_cmpxchg(&lock->val, val, new); 322 + /* 323 + * Acquire semantic is required here as the function may 324 + * return immediately if the lock was free. 
325 + */ 326 + old = atomic_cmpxchg_acquire(&lock->val, val, new); 335 327 if (old == val) 336 328 break; 337 329 ··· 398 382 * p,*,* -> n,*,* 399 383 */ 400 384 old = xchg_tail(lock, tail); 385 + next = NULL; 401 386 402 387 /* 403 388 * if there was a previous node; link it and wait until reaching the ··· 408 391 prev = decode_tail(old); 409 392 WRITE_ONCE(prev->next, node); 410 393 411 - pv_wait_node(node); 394 + pv_wait_node(node, prev); 412 395 arch_mcs_spin_lock_contended(&node->locked); 396 + 397 + /* 398 + * While waiting for the MCS lock, the next pointer may have 399 + * been set by another lock waiter. We optimistically load 400 + * the next pointer & prefetch the cacheline for writing 401 + * to reduce latency in the upcoming MCS unlock operation. 402 + */ 403 + next = READ_ONCE(node->next); 404 + if (next) 405 + prefetchw(next); 413 406 } 414 407 415 408 /* ··· 433 406 * sequentiality; this is because the set_locked() function below 434 407 * does not imply a full barrier. 435 408 * 409 + * The PV pv_wait_head_or_lock function, if active, will acquire 410 + * the lock and return a non-zero value. So we have to skip the 411 + * smp_load_acquire() call. As the next PV queue head hasn't been 412 + * designated yet, there is no way for the locked value to become 413 + * _Q_SLOW_VAL. So both the set_locked() and the 414 + * atomic_cmpxchg_relaxed() calls will be safe. 415 + * 416 + * If PV isn't active, 0 will be returned instead. 417 + * 436 418 */ 437 - pv_wait_head(lock, node); 438 - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) 439 - cpu_relax(); 419 + if ((val = pv_wait_head_or_lock(lock, node))) 420 + goto locked; 440 421 422 + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); 423 + 424 + locked: 441 425 /* 442 426 * claim the lock: 443 427 * ··· 460 422 * to grab the lock. 
461 423 */ 462 424 for (;;) { 463 - if (val != tail) { 425 + /* In the PV case we might already have _Q_LOCKED_VAL set */ 426 + if ((val & _Q_TAIL_MASK) != tail) { 464 427 set_locked(lock); 465 428 break; 466 429 } 467 - old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); 430 + /* 431 + * The smp_load_acquire() call above has provided the necessary 432 + * acquire semantics required for locking. At most two 433 + * iterations of this loop may be ran. 434 + */ 435 + old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); 468 436 if (old == val) 469 437 goto release; /* No contention */ 470 438 ··· 478 434 } 479 435 480 436 /* 481 - * contended path; wait for next, release. 437 + * contended path; wait for next if not observed yet, release. 482 438 */ 483 - while (!(next = READ_ONCE(node->next))) 484 - cpu_relax(); 439 + if (!next) { 440 + while (!(next = READ_ONCE(node->next))) 441 + cpu_relax(); 442 + } 485 443 486 444 arch_mcs_spin_unlock_contended(&next->locked); 487 445 pv_kick_node(lock, next); ··· 508 462 #undef pv_init_node 509 463 #undef pv_wait_node 510 464 #undef pv_kick_node 511 - #undef pv_wait_head 465 + #undef pv_wait_head_or_lock 512 466 513 467 #undef queued_spin_lock_slowpath 514 468 #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
+212 -40
kernel/locking/qspinlock_paravirt.h
··· 23 23 #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) 24 24 25 25 /* 26 + * Queue Node Adaptive Spinning 27 + * 28 + * A queue node vCPU will stop spinning if the vCPU in the previous node is 29 + * not running. The one lock stealing attempt allowed at slowpath entry 30 + * mitigates the slight slowdown for non-overcommitted guest with this 31 + * aggressive wait-early mechanism. 32 + * 33 + * The status of the previous node will be checked at fixed interval 34 + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't 35 + * pound on the cacheline of the previous node too heavily. 36 + */ 37 + #define PV_PREV_CHECK_MASK 0xff 38 + 39 + /* 26 40 * Queue node uses: vcpu_running & vcpu_halted. 27 41 * Queue head uses: vcpu_running & vcpu_hashed. 28 42 */ ··· 53 39 int cpu; 54 40 u8 state; 55 41 }; 42 + 43 + /* 44 + * By replacing the regular queued_spin_trylock() with the function below, 45 + * it will be called once when a lock waiter enter the PV slowpath before 46 + * being queued. By allowing one lock stealing attempt here when the pending 47 + * bit is off, it helps to reduce the performance impact of lock waiter 48 + * preemption without the drawback of lock starvation. 49 + */ 50 + #define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) 51 + static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) 52 + { 53 + struct __qspinlock *l = (void *)lock; 54 + 55 + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && 56 + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); 57 + } 58 + 59 + /* 60 + * The pending bit is used by the queue head vCPU to indicate that it 61 + * is actively spinning on the lock and no lock stealing is allowed. 
62 + */ 63 + #if _Q_PENDING_BITS == 8 64 + static __always_inline void set_pending(struct qspinlock *lock) 65 + { 66 + struct __qspinlock *l = (void *)lock; 67 + 68 + WRITE_ONCE(l->pending, 1); 69 + } 70 + 71 + static __always_inline void clear_pending(struct qspinlock *lock) 72 + { 73 + struct __qspinlock *l = (void *)lock; 74 + 75 + WRITE_ONCE(l->pending, 0); 76 + } 77 + 78 + /* 79 + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory 80 + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock 81 + * just to be sure that it will get it. 82 + */ 83 + static __always_inline int trylock_clear_pending(struct qspinlock *lock) 84 + { 85 + struct __qspinlock *l = (void *)lock; 86 + 87 + return !READ_ONCE(l->locked) && 88 + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) 89 + == _Q_PENDING_VAL); 90 + } 91 + #else /* _Q_PENDING_BITS == 8 */ 92 + static __always_inline void set_pending(struct qspinlock *lock) 93 + { 94 + atomic_set_mask(_Q_PENDING_VAL, &lock->val); 95 + } 96 + 97 + static __always_inline void clear_pending(struct qspinlock *lock) 98 + { 99 + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); 100 + } 101 + 102 + static __always_inline int trylock_clear_pending(struct qspinlock *lock) 103 + { 104 + int val = atomic_read(&lock->val); 105 + 106 + for (;;) { 107 + int old, new; 108 + 109 + if (val & _Q_LOCKED_MASK) 110 + break; 111 + 112 + /* 113 + * Try to clear pending bit & set locked bit 114 + */ 115 + old = val; 116 + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; 117 + val = atomic_cmpxchg(&lock->val, old, new); 118 + 119 + if (val == old) 120 + return 1; 121 + } 122 + return 0; 123 + } 124 + #endif /* _Q_PENDING_BITS == 8 */ 125 + 126 + /* 127 + * Include queued spinlock statistics code 128 + */ 129 + #include "qspinlock_stat.h" 56 130 57 131 /* 58 132 * Lock and MCS node addresses hash table for fast lookup ··· 202 100 { 203 101 unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); 204 102 struct 
pv_hash_entry *he; 103 + int hopcnt = 0; 205 104 206 105 for_each_hash_entry(he, offset, hash) { 106 + hopcnt++; 207 107 if (!cmpxchg(&he->lock, NULL, lock)) { 208 108 WRITE_ONCE(he->node, node); 109 + qstat_hop(hopcnt); 209 110 return &he->lock; 210 111 } 211 112 } ··· 249 144 } 250 145 251 146 /* 147 + * Return true if when it is time to check the previous node which is not 148 + * in a running state. 149 + */ 150 + static inline bool 151 + pv_wait_early(struct pv_node *prev, int loop) 152 + { 153 + 154 + if ((loop & PV_PREV_CHECK_MASK) != 0) 155 + return false; 156 + 157 + return READ_ONCE(prev->state) != vcpu_running; 158 + } 159 + 160 + /* 252 161 * Initialize the PV part of the mcs_spinlock node. 253 162 */ 254 163 static void pv_init_node(struct mcs_spinlock *node) ··· 280 161 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its 281 162 * behalf. 282 163 */ 283 - static void pv_wait_node(struct mcs_spinlock *node) 164 + static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) 284 165 { 285 166 struct pv_node *pn = (struct pv_node *)node; 167 + struct pv_node *pp = (struct pv_node *)prev; 168 + int waitcnt = 0; 286 169 int loop; 170 + bool wait_early; 287 171 288 - for (;;) { 289 - for (loop = SPIN_THRESHOLD; loop; loop--) { 172 + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ 173 + for (;; waitcnt++) { 174 + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { 290 175 if (READ_ONCE(node->locked)) 291 176 return; 177 + if (pv_wait_early(pp, loop)) { 178 + wait_early = true; 179 + break; 180 + } 292 181 cpu_relax(); 293 182 } 294 183 ··· 311 184 */ 312 185 smp_store_mb(pn->state, vcpu_halted); 313 186 314 - if (!READ_ONCE(node->locked)) 187 + if (!READ_ONCE(node->locked)) { 188 + qstat_inc(qstat_pv_wait_node, true); 189 + qstat_inc(qstat_pv_wait_again, waitcnt); 190 + qstat_inc(qstat_pv_wait_early, wait_early); 315 191 pv_wait(&pn->state, vcpu_halted); 192 + } 316 193 317 194 /* 318 - * 
If pv_kick_node() changed us to vcpu_hashed, retain that value 319 - * so that pv_wait_head() knows to not also try to hash this lock. 195 + * If pv_kick_node() changed us to vcpu_hashed, retain that 196 + * value so that pv_wait_head_or_lock() knows to not also try 197 + * to hash this lock. 320 198 */ 321 199 cmpxchg(&pn->state, vcpu_halted, vcpu_running); 322 200 ··· 332 200 * So it is better to spin for a while in the hope that the 333 201 * MCS lock will be released soon. 334 202 */ 203 + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); 335 204 } 336 205 337 206 /* ··· 345 212 /* 346 213 * Called after setting next->locked = 1 when we're the lock owner. 347 214 * 348 - * Instead of waking the waiters stuck in pv_wait_node() advance their state such 349 - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. 215 + * Instead of waking the waiters stuck in pv_wait_node() advance their state 216 + * such that they're waiting in pv_wait_head_or_lock(), this avoids a 217 + * wake/sleep cycle. 350 218 */ 351 219 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) 352 220 { ··· 376 242 } 377 243 378 244 /* 379 - * Wait for l->locked to become clear; halt the vcpu after a short spin. 245 + * Wait for l->locked to become clear and acquire the lock; 246 + * halt the vcpu after a short spin. 380 247 * __pv_queued_spin_unlock() will wake us. 248 + * 249 + * The current value of the lock will be returned for additional processing. 
381 250 */ 382 - static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) 251 + static u32 252 + pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) 383 253 { 384 254 struct pv_node *pn = (struct pv_node *)node; 385 255 struct __qspinlock *l = (void *)lock; 386 256 struct qspinlock **lp = NULL; 257 + int waitcnt = 0; 387 258 int loop; 388 259 389 260 /* ··· 398 259 if (READ_ONCE(pn->state) == vcpu_hashed) 399 260 lp = (struct qspinlock **)1; 400 261 401 - for (;;) { 262 + for (;; waitcnt++) { 263 + /* 264 + * Set correct vCPU state to be used by queue node wait-early 265 + * mechanism. 266 + */ 267 + WRITE_ONCE(pn->state, vcpu_running); 268 + 269 + /* 270 + * Set the pending bit in the active lock spinning loop to 271 + * disable lock stealing before attempting to acquire the lock. 272 + */ 273 + set_pending(lock); 402 274 for (loop = SPIN_THRESHOLD; loop; loop--) { 403 - if (!READ_ONCE(l->locked)) 404 - return; 275 + if (trylock_clear_pending(lock)) 276 + goto gotlock; 405 277 cpu_relax(); 406 278 } 279 + clear_pending(lock); 280 + 407 281 408 282 if (!lp) { /* ONCE */ 409 283 lp = pv_hash(lock, pn); ··· 432 280 * 433 281 * Matches the smp_rmb() in __pv_queued_spin_unlock(). 434 282 */ 435 - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { 283 + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { 436 284 /* 437 - * The lock is free and _Q_SLOW_VAL has never 438 - * been set. Therefore we need to unhash before 439 - * getting the lock. 285 + * The lock was free and now we own the lock. 286 + * Change the lock value back to _Q_LOCKED_VAL 287 + * and unhash the table. 
440 288 */ 289 + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); 441 290 WRITE_ONCE(*lp, NULL); 442 - return; 291 + goto gotlock; 443 292 } 444 293 } 294 + WRITE_ONCE(pn->state, vcpu_halted); 295 + qstat_inc(qstat_pv_wait_head, true); 296 + qstat_inc(qstat_pv_wait_again, waitcnt); 445 297 pv_wait(&l->locked, _Q_SLOW_VAL); 446 298 447 299 /* 448 300 * The unlocker should have freed the lock before kicking the 449 301 * CPU. So if the lock is still not free, it is a spurious 450 - * wakeup and so the vCPU should wait again after spinning for 451 - * a while. 302 + * wakeup or another vCPU has stolen the lock. The current 303 + * vCPU should spin again. 452 304 */ 305 + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); 453 306 } 454 307 455 308 /* 456 - * Lock is unlocked now; the caller will acquire it without waiting. 457 - * As with pv_wait_node() we rely on the caller to do a load-acquire 458 - * for us. 309 + * The cmpxchg() or xchg() call before coming here provides the 310 + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL 311 + * here is to indicate to the compiler that the value will always 312 + * be nozero to enable better code optimization. 459 313 */ 314 + gotlock: 315 + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); 460 316 } 461 317 462 318 /* 463 - * PV version of the unlock function to be used in stead of 464 - * queued_spin_unlock(). 319 + * PV versions of the unlock fastpath and slowpath functions to be used 320 + * instead of queued_spin_unlock(). 465 321 */ 466 - __visible void __pv_queued_spin_unlock(struct qspinlock *lock) 322 + __visible void 323 + __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) 467 324 { 468 325 struct __qspinlock *l = (void *)lock; 469 326 struct pv_node *node; 470 - u8 locked; 471 - 472 - /* 473 - * We must not unlock if SLOW, because in that case we must first 474 - * unhash. Otherwise it would be possible to have multiple @lock 475 - * entries, which would be BAD. 
476 - */ 477 - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); 478 - if (likely(locked == _Q_LOCKED_VAL)) 479 - return; 480 327 481 328 if (unlikely(locked != _Q_SLOW_VAL)) { 482 329 WARN(!debug_locks_silent, ··· 489 338 * so we need a barrier to order the read of the node data in 490 339 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. 491 340 * 492 - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. 341 + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. 493 342 */ 494 343 smp_rmb(); 495 344 ··· 512 361 * vCPU is harmless other than the additional latency in completing 513 362 * the unlock. 514 363 */ 364 + qstat_inc(qstat_pv_kick_unlock, true); 515 365 pv_kick(node->cpu); 516 366 } 367 + 517 368 /* 518 369 * Include the architecture specific callee-save thunk of the 519 370 * __pv_queued_spin_unlock(). This thunk is put together with 520 - * __pv_queued_spin_unlock() near the top of the file to make sure 521 - * that the callee-save thunk and the real unlock function are close 522 - * to each other sharing consecutive instruction cachelines. 371 + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock 372 + * function close to each other sharing consecutive instruction cachelines. 373 + * Alternatively, architecture specific version of __pv_queued_spin_unlock() 374 + * can be defined. 523 375 */ 524 376 #include <asm/qspinlock_paravirt.h> 525 377 378 + #ifndef __pv_queued_spin_unlock 379 + __visible void __pv_queued_spin_unlock(struct qspinlock *lock) 380 + { 381 + struct __qspinlock *l = (void *)lock; 382 + u8 locked; 383 + 384 + /* 385 + * We must not unlock if SLOW, because in that case we must first 386 + * unhash. Otherwise it would be possible to have multiple @lock 387 + * entries, which would be BAD. 
388 + */ 389 + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); 390 + if (likely(locked == _Q_LOCKED_VAL)) 391 + return; 392 + 393 + __pv_queued_spin_unlock_slowpath(lock, locked); 394 + } 395 + #endif /* __pv_queued_spin_unlock */
+300
kernel/locking/qspinlock_stat.h
··· 1 + /* 2 + * This program is free software; you can redistribute it and/or modify 3 + * it under the terms of the GNU General Public License as published by 4 + * the Free Software Foundation; either version 2 of the License, or 5 + * (at your option) any later version. 6 + * 7 + * This program is distributed in the hope that it will be useful, 8 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 + * GNU General Public License for more details. 11 + * 12 + * Authors: Waiman Long <waiman.long@hpe.com> 13 + */ 14 + 15 + /* 16 + * When queued spinlock statistical counters are enabled, the following 17 + * debugfs files will be created for reporting the counter values: 18 + * 19 + * <debugfs>/qlockstat/ 20 + * pv_hash_hops - average # of hops per hashing operation 21 + * pv_kick_unlock - # of vCPU kicks issued at unlock time 22 + * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake 23 + * pv_latency_kick - average latency (ns) of vCPU kick operation 24 + * pv_latency_wake - average latency (ns) from vCPU kick to wakeup 25 + * pv_lock_stealing - # of lock stealing operations 26 + * pv_spurious_wakeup - # of spurious wakeups 27 + * pv_wait_again - # of vCPU wait's that happened after a vCPU kick 28 + * pv_wait_early - # of early vCPU wait's 29 + * pv_wait_head - # of vCPU wait's at the queue head 30 + * pv_wait_node - # of vCPU wait's at a non-head queue node 31 + * 32 + * Writing to the "reset_counters" file will reset all the above counter 33 + * values. 34 + * 35 + * These statistical counters are implemented as per-cpu variables which are 36 + * summed and computed whenever the corresponding debugfs files are read. This 37 + * minimizes added overhead making the counters usable even in a production 38 + * environment. 39 + * 40 + * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
41 + */ 42 + enum qlock_stats { 43 + qstat_pv_hash_hops, 44 + qstat_pv_kick_unlock, 45 + qstat_pv_kick_wake, 46 + qstat_pv_latency_kick, 47 + qstat_pv_latency_wake, 48 + qstat_pv_lock_stealing, 49 + qstat_pv_spurious_wakeup, 50 + qstat_pv_wait_again, 51 + qstat_pv_wait_early, 52 + qstat_pv_wait_head, 53 + qstat_pv_wait_node, 54 + qstat_num, /* Total number of statistical counters */ 55 + qstat_reset_cnts = qstat_num, 56 + }; 57 + 58 + #ifdef CONFIG_QUEUED_LOCK_STAT 59 + /* 60 + * Collect pvqspinlock statistics 61 + */ 62 + #include <linux/debugfs.h> 63 + #include <linux/sched.h> 64 + #include <linux/fs.h> 65 + 66 + static const char * const qstat_names[qstat_num + 1] = { 67 + [qstat_pv_hash_hops] = "pv_hash_hops", 68 + [qstat_pv_kick_unlock] = "pv_kick_unlock", 69 + [qstat_pv_kick_wake] = "pv_kick_wake", 70 + [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", 71 + [qstat_pv_latency_kick] = "pv_latency_kick", 72 + [qstat_pv_latency_wake] = "pv_latency_wake", 73 + [qstat_pv_lock_stealing] = "pv_lock_stealing", 74 + [qstat_pv_wait_again] = "pv_wait_again", 75 + [qstat_pv_wait_early] = "pv_wait_early", 76 + [qstat_pv_wait_head] = "pv_wait_head", 77 + [qstat_pv_wait_node] = "pv_wait_node", 78 + [qstat_reset_cnts] = "reset_counters", 79 + }; 80 + 81 + /* 82 + * Per-cpu counters 83 + */ 84 + static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); 85 + static DEFINE_PER_CPU(u64, pv_kick_time); 86 + 87 + /* 88 + * Function to read and return the qlock statistical counter values 89 + * 90 + * The following counters are handled specially: 91 + * 1. qstat_pv_latency_kick 92 + * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock 93 + * 2. qstat_pv_latency_wake 94 + * Average wake latency (ns) = pv_latency_wake/pv_kick_wake 95 + * 3. 
qstat_pv_hash_hops 96 + * Average hops/hash = pv_hash_hops/pv_kick_unlock 97 + */ 98 + static ssize_t qstat_read(struct file *file, char __user *user_buf, 99 + size_t count, loff_t *ppos) 100 + { 101 + char buf[64]; 102 + int cpu, counter, len; 103 + u64 stat = 0, kicks = 0; 104 + 105 + /* 106 + * Get the counter ID stored in file->f_inode->i_private 107 + */ 108 + if (!file->f_inode) { 109 + WARN_ON_ONCE(1); 110 + return -EBADF; 111 + } 112 + counter = (long)(file->f_inode->i_private); 113 + 114 + if (counter >= qstat_num) 115 + return -EBADF; 116 + 117 + for_each_possible_cpu(cpu) { 118 + stat += per_cpu(qstats[counter], cpu); 119 + /* 120 + * Need to sum additional counter for some of them 121 + */ 122 + switch (counter) { 123 + 124 + case qstat_pv_latency_kick: 125 + case qstat_pv_hash_hops: 126 + kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); 127 + break; 128 + 129 + case qstat_pv_latency_wake: 130 + kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); 131 + break; 132 + } 133 + } 134 + 135 + if (counter == qstat_pv_hash_hops) { 136 + u64 frac; 137 + 138 + frac = 100ULL * do_div(stat, kicks); 139 + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); 140 + 141 + /* 142 + * Return a X.XX decimal number 143 + */ 144 + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); 145 + } else { 146 + /* 147 + * Round to the nearest ns 148 + */ 149 + if ((counter == qstat_pv_latency_kick) || 150 + (counter == qstat_pv_latency_wake)) { 151 + stat = 0; 152 + if (kicks) 153 + stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); 154 + } 155 + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); 156 + } 157 + 158 + return simple_read_from_buffer(user_buf, count, ppos, buf, len); 159 + } 160 + 161 + /* 162 + * Function to handle write request 163 + * 164 + * When counter = reset_cnts, reset all the counter values. 165 + * Since the counter updates aren't atomic, the resetting is done twice 166 + * to make sure that the counters are very likely to be all cleared. 
167 + */ 168 + static ssize_t qstat_write(struct file *file, const char __user *user_buf, 169 + size_t count, loff_t *ppos) 170 + { 171 + int cpu; 172 + 173 + /* 174 + * Get the counter ID stored in file->f_inode->i_private 175 + */ 176 + if (!file->f_inode) { 177 + WARN_ON_ONCE(1); 178 + return -EBADF; 179 + } 180 + if ((long)(file->f_inode->i_private) != qstat_reset_cnts) 181 + return count; 182 + 183 + for_each_possible_cpu(cpu) { 184 + int i; 185 + unsigned long *ptr = per_cpu_ptr(qstats, cpu); 186 + 187 + for (i = 0 ; i < qstat_num; i++) 188 + WRITE_ONCE(ptr[i], 0); 189 + for (i = 0 ; i < qstat_num; i++) 190 + WRITE_ONCE(ptr[i], 0); 191 + } 192 + return count; 193 + } 194 + 195 + /* 196 + * Debugfs data structures 197 + */ 198 + static const struct file_operations fops_qstat = { 199 + .read = qstat_read, 200 + .write = qstat_write, 201 + .llseek = default_llseek, 202 + }; 203 + 204 + /* 205 + * Initialize debugfs for the qspinlock statistical counters 206 + */ 207 + static int __init init_qspinlock_stat(void) 208 + { 209 + struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); 210 + int i; 211 + 212 + if (!d_qstat) { 213 + pr_warn("Could not create 'qlockstat' debugfs directory\n"); 214 + return 0; 215 + } 216 + 217 + /* 218 + * Create the debugfs files 219 + * 220 + * As reading from and writing to the stat files can be slow, only 221 + * root is allowed to do the read/write to limit impact to system 222 + * performance. 
223 + */ 224 + for (i = 0; i < qstat_num; i++) 225 + debugfs_create_file(qstat_names[i], 0400, d_qstat, 226 + (void *)(long)i, &fops_qstat); 227 + 228 + debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, 229 + (void *)(long)qstat_reset_cnts, &fops_qstat); 230 + return 0; 231 + } 232 + fs_initcall(init_qspinlock_stat); 233 + 234 + /* 235 + * Increment the PV qspinlock statistical counters 236 + */ 237 + static inline void qstat_inc(enum qlock_stats stat, bool cond) 238 + { 239 + if (cond) 240 + this_cpu_inc(qstats[stat]); 241 + } 242 + 243 + /* 244 + * PV hash hop count 245 + */ 246 + static inline void qstat_hop(int hopcnt) 247 + { 248 + this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); 249 + } 250 + 251 + /* 252 + * Replacement function for pv_kick() 253 + */ 254 + static inline void __pv_kick(int cpu) 255 + { 256 + u64 start = sched_clock(); 257 + 258 + per_cpu(pv_kick_time, cpu) = start; 259 + pv_kick(cpu); 260 + this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); 261 + } 262 + 263 + /* 264 + * Replacement function for pv_wait() 265 + */ 266 + static inline void __pv_wait(u8 *ptr, u8 val) 267 + { 268 + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); 269 + 270 + *pkick_time = 0; 271 + pv_wait(ptr, val); 272 + if (*pkick_time) { 273 + this_cpu_add(qstats[qstat_pv_latency_wake], 274 + sched_clock() - *pkick_time); 275 + qstat_inc(qstat_pv_kick_wake, true); 276 + } 277 + } 278 + 279 + #define pv_kick(c) __pv_kick(c) 280 + #define pv_wait(p, v) __pv_wait(p, v) 281 + 282 + /* 283 + * PV unfair trylock count tracking function 284 + */ 285 + static inline int qstat_spin_steal_lock(struct qspinlock *lock) 286 + { 287 + int ret = pv_queued_spin_steal_lock(lock); 288 + 289 + qstat_inc(qstat_pv_lock_stealing, ret); 290 + return ret; 291 + } 292 + #undef queued_spin_trylock 293 + #define queued_spin_trylock(l) qstat_spin_steal_lock(l) 294 + 295 + #else /* CONFIG_QUEUED_LOCK_STAT */ 296 + 297 + static inline void qstat_inc(enum qlock_stats stat, 
bool cond) { } 298 + static inline void qstat_hop(int hopcnt) { } 299 + 300 + #endif /* CONFIG_QUEUED_LOCK_STAT */
+92 -7
kernel/sched/core.c
··· 1905 1905 raw_spin_unlock(&rq->lock); 1906 1906 } 1907 1907 1908 + /* 1909 + * Notes on Program-Order guarantees on SMP systems. 1910 + * 1911 + * MIGRATION 1912 + * 1913 + * The basic program-order guarantee on SMP systems is that when a task [t] 1914 + * migrates, all its activity on its old cpu [c0] happens-before any subsequent 1915 + * execution on its new cpu [c1]. 1916 + * 1917 + * For migration (of runnable tasks) this is provided by the following means: 1918 + * 1919 + * A) UNLOCK of the rq(c0)->lock scheduling out task t 1920 + * B) migration for t is required to synchronize *both* rq(c0)->lock and 1921 + * rq(c1)->lock (if not at the same time, then in that order). 1922 + * C) LOCK of the rq(c1)->lock scheduling in task 1923 + * 1924 + * Transitivity guarantees that B happens after A and C after B. 1925 + * Note: we only require RCpc transitivity. 1926 + * Note: the cpu doing B need not be c0 or c1 1927 + * 1928 + * Example: 1929 + * 1930 + * CPU0 CPU1 CPU2 1931 + * 1932 + * LOCK rq(0)->lock 1933 + * sched-out X 1934 + * sched-in Y 1935 + * UNLOCK rq(0)->lock 1936 + * 1937 + * LOCK rq(0)->lock // orders against CPU0 1938 + * dequeue X 1939 + * UNLOCK rq(0)->lock 1940 + * 1941 + * LOCK rq(1)->lock 1942 + * enqueue X 1943 + * UNLOCK rq(1)->lock 1944 + * 1945 + * LOCK rq(1)->lock // orders against CPU2 1946 + * sched-out Z 1947 + * sched-in X 1948 + * UNLOCK rq(1)->lock 1949 + * 1950 + * 1951 + * BLOCKING -- aka. SLEEP + WAKEUP 1952 + * 1953 + * For blocking we (obviously) need to provide the same guarantee as for 1954 + * migration. However the means are completely different as there is no lock 1955 + * chain to provide order. 
Instead we do: 1956 + * 1957 + * 1) smp_store_release(X->on_cpu, 0) 1958 + * 2) smp_cond_acquire(!X->on_cpu) 1959 + * 1960 + * Example: 1961 + * 1962 + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) 1963 + * 1964 + * LOCK rq(0)->lock LOCK X->pi_lock 1965 + * dequeue X 1966 + * sched-out X 1967 + * smp_store_release(X->on_cpu, 0); 1968 + * 1969 + * smp_cond_acquire(!X->on_cpu); 1970 + * X->state = WAKING 1971 + * set_task_cpu(X,2) 1972 + * 1973 + * LOCK rq(2)->lock 1974 + * enqueue X 1975 + * X->state = RUNNING 1976 + * UNLOCK rq(2)->lock 1977 + * 1978 + * LOCK rq(2)->lock // orders against CPU1 1979 + * sched-out Z 1980 + * sched-in X 1981 + * UNLOCK rq(2)->lock 1982 + * 1983 + * UNLOCK X->pi_lock 1984 + * UNLOCK rq(0)->lock 1985 + * 1986 + * 1987 + * However; for wakeups there is a second guarantee we must provide, namely we 1988 + * must observe the state that lead to our wakeup. That is, not only must our 1989 + * task observe its own prior state, it must also observe the stores prior to 1990 + * its wakeup. 1991 + * 1992 + * This means that any means of doing remote wakeups must order the CPU doing 1993 + * the wakeup against the CPU the task is going to end up running on. This, 1994 + * however, is already required for the regular Program-Order guarantee above, 1995 + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). 1996 + * 1997 + */ 1998 + 1908 1999 /** 1909 2000 * try_to_wake_up - wake up a thread 1910 2001 * @p: the thread to be awakened ··· 2059 1968 /* 2060 1969 * If the owning (remote) cpu is still in the middle of schedule() with 2061 1970 * this task as prev, wait until its done referencing the task. 2062 - */ 2063 - while (p->on_cpu) 2064 - cpu_relax(); 2065 - /* 2066 - * Combined with the control dependency above, we have an effective 2067 - * smp_load_acquire() without the need for full barriers. 2068 1971 * 2069 1972 * Pairs with the smp_store_release() in finish_lock_switch(). 
2070 1973 * 2071 1974 * This ensures that tasks getting woken will be fully ordered against 2072 1975 * their previous state and preserve Program Order. 2073 1976 */ 2074 - smp_rmb(); 1977 + smp_cond_acquire(!p->on_cpu); 2075 1978 2076 1979 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2077 1980 p->state = TASK_WAKING;
+1 -1
kernel/sched/sched.h
··· 1076 1076 * In particular, the load of prev->state in finish_task_switch() must 1077 1077 * happen before this. 1078 1078 * 1079 - * Pairs with the control dependency and rmb in try_to_wake_up(). 1079 + * Pairs with the smp_cond_acquire() in try_to_wake_up(). 1080 1080 */ 1081 1081 smp_store_release(&prev->on_cpu, 0); 1082 1082 #endif
+79 -41
lib/atomic64_test.c
··· 27 27 (unsigned long long)r); \ 28 28 } while (0) 29 29 30 + /* 31 + * Test for a atomic operation family, 32 + * @test should be a macro accepting parameters (bit, op, ...) 33 + */ 34 + 35 + #define FAMILY_TEST(test, bit, op, args...) \ 36 + do { \ 37 + test(bit, op, ##args); \ 38 + test(bit, op##_acquire, ##args); \ 39 + test(bit, op##_release, ##args); \ 40 + test(bit, op##_relaxed, ##args); \ 41 + } while (0) 42 + 43 + #define TEST_RETURN(bit, op, c_op, val) \ 44 + do { \ 45 + atomic##bit##_set(&v, v0); \ 46 + r = v0; \ 47 + r c_op val; \ 48 + BUG_ON(atomic##bit##_##op(val, &v) != r); \ 49 + BUG_ON(atomic##bit##_read(&v) != r); \ 50 + } while (0) 51 + 52 + #define RETURN_FAMILY_TEST(bit, op, c_op, val) \ 53 + do { \ 54 + FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \ 55 + } while (0) 56 + 57 + #define TEST_ARGS(bit, op, init, ret, expect, args...) \ 58 + do { \ 59 + atomic##bit##_set(&v, init); \ 60 + BUG_ON(atomic##bit##_##op(&v, ##args) != ret); \ 61 + BUG_ON(atomic##bit##_read(&v) != expect); \ 62 + } while (0) 63 + 64 + #define XCHG_FAMILY_TEST(bit, init, new) \ 65 + do { \ 66 + FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \ 67 + } while (0) 68 + 69 + #define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) \ 70 + do { \ 71 + FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \ 72 + init, init, new, init, new); \ 73 + FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \ 74 + init, init, init, wrong, new); \ 75 + } while (0) 76 + 77 + #define INC_RETURN_FAMILY_TEST(bit, i) \ 78 + do { \ 79 + FAMILY_TEST(TEST_ARGS, bit, inc_return, \ 80 + i, (i) + one, (i) + one); \ 81 + } while (0) 82 + 83 + #define DEC_RETURN_FAMILY_TEST(bit, i) \ 84 + do { \ 85 + FAMILY_TEST(TEST_ARGS, bit, dec_return, \ 86 + i, (i) - one, (i) - one); \ 87 + } while (0) 88 + 30 89 static __init void test_atomic(void) 31 90 { 32 91 int v0 = 0xaaa31337; ··· 104 45 TEST(, and, &=, v1); 105 46 TEST(, xor, ^=, v1); 106 47 TEST(, andnot, &= ~, v1); 48 + 49 + RETURN_FAMILY_TEST(, add_return, +=, onestwos); 50 + 
RETURN_FAMILY_TEST(, add_return, +=, -one); 51 + RETURN_FAMILY_TEST(, sub_return, -=, onestwos); 52 + RETURN_FAMILY_TEST(, sub_return, -=, -one); 53 + 54 + INC_RETURN_FAMILY_TEST(, v0); 55 + DEC_RETURN_FAMILY_TEST(, v0); 56 + 57 + XCHG_FAMILY_TEST(, v0, v1); 58 + CMPXCHG_FAMILY_TEST(, v0, v1, onestwos); 59 + 107 60 } 108 61 109 62 #define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) ··· 145 74 TEST(64, xor, ^=, v1); 146 75 TEST(64, andnot, &= ~, v1); 147 76 148 - INIT(v0); 149 - r += onestwos; 150 - BUG_ON(atomic64_add_return(onestwos, &v) != r); 151 - BUG_ON(v.counter != r); 152 - 153 - INIT(v0); 154 - r += -one; 155 - BUG_ON(atomic64_add_return(-one, &v) != r); 156 - BUG_ON(v.counter != r); 157 - 158 - INIT(v0); 159 - r -= onestwos; 160 - BUG_ON(atomic64_sub_return(onestwos, &v) != r); 161 - BUG_ON(v.counter != r); 162 - 163 - INIT(v0); 164 - r -= -one; 165 - BUG_ON(atomic64_sub_return(-one, &v) != r); 166 - BUG_ON(v.counter != r); 77 + RETURN_FAMILY_TEST(64, add_return, +=, onestwos); 78 + RETURN_FAMILY_TEST(64, add_return, +=, -one); 79 + RETURN_FAMILY_TEST(64, sub_return, -=, onestwos); 80 + RETURN_FAMILY_TEST(64, sub_return, -=, -one); 167 81 168 82 INIT(v0); 169 83 atomic64_inc(&v); ··· 156 100 BUG_ON(v.counter != r); 157 101 158 102 INIT(v0); 159 - r += one; 160 - BUG_ON(atomic64_inc_return(&v) != r); 161 - BUG_ON(v.counter != r); 162 - 163 - INIT(v0); 164 103 atomic64_dec(&v); 165 104 r -= one; 166 105 BUG_ON(v.counter != r); 167 106 168 - INIT(v0); 169 - r -= one; 170 - BUG_ON(atomic64_dec_return(&v) != r); 171 - BUG_ON(v.counter != r); 107 + INC_RETURN_FAMILY_TEST(64, v0); 108 + DEC_RETURN_FAMILY_TEST(64, v0); 172 109 173 - INIT(v0); 174 - BUG_ON(atomic64_xchg(&v, v1) != v0); 175 - r = v1; 176 - BUG_ON(v.counter != r); 177 - 178 - INIT(v0); 179 - BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0); 180 - r = v1; 181 - BUG_ON(v.counter != r); 182 - 183 - INIT(v0); 184 - BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0); 185 - BUG_ON(v.counter != r); 110 + 
XCHG_FAMILY_TEST(64, v0, v1); 111 + CMPXCHG_FAMILY_TEST(64, v0, v1, v2); 186 112 187 113 INIT(v0); 188 114 BUG_ON(atomic64_add_unless(&v, one, v0));