
locking/mutexes: Introduce cancelable MCS lock for adaptive spinning

Since we want a task waiting for a mutex_lock() to go to sleep and
reschedule on need_resched(), we must be able to abort the
mcs_spin_lock() around the adaptive spin.

Therefore, implement a cancelable MCS lock.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: chegu_vinod@hp.com
Cc: paulmck@linux.vnet.ibm.com
Cc: Waiman.Long@hp.com
Cc: torvalds@linux-foundation.org
Cc: tglx@linutronix.de
Cc: riel@redhat.com
Cc: akpm@linux-foundation.org
Cc: davidlohr@hp.com
Cc: hpa@zytor.com
Cc: andi@firstfloor.org
Cc: aswin@hp.com
Cc: scott.norton@hp.com
Cc: Jason Low <jason.low2@hp.com>
Link: http://lkml.kernel.org/n/tip-62hcl5wxydmjzd182zhvk89m@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Peter Zijlstra, committed by Ingo Molnar
commit fb0527bd (parent 1d8fe7dc)

5 files changed: +202 -7
+2 -2
include/linux/mutex.h
@@ -46,7 +46,7 @@
  * - detects multi-task circular deadlocks and prints out all affected
  *   locks and tasks (and only those tasks)
  */
-struct mcs_spinlock;
+struct optimistic_spin_queue;
 struct mutex {
 	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
 	atomic_t count;
@@ -56,7 +56,7 @@
 	struct task_struct *owner;
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct mcs_spinlock *mcs_lock;	/* Spinner MCS lock */
+	struct optimistic_spin_queue *osq;	/* Spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	const char *name;
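A forward declaration is enough here because struct mutex only embeds a pointer to the spinner queue; the node layout stays private to kernel/locking/mcs_spinlock.h. A stand-alone illustration of that pattern (mutex_like is a hypothetical stand-in, not the kernel type):

/* Why the forward declaration suffices: only a pointer is embedded, so the
 * full definition of struct optimistic_spin_queue is never needed here. */
#include <stddef.h>

struct optimistic_spin_queue;			/* opaque at this point */

struct mutex_like {				/* hypothetical stand-in */
	int count;
	struct optimistic_spin_queue *osq;	/* pointer: size is known */
};

int main(void)
{
	struct mutex_like m = { .count = 1, .osq = NULL };
	return m.count - 1;			/* builds without the full type */
}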
+1 -1
kernel/locking/Makefile
@@ -1,5 +1,5 @@
 
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = -pg
+178
kernel/locking/mcs_spinlock.c (new file)
#include <linux/percpu.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include "mcs_spinlock.h"

#ifdef CONFIG_SMP

/*
 * An MCS-like lock especially tailored for optimistic spinning for sleeping
 * lock implementations (mutex, rwsem, etc).
 *
 * Using a single mcs node per CPU is safe because sleeping locks should not be
 * called from interrupt context and we have preemption disabled while
 * spinning.
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);

/*
 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
 * Can return NULL in case we were the last queued and we updated @lock instead.
 */
static inline struct optimistic_spin_queue *
osq_wait_next(struct optimistic_spin_queue **lock,
	      struct optimistic_spin_queue *node,
	      struct optimistic_spin_queue *prev)
{
	struct optimistic_spin_queue *next = NULL;

	for (;;) {
		if (*lock == node && cmpxchg(lock, node, prev) == node) {
			/*
			 * We were the last queued, we moved @lock back. @prev
			 * will now observe @lock and will complete its
			 * unlock()/unqueue().
			 */
			break;
		}

		/*
		 * We must xchg() the @node->next value, because if we were to
		 * leave it in, a concurrent unlock()/unqueue() from
		 * @node->next might complete Step-A and think its @prev is
		 * still valid.
		 *
		 * If the concurrent unlock()/unqueue() wins the race, we'll
		 * wait for either @lock to point to us, through its Step-B, or
		 * wait for a new @node->next from its Step-C.
		 */
		if (node->next) {
			next = xchg(&node->next, NULL);
			if (next)
				break;
		}

		arch_mutex_cpu_relax();
	}

	return next;
}

bool osq_lock(struct optimistic_spin_queue **lock)
{
	struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
	struct optimistic_spin_queue *prev, *next;

	node->locked = 0;
	node->next = NULL;

	node->prev = prev = xchg(lock, node);
	if (likely(prev == NULL))
		return true;

	ACCESS_ONCE(prev->next) = node;

	/*
	 * Normally @prev is untouchable after the above store; because at that
	 * moment unlock can proceed and wipe the node element from stack.
	 *
	 * However, since our nodes are static per-cpu storage, we're
	 * guaranteed their existence -- this allows us to apply
	 * cmpxchg in an attempt to undo our queueing.
	 */

	while (!smp_load_acquire(&node->locked)) {
		/*
		 * If we need to reschedule, bail... so we can block.
		 */
		if (need_resched())
			goto unqueue;

		arch_mutex_cpu_relax();
	}
	return true;

unqueue:
	/*
	 * Step - A -- stabilize @prev
	 *
	 * Undo our @prev->next assignment; this will make @prev's
	 * unlock()/unqueue() wait for a next pointer since @lock points to us
	 * (or later).
	 */

	for (;;) {
		if (prev->next == node &&
		    cmpxchg(&prev->next, node, NULL) == node)
			break;

		/*
		 * We can only fail the cmpxchg() racing against an unlock(),
		 * in which case we should observe @node->locked becoming
		 * true.
		 */
		if (smp_load_acquire(&node->locked))
			return true;

		arch_mutex_cpu_relax();

		/*
		 * Or we race against a concurrent unqueue()'s step-B, in which
		 * case its step-C will write us a new @node->prev pointer.
		 */
		prev = ACCESS_ONCE(node->prev);
	}

	/*
	 * Step - B -- stabilize @next
	 *
	 * Similar to unlock(), wait for @node->next or move @lock from @node
	 * back to @prev.
	 */

	next = osq_wait_next(lock, node, prev);
	if (!next)
		return false;

	/*
	 * Step - C -- unlink
	 *
	 * @prev is stable because it's still waiting for a new @prev->next
	 * pointer, @next is stable because our @node->next pointer is NULL and
	 * it will wait in Step-A.
	 */

	ACCESS_ONCE(next->prev) = prev;
	ACCESS_ONCE(prev->next) = next;

	return false;
}

void osq_unlock(struct optimistic_spin_queue **lock)
{
	struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
	struct optimistic_spin_queue *next;

	/*
	 * Fast path for the uncontended case.
	 */
	if (likely(cmpxchg(lock, node, NULL) == node))
		return;

	/*
	 * Second most likely case.
	 */
	next = xchg(&node->next, NULL);
	if (next) {
		ACCESS_ONCE(next->locked) = 1;
		return;
	}

	next = osq_wait_next(lock, node, NULL);
	if (next)
		ACCESS_ONCE(next->locked) = 1;
}

#endif
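To make the Step-A/B/C unqueue protocol easier to poke at, here is a user-space model of the algorithm above. It is a sketch, not the kernel code: C11 atomics stand in for the kernel primitives, a static node array for the per-CPU nodes (the algorithm relies on node storage never going away, as the comment in osq_lock() notes), and a hypothetical spin budget for need_resched(). All identifiers (osq_node, tail, worker) are illustrative.

/* User-space model of the cancelable MCS ("osq") lock above -- a sketch, not
 * the kernel code. C11 atomics stand in for kernel primitives, a static node
 * array for the per-CPU nodes (their storage must never go away), and a
 * hypothetical spin budget for need_resched(). Build: cc -std=c11 -pthread */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct osq_node {
	_Atomic(struct osq_node *) next;
	_Atomic(struct osq_node *) prev;
	atomic_int locked;
};

static _Atomic(struct osq_node *) tail;	/* the @lock word */
static struct osq_node nodes[8];	/* static, like per-CPU storage */

/* Successor of @node, or NULL after moving the tail back to @prev. */
static struct osq_node *osq_wait_next(struct osq_node *node,
				      struct osq_node *prev)
{
	for (;;) {
		struct osq_node *expect = node, *next;
		if (atomic_load(&tail) == node &&
		    atomic_compare_exchange_strong(&tail, &expect, prev))
			return NULL;
		next = atomic_exchange(&node->next, NULL);
		if (next)
			return next;
	}
}

static bool osq_lock(struct osq_node *node, int spin_budget)
{
	struct osq_node *prev, *next;
	atomic_store(&node->locked, 0);
	atomic_store(&node->next, NULL);
	prev = atomic_exchange(&tail, node);
	atomic_store(&node->prev, prev);
	if (prev == NULL)
		return true;			/* uncontended */
	atomic_store(&prev->next, node);	/* queue behind @prev */
	while (!atomic_load(&node->locked)) {
		if (spin_budget-- <= 0)
			goto unqueue;		/* models need_resched() */
	}
	return true;
unqueue:
	/* Step A: undo @prev->next so @prev stops pointing at us. */
	for (;;) {
		struct osq_node *expect = node;
		if (atomic_load(&prev->next) == node &&
		    atomic_compare_exchange_strong(&prev->next, &expect, NULL))
			break;
		if (atomic_load(&node->locked))
			return true;		/* an unlock() beat us to it */
		prev = atomic_load(&node->prev); /* a racing Step C moved us */
	}
	next = osq_wait_next(node, prev);	/* Step B: stabilize @next */
	if (!next)
		return false;
	atomic_store(&next->prev, prev);	/* Step C: unlink ourselves */
	atomic_store(&prev->next, next);
	return false;
}

static void osq_unlock(struct osq_node *node)
{
	struct osq_node *expect = node, *next;
	if (atomic_compare_exchange_strong(&tail, &expect, NULL))
		return;				/* we were the only node */
	next = atomic_exchange(&node->next, NULL);
	if (!next)
		next = osq_wait_next(node, NULL);
	if (next)
		atomic_store(&next->locked, 1);	/* hand the lock on */
}

static void *worker(void *arg)
{
	long id = (long)arg;	/* low ids cancel almost immediately */
	if (osq_lock(&nodes[id], (int)id * 100000)) {
		osq_unlock(&nodes[id]);
		return (void *)1;
	}
	return NULL;		/* would block in a sleeping-lock slowpath */
}

int main(void)
{
	pthread_t t[8];
	for (long i = 0; i < 8; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < 8; i++)
		pthread_join(t[i], NULL);
	puts("osq model done");
	return 0;
}

The interesting race is visible here just as in the kernel version: a canceling spinner's Step-A cmpxchg() on prev->next competes with the unlock path's xchg() on the same field, and whichever side wins decides whether the spinner ends up owning the lock or unqueued. That is why the Step-A loop must recheck locked after a failed cmpxchg().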
+15
kernel/locking/mcs_spinlock.h
@@ -111,4 +111,19 @@
 	arch_mcs_spin_unlock_contended(&next->locked);
 }
 
+/*
+ * Cancellable version of the MCS lock above.
+ *
+ * Intended for adaptive spinning of sleeping locks:
+ * mutex_lock()/rwsem_down_{read,write}() etc.
+ */
+
+struct optimistic_spin_queue {
+	struct optimistic_spin_queue *next, *prev;
+	int locked; /* 1 if lock acquired */
+};
+
+extern bool osq_lock(struct optimistic_spin_queue **lock);
+extern void osq_unlock(struct optimistic_spin_queue **lock);
+
 #endif /* __LINUX_MCS_SPINLOCK_H */
+6 -4
kernel/locking/mutex.c
@@ -53,7 +53,7 @@
 	INIT_LIST_HEAD(&lock->wait_list);
 	mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	lock->mcs_lock = NULL;
+	lock->osq = NULL;
 #endif
 
 	debug_mutex_init(lock, name, key);
@@ -403,6 +403,8 @@
 	if (!mutex_can_spin_on_owner(lock))
 		goto slowpath;
 
-	mcs_spin_lock(&lock->mcs_lock, &node);
+	if (!osq_lock(&lock->osq))
+		goto slowpath;
+
 	for (;;) {
 		struct task_struct *owner;
@@ -442,7 +444,7 @@
 			}
 
 			mutex_set_owner(lock);
-			mcs_spin_unlock(&lock->mcs_lock, &node);
+			osq_unlock(&lock->osq);
 			preempt_enable();
 			return 0;
 		}
@@ -464,7 +466,7 @@
 		 */
 		arch_mutex_cpu_relax();
 	}
-	mcs_spin_unlock(&lock->mcs_lock, &node);
+	osq_unlock(&lock->osq);
 slowpath:
 #endif
 	spin_lock_mutex(&lock->wait_lock, flags);
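The mutex wiring is the point of the patch: osq_lock() can now fail, and both the failed-to-queue path and the owner-went-to-sleep path fall through to the same blocking slowpath, while every successful exit from the spin releases the osq first. A compilable outline of that control shape; every function here is a hypothetical stub standing in for a kernel internal, not a real kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stubs for kernel internals, for shape only. */
static bool osq_lock_stub(void)   { return true;  }	/* osq_lock(&lock->osq) */
static void osq_unlock_stub(void) { }			/* osq_unlock(&lock->osq) */
static bool try_acquire(void)     { return false; }	/* cmpxchg on ->count */
static bool owner_running(void)   { return false; }	/* spin-on-owner check */

/* Shape of the patched spin phase: every exit either releases the osq or
 * falls through to the blocking slowpath. */
static int lock_common(void)
{
	if (!osq_lock_stub())
		goto slowpath;		/* couldn't even queue: go block */
	for (;;) {
		if (try_acquire()) {
			osq_unlock_stub();
			return 0;	/* got the lock while spinning */
		}
		if (!owner_running())
			break;		/* owner blocked: stop spinning */
	}
	osq_unlock_stub();
slowpath:
	return 1;			/* caller would take wait_lock and sleep */
}

int main(void)
{
	printf("lock_common() -> %d\n", lock_common());
	return 0;
}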