/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif
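
/*
 * The #ifndef guards above allow an architecture to supply its own versions
 * ahead of this point. A hypothetical override sketch follows; the enable
 * condition and the lock body are illustrative assumptions, not part of this
 * header. In practice the enabled check would be gated on running as a guest,
 * and the lock body on a virtualization-friendly wait loop:
 *
 *	#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled
 *	static __always_inline bool resilient_virt_spin_lock_enabled(void)
 *	{
 *		return true;
 *	}
 *
 *	#define resilient_virt_spin_lock resilient_virt_spin_lock
 *	static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
 *	{
 *		return resilient_tas_spin_lock(lock);
 *	}
 */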

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)
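
/*
 * As a quick sanity check of the value above: NSEC_PER_SEC is 1000000000L
 * (see vdso/time64.h), so RES_DEF_TIMEOUT works out to 250000000 ns, i.e. a
 * 250 ms budget for each waiting loop.
 */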

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};
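
/*
 * A size sketch backing the "cacheline-aligned" comment above, assuming
 * 8-byte pointers and 64-byte cache lines: 4 bytes for cnt, 4 bytes of
 * padding to align locks[], and 31 * 8 == 248 bytes of entries, for a total
 * of 256 bytes, i.e. exactly four cache lines.
 */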

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * We rely on the implied compiler barrier of the per-CPU operations;
	 * without it, the compiler could reorder the inc with the write to the
	 * table, allowing an interrupt to overwrite and erase our write to the
	 * table (as on interrupt exit it will be reset to NULL).
	 *
	 * It is fine for the cnt inc to be reordered wrt remote readers,
	 * though; they won't observe our entry until the cnt update is
	 * visible, and that's all that matters.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}
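
/*
 * Pairing sketch for the helper above (the try_acquire() name is hypothetical
 * and purely illustrative): every grab_held_lock_entry() is balanced either by
 * res_spin_unlock() after a successful acquisition, or by
 * release_held_lock_entry() below when the acquisition attempt is abandoned:
 *
 *	grab_held_lock_entry(lock);
 *	if (try_acquire(lock))
 *		return 0;
 *	release_held_lock_entry();
 *	return -ETIMEDOUT;
 */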

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection of AA deadlocks on the same CPU, and
 * missed ABBA deadlocks on remote CPUs, if this function pops entries out of
 * order (due to a lock A, lock B, unlock A, unlock B pattern). The correct way
 * to preserve the right entries in the table would be to walk the array of
 * held locks and swap and clear out-of-order entries, but that is too
 * complicated and we don't have a compelling use case for out-of-order
 * unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering of the clearing above with the inc and its write in
	 * grab_held_lock_entry that came before us (in the same acquisition
	 * attempt) is ok; we either see a valid entry or NULL when it's
	 * visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path, there is no release store on the lock
	 * word here that we could rely on for ordering, so we need to emit a
	 * write barrier. Otherwise, we may have a situation as follows:
	 *
	 *	<error> for lock B
	 *	release_held_lock_entry
	 *
	 *	grab_held_lock_entry
	 *	try_cmpxchg_acquire for lock A
	 *
	 * Lack of any ordering means reordering may occur such that the dec and
	 * inc are done before the entry is overwritten. This permits a remote
	 * holder of lock B (which this CPU failed to acquire) to now observe it
	 * as being attempted on this CPU, and may lead to misdetection (if this
	 * CPU also holds a lock the remote CPU is attempting to acquire, a
	 * false ABBA diagnosis results).
	 *
	 * The case of unlock is treated differently due to NMI reentrancy, see
	 * comments in res_spin_unlock.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other; we either notice an empty NULL entry on
	 * top (if dec follows WRITE_ONCE), or a potentially stale entry which
	 * cannot be observed (if dec precedes WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec; this permits dec-inc
	 * reordering, but that is harmless as we'd have the new entry set to
	 * NULL already, i.e. they cannot precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}
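
/*
 * Illustration of the in-order requirement described above: nested locks must
 * be released in reverse (LIFO) order,
 *
 *	res_spin_lock(A);
 *	res_spin_lock(B);
 *	res_spin_unlock(B);
 *	res_spin_unlock(A);
 *
 * whereas unlocking A first would clear B's entry (the top of the table) and
 * leave A's now-stale entry behind, enabling the misdetection scenarios noted
 * above.
 */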

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0 - Lock was acquired successfully.
 * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	/*
	 * Grab the deadlock detection entry before doing the cmpxchg, so that
	 * an NMI reentering between a successful cmpxchg and the creation of
	 * the held lock entry cannot miss an acquisition attempt in the
	 * interrupted context, as would happen with the opposite order:
	 *
	 *	cmpxchg lock A
	 *	<NMI>
	 *	 res_spin_lock(A) --> missed AA, leads to timeout
	 *	</NMI>
	 *	grab_held_lock_entry(A)
	 */
	grab_held_lock_entry(lock);

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;
	return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) resilient_tas_spin_lock(lock)

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	/*
	 * Release barrier, ensures correct ordering. Perform release store
	 * instead of queued_spin_unlock, since we use this function for the TAS
	 * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
	 * the full 4-byte lockword.
	 *
	 * Perform the smp_store_release before clearing the lock entry so that
	 * NMIs landing in the unlock path can correctly detect AA issues. The
	 * opposite order shown below may lead to missed AA checks:
	 *
	 *	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
	 *	<NMI>
	 *	 res_spin_lock(A) --> missed AA, leads to timeout
	 *	</NMI>
	 *	smp_store_release(A->locked, 0)
	 */
	smp_store_release(&lock->locked, 0);
	if (likely(rqh->cnt <= RES_NR_HELD))
		WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}
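
/*
 * A minimal caller-side sketch (hypothetical; in-kernel users normally go
 * through the raw_res_spin_lock*() wrappers below, which also manage
 * preemption and IRQ state): the lock is not held on a non-zero return, so
 * only unlock after success,
 *
 *	int ret = res_spin_lock(lock);
 *
 *	if (ret)
 *		return ret;
 *	...critical section...
 *	res_spin_unlock(lock);
 */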

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock)                    \
	({                                         \
		int __ret;                         \
		preempt_disable();                 \
		__ret = res_spin_lock(lock);       \
		if (__ret)                         \
			preempt_enable();          \
		__ret;                             \
	})

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)     \
	({                                         \
		int __ret;                         \
		local_irq_save(flags);             \
		__ret = raw_res_spin_lock(lock);   \
		if (__ret)                         \
			local_irq_restore(flags);  \
		__ret;                             \
	})

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
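
/*
 * End-to-end usage sketch for the wrappers above ("my_lock", "flags" and the
 * surrounding function are hypothetical). A lock that is not statically
 * zero-initialized would first be set up with raw_res_spin_lock_init(). Note
 * that on failure the wrappers re-enable preemption/IRQs themselves, so the
 * caller must not unlock on a non-zero return:
 *
 *	static rqspinlock_t my_lock;
 *
 *	int my_func(void)
 *	{
 *		unsigned long flags;
 *		int ret;
 *
 *		ret = raw_res_spin_lock_irqsave(&my_lock, flags);
 *		if (ret)
 *			return ret;
 *		...critical section...
 *		raw_res_spin_unlock_irqrestore(&my_lock, flags);
 *		return 0;
 *	}
 */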

#endif /* __ASM_GENERIC_RQSPINLOCK_H */