Merge tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf resilient spinlock support from Alexei Starovoitov:
"This patch set introduces Resilient Queued Spin Lock (or rqspinlock
with res_spin_lock() and res_spin_unlock() APIs).

This is a qspinlock variant which recovers the kernel from a stalled
state when the lock acquisition path cannot make forward progress.
This can occur when a lock acquisition attempt enters a deadlock
situation (e.g. AA, or ABBA), or more generally, when the owner of the
lock (which we’re trying to acquire) isn’t making forward progress.
Deadlock detection is the main mechanism used to provide instant
recovery, with the timeout mechanism acting as a final line of
defense. Detection is triggered as soon as the slow path of a lock
acquisition enters its waiting loop.
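
Because acquisition can now fail, in-kernel callers check the return
value and bail out without touching the protected state; the map
conversions below (e.g. lpm_trie.c) follow exactly this pattern. A
minimal kernel-side sketch, with made-up structure and function names
(the lock itself would be initialised with raw_res_spin_lock_init()):

  #include <linux/types.h>
  #include <asm/rqspinlock.h>

  /* 'pcpu_stat' and 'pcpu_stat_add' are illustrative names only. */
  struct pcpu_stat {
          rqspinlock_t lock;      /* was a raw_spinlock_t before conversion */
          u64 counter;
  };

  static int pcpu_stat_add(struct pcpu_stat *s, u64 delta)
  {
          unsigned long flags;
          int ret;

          /* Fails with -EDEADLK or -ETIMEDOUT instead of hanging forever. */
          ret = raw_res_spin_lock_irqsave(&s->lock, flags);
          if (ret)
                  return ret;     /* the critical section was never entered */

          s->counter += delta;

          raw_res_spin_unlock_irqrestore(&s->lock, flags);
          return 0;
  }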

Additionally, BPF programs attached to different parts of the kernel
can introduce new control flow into the kernel, which increases the
likelihood of deadlocks in code not written to handle reentrancy.
There have been multiple syzbot reports surfacing deadlocks in
internal kernel code due to the diverse ways in which BPF programs can
be attached to different parts of the kernel. By switching the BPF
subsystem’s lock usage to rqspinlock, all of these issues are
mitigated at runtime.

This spin lock implementation allows BPF maps to become safer and
lets us remove mechanisms that have fallen short of assuring safety
when programs nest in arbitrary ways, whether in the same context or
across different contexts.
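
For BPF programs themselves, the lock is exposed as a struct
bpf_res_spin_lock field embedded in a map value and manipulated via
the bpf_res_spin_lock()/bpf_res_spin_unlock() kfuncs added in this
series. A rough sketch of what a program could look like (the map
layout, section name and __ksym declarations here are illustrative
assumptions, not lifted from the selftests):

  /* Assumes vmlinux.h provides struct bpf_res_spin_lock and that the
   * kfuncs are reachable from the chosen program type. */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym;
  extern void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym;

  struct elem {
          struct bpf_res_spin_lock lock;
          u64 hits;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } stats SEC(".maps");

  SEC("tc") /* program type chosen only for illustration */
  int count_hits(struct __sk_buff *skb)
  {
          struct elem *e;
          int key = 0;

          e = bpf_map_lookup_elem(&stats, &key);
          if (!e)
                  return 0;
          /* Lock acquisition may fail (-EDEADLK/-ETIMEDOUT); the verifier
           * is expected to require this check before e->hits is touched. */
          if (bpf_res_spin_lock(&e->lock))
                  return 0;
          e->hits++;
          bpf_res_spin_unlock(&e->lock);
          return 0;
  }

  char _license[] SEC("license") = "GPL";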

We run benchmarks that stress locking scalability and compare the
results against the baseline (qspinlock). For the rqspinlock case, we
replace the kernel's default qspinlock with it, so that all spin
locks in the kernel use the rqspinlock slow path. As such, benchmarks
that stress kernel spin locks end up exercising rqspinlock.

More details in the cover letter in commit 6ffb9017e932 ("Merge branch
'resilient-queued-spin-lock'")"

* tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (24 commits)
selftests/bpf: Add tests for rqspinlock
bpf: Maintain FIFO property for rqspinlock unlock
bpf: Implement verifier support for rqspinlock
bpf: Introduce rqspinlock kfuncs
bpf: Convert lpm_trie.c to rqspinlock
bpf: Convert percpu_freelist.c to rqspinlock
bpf: Convert hashtab.c to rqspinlock
rqspinlock: Add locktorture support
rqspinlock: Add entry to Makefile, MAINTAINERS
rqspinlock: Add macros for rqspinlock usage
rqspinlock: Add basic support for CONFIG_PARAVIRT
rqspinlock: Add a test-and-set fallback
rqspinlock: Add deadlock detection and recovery
rqspinlock: Protect waiters in trylock fallback from stalls
rqspinlock: Protect waiters in queue from stalls
rqspinlock: Protect pending bit owners from stalls
rqspinlock: Hardcode cond_acquire loops for arm64
rqspinlock: Add support for timeouts
rqspinlock: Drop PV and virtualization support
rqspinlock: Add rqspinlock.h header
...

+2315 -420
+2
MAINTAINERS
··· 4361 4361 F: kernel/bpf/ 4362 4362 F: kernel/trace/bpf_trace.c 4363 4363 F: lib/buildid.c 4364 + F: arch/*/include/asm/rqspinlock.h 4365 + F: include/asm-generic/rqspinlock.h 4364 4366 F: lib/test_bpf.c 4365 4367 F: net/bpf/ 4366 4368 F: net/core/filter.c
+93
arch/arm64/include/asm/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_RQSPINLOCK_H 3 + #define _ASM_RQSPINLOCK_H 4 + 5 + #include <asm/barrier.h> 6 + 7 + /* 8 + * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom 9 + * version based on [0]. In rqspinlock code, our conditional expression involves 10 + * checking the value _and_ additionally a timeout. However, on arm64, the 11 + * WFE-based implementation may never spin again if no stores occur to the 12 + * locked byte in the lock word. As such, we may be stuck forever if 13 + * event-stream based unblocking is not available on the platform for WFE spin 14 + * loops (arch_timer_evtstrm_available). 15 + * 16 + * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this 17 + * copy-paste. 18 + * 19 + * While we rely on the implementation to amortize the cost of sampling 20 + * cond_expr for us, it will not happen when event stream support is 21 + * unavailable, time_expr check is amortized. This is not the common case, and 22 + * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns 23 + * comparison, hence just let it be. In case of event-stream, the loop is woken 24 + * up at microsecond granularity. 25 + * 26 + * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com 27 + */ 28 + 29 + #ifndef smp_cond_load_acquire_timewait 30 + 31 + #define smp_cond_time_check_count 200 32 + 33 + #define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns, \ 34 + time_limit_ns) ({ \ 35 + typeof(ptr) __PTR = (ptr); \ 36 + __unqual_scalar_typeof(*ptr) VAL; \ 37 + unsigned int __count = 0; \ 38 + for (;;) { \ 39 + VAL = READ_ONCE(*__PTR); \ 40 + if (cond_expr) \ 41 + break; \ 42 + cpu_relax(); \ 43 + if (__count++ < smp_cond_time_check_count) \ 44 + continue; \ 45 + if ((time_expr_ns) >= (time_limit_ns)) \ 46 + break; \ 47 + __count = 0; \ 48 + } \ 49 + (typeof(*ptr))VAL; \ 50 + }) 51 + 52 + #define __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 53 + time_expr_ns, time_limit_ns) \ 54 + ({ \ 55 + typeof(ptr) __PTR = (ptr); \ 56 + __unqual_scalar_typeof(*ptr) VAL; \ 57 + for (;;) { \ 58 + VAL = smp_load_acquire(__PTR); \ 59 + if (cond_expr) \ 60 + break; \ 61 + __cmpwait_relaxed(__PTR, VAL); \ 62 + if ((time_expr_ns) >= (time_limit_ns)) \ 63 + break; \ 64 + } \ 65 + (typeof(*ptr))VAL; \ 66 + }) 67 + 68 + #define smp_cond_load_acquire_timewait(ptr, cond_expr, \ 69 + time_expr_ns, time_limit_ns) \ 70 + ({ \ 71 + __unqual_scalar_typeof(*ptr) _val; \ 72 + int __wfe = arch_timer_evtstrm_available(); \ 73 + \ 74 + if (likely(__wfe)) { \ 75 + _val = __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 76 + time_expr_ns, \ 77 + time_limit_ns); \ 78 + } else { \ 79 + _val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr, \ 80 + time_expr_ns, \ 81 + time_limit_ns); \ 82 + smp_acquire__after_ctrl_dep(); \ 83 + } \ 84 + (typeof(*ptr))_val; \ 85 + }) 86 + 87 + #endif 88 + 89 + #define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1) 90 + 91 + #include <asm-generic/rqspinlock.h> 92 + 93 + #endif /* _ASM_RQSPINLOCK_H */
+33
arch/x86/include/asm/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_X86_RQSPINLOCK_H 3 + #define _ASM_X86_RQSPINLOCK_H 4 + 5 + #include <asm/paravirt.h> 6 + 7 + #ifdef CONFIG_PARAVIRT 8 + DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); 9 + 10 + #define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled 11 + static __always_inline bool resilient_virt_spin_lock_enabled(void) 12 + { 13 + return static_branch_likely(&virt_spin_lock_key); 14 + } 15 + 16 + #ifdef CONFIG_QUEUED_SPINLOCKS 17 + typedef struct qspinlock rqspinlock_t; 18 + #else 19 + typedef struct rqspinlock rqspinlock_t; 20 + #endif 21 + extern int resilient_tas_spin_lock(rqspinlock_t *lock); 22 + 23 + #define resilient_virt_spin_lock resilient_virt_spin_lock 24 + static inline int resilient_virt_spin_lock(rqspinlock_t *lock) 25 + { 26 + return resilient_tas_spin_lock(lock); 27 + } 28 + 29 + #endif /* CONFIG_PARAVIRT */ 30 + 31 + #include <asm-generic/rqspinlock.h> 32 + 33 + #endif /* _ASM_X86_RQSPINLOCK_H */
+1
include/asm-generic/Kbuild
··· 45 45 mandatory-y += percpu.h 46 46 mandatory-y += pgalloc.h 47 47 mandatory-y += preempt.h 48 + mandatory-y += rqspinlock.h 48 49 mandatory-y += runtime-const.h 49 50 mandatory-y += rwonce.h 50 51 mandatory-y += sections.h
+6
include/asm-generic/mcs_spinlock.h
··· 1 1 #ifndef __ASM_MCS_SPINLOCK_H 2 2 #define __ASM_MCS_SPINLOCK_H 3 3 4 + struct mcs_spinlock { 5 + struct mcs_spinlock *next; 6 + int locked; /* 1 if lock acquired */ 7 + int count; /* nesting count, see qspinlock.c */ 8 + }; 9 + 4 10 /* 5 11 * Architectures can define their own: 6 12 *
+250
include/asm-generic/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Resilient Queued Spin Lock 4 + * 5 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 6 + * 7 + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> 8 + */ 9 + #ifndef __ASM_GENERIC_RQSPINLOCK_H 10 + #define __ASM_GENERIC_RQSPINLOCK_H 11 + 12 + #include <linux/types.h> 13 + #include <vdso/time64.h> 14 + #include <linux/percpu.h> 15 + #ifdef CONFIG_QUEUED_SPINLOCKS 16 + #include <asm/qspinlock.h> 17 + #endif 18 + 19 + struct rqspinlock { 20 + union { 21 + atomic_t val; 22 + u32 locked; 23 + }; 24 + }; 25 + 26 + /* Even though this is same as struct rqspinlock, we need to emit a distinct 27 + * type in BTF for BPF programs. 28 + */ 29 + struct bpf_res_spin_lock { 30 + u32 val; 31 + }; 32 + 33 + struct qspinlock; 34 + #ifdef CONFIG_QUEUED_SPINLOCKS 35 + typedef struct qspinlock rqspinlock_t; 36 + #else 37 + typedef struct rqspinlock rqspinlock_t; 38 + #endif 39 + 40 + extern int resilient_tas_spin_lock(rqspinlock_t *lock); 41 + #ifdef CONFIG_QUEUED_SPINLOCKS 42 + extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); 43 + #endif 44 + 45 + #ifndef resilient_virt_spin_lock_enabled 46 + static __always_inline bool resilient_virt_spin_lock_enabled(void) 47 + { 48 + return false; 49 + } 50 + #endif 51 + 52 + #ifndef resilient_virt_spin_lock 53 + static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) 54 + { 55 + return 0; 56 + } 57 + #endif 58 + 59 + /* 60 + * Default timeout for waiting loops is 0.25 seconds 61 + */ 62 + #define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) 63 + 64 + /* 65 + * Choose 31 as it makes rqspinlock_held cacheline-aligned. 66 + */ 67 + #define RES_NR_HELD 31 68 + 69 + struct rqspinlock_held { 70 + int cnt; 71 + void *locks[RES_NR_HELD]; 72 + }; 73 + 74 + DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); 75 + 76 + static __always_inline void grab_held_lock_entry(void *lock) 77 + { 78 + int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); 79 + 80 + if (unlikely(cnt > RES_NR_HELD)) { 81 + /* Still keep the inc so we decrement later. */ 82 + return; 83 + } 84 + 85 + /* 86 + * Implied compiler barrier in per-CPU operations; otherwise we can have 87 + * the compiler reorder inc with write to table, allowing interrupts to 88 + * overwrite and erase our write to the table (as on interrupt exit it 89 + * will be reset to NULL). 90 + * 91 + * It is fine for cnt inc to be reordered wrt remote readers though, 92 + * they won't observe our entry until the cnt update is visible, that's 93 + * all. 94 + */ 95 + this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); 96 + } 97 + 98 + /* 99 + * We simply don't support out-of-order unlocks, and keep the logic simple here. 100 + * The verifier prevents BPF programs from unlocking out-of-order, and the same 101 + * holds for in-kernel users. 102 + * 103 + * It is possible to run into misdetection scenarios of AA deadlocks on the same 104 + * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries 105 + * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct 106 + * logic to preserve right entries in the table would be to walk the array of 107 + * held locks and swap and clear out-of-order entries, but that's too 108 + * complicated and we don't have a compelling use case for out of order unlocking. 
109 + */ 110 + static __always_inline void release_held_lock_entry(void) 111 + { 112 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 113 + 114 + if (unlikely(rqh->cnt > RES_NR_HELD)) 115 + goto dec; 116 + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); 117 + dec: 118 + /* 119 + * Reordering of clearing above with inc and its write in 120 + * grab_held_lock_entry that came before us (in same acquisition 121 + * attempt) is ok, we either see a valid entry or NULL when it's 122 + * visible. 123 + * 124 + * But this helper is invoked when we unwind upon failing to acquire the 125 + * lock. Unlike the unlock path which constitutes a release store after 126 + * we clear the entry, we need to emit a write barrier here. Otherwise, 127 + * we may have a situation as follows: 128 + * 129 + * <error> for lock B 130 + * release_held_lock_entry 131 + * 132 + * try_cmpxchg_acquire for lock A 133 + * grab_held_lock_entry 134 + * 135 + * Lack of any ordering means reordering may occur such that dec, inc 136 + * are done before entry is overwritten. This permits a remote lock 137 + * holder of lock B (which this CPU failed to acquire) to now observe it 138 + * as being attempted on this CPU, and may lead to misdetection (if this 139 + * CPU holds a lock it is attempting to acquire, leading to false ABBA 140 + * diagnosis). 141 + * 142 + * In case of unlock, we will always do a release on the lock word after 143 + * releasing the entry, ensuring that other CPUs cannot hold the lock 144 + * (and make conclusions about deadlocks) until the entry has been 145 + * cleared on the local CPU, preventing any anomalies. Reordering is 146 + * still possible there, but a remote CPU cannot observe a lock in our 147 + * table which it is already holding, since visibility entails our 148 + * release store for the said lock has not retired. 149 + * 150 + * In theory we don't have a problem if the dec and WRITE_ONCE above get 151 + * reordered with each other, we either notice an empty NULL entry on 152 + * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which 153 + * cannot be observed (if dec precedes WRITE_ONCE). 154 + * 155 + * Emit the write barrier _before_ the dec, this permits dec-inc 156 + * reordering but that is harmless as we'd have new entry set to NULL 157 + * already, i.e. they cannot precede the NULL store above. 158 + */ 159 + smp_wmb(); 160 + this_cpu_dec(rqspinlock_held_locks.cnt); 161 + } 162 + 163 + #ifdef CONFIG_QUEUED_SPINLOCKS 164 + 165 + /** 166 + * res_spin_lock - acquire a queued spinlock 167 + * @lock: Pointer to queued spinlock structure 168 + * 169 + * Return: 170 + * * 0 - Lock was acquired successfully. 171 + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. 172 + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 
173 + */ 174 + static __always_inline int res_spin_lock(rqspinlock_t *lock) 175 + { 176 + int val = 0; 177 + 178 + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { 179 + grab_held_lock_entry(lock); 180 + return 0; 181 + } 182 + return resilient_queued_spin_lock_slowpath(lock, val); 183 + } 184 + 185 + #else 186 + 187 + #define res_spin_lock(lock) resilient_tas_spin_lock(lock) 188 + 189 + #endif /* CONFIG_QUEUED_SPINLOCKS */ 190 + 191 + static __always_inline void res_spin_unlock(rqspinlock_t *lock) 192 + { 193 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 194 + 195 + if (unlikely(rqh->cnt > RES_NR_HELD)) 196 + goto unlock; 197 + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); 198 + unlock: 199 + /* 200 + * Release barrier, ensures correct ordering. See release_held_lock_entry 201 + * for details. Perform release store instead of queued_spin_unlock, 202 + * since we use this function for test-and-set fallback as well. When we 203 + * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. 204 + * 205 + * Like release_held_lock_entry, we can do the release before the dec. 206 + * We simply care about not seeing the 'lock' in our table from a remote 207 + * CPU once the lock has been released, which doesn't rely on the dec. 208 + * 209 + * Unlike smp_wmb(), release is not a two way fence, hence it is 210 + * possible for a inc to move up and reorder with our clearing of the 211 + * entry. This isn't a problem however, as for a misdiagnosis of ABBA, 212 + * the remote CPU needs to hold this lock, which won't be released until 213 + * the store below is done, which would ensure the entry is overwritten 214 + * to NULL, etc. 215 + */ 216 + smp_store_release(&lock->locked, 0); 217 + this_cpu_dec(rqspinlock_held_locks.cnt); 218 + } 219 + 220 + #ifdef CONFIG_QUEUED_SPINLOCKS 221 + #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) 222 + #else 223 + #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) 224 + #endif 225 + 226 + #define raw_res_spin_lock(lock) \ 227 + ({ \ 228 + int __ret; \ 229 + preempt_disable(); \ 230 + __ret = res_spin_lock(lock); \ 231 + if (__ret) \ 232 + preempt_enable(); \ 233 + __ret; \ 234 + }) 235 + 236 + #define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) 237 + 238 + #define raw_res_spin_lock_irqsave(lock, flags) \ 239 + ({ \ 240 + int __ret; \ 241 + local_irq_save(flags); \ 242 + __ret = raw_res_spin_lock(lock); \ 243 + if (__ret) \ 244 + local_irq_restore(flags); \ 245 + __ret; \ 246 + }) 247 + 248 + #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) 249 + 250 + #endif /* __ASM_GENERIC_RQSPINLOCK_H */
+10
include/linux/bpf.h
··· 30 30 #include <linux/static_call.h> 31 31 #include <linux/memcontrol.h> 32 32 #include <linux/cfi.h> 33 + #include <asm/rqspinlock.h> 33 34 34 35 struct bpf_verifier_env; 35 36 struct bpf_verifier_log; ··· 205 204 BPF_REFCOUNT = (1 << 9), 206 205 BPF_WORKQUEUE = (1 << 10), 207 206 BPF_UPTR = (1 << 11), 207 + BPF_RES_SPIN_LOCK = (1 << 12), 208 208 }; 209 209 210 210 typedef void (*btf_dtor_kfunc_t)(void *); ··· 241 239 u32 cnt; 242 240 u32 field_mask; 243 241 int spin_lock_off; 242 + int res_spin_lock_off; 244 243 int timer_off; 245 244 int wq_off; 246 245 int refcount_off; ··· 317 314 switch (type) { 318 315 case BPF_SPIN_LOCK: 319 316 return "bpf_spin_lock"; 317 + case BPF_RES_SPIN_LOCK: 318 + return "bpf_res_spin_lock"; 320 319 case BPF_TIMER: 321 320 return "bpf_timer"; 322 321 case BPF_WORKQUEUE: ··· 351 346 switch (type) { 352 347 case BPF_SPIN_LOCK: 353 348 return sizeof(struct bpf_spin_lock); 349 + case BPF_RES_SPIN_LOCK: 350 + return sizeof(struct bpf_res_spin_lock); 354 351 case BPF_TIMER: 355 352 return sizeof(struct bpf_timer); 356 353 case BPF_WORKQUEUE: ··· 383 376 switch (type) { 384 377 case BPF_SPIN_LOCK: 385 378 return __alignof__(struct bpf_spin_lock); 379 + case BPF_RES_SPIN_LOCK: 380 + return __alignof__(struct bpf_res_spin_lock); 386 381 case BPF_TIMER: 387 382 return __alignof__(struct bpf_timer); 388 383 case BPF_WORKQUEUE: ··· 428 419 case BPF_RB_ROOT: 429 420 /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ 430 421 case BPF_SPIN_LOCK: 422 + case BPF_RES_SPIN_LOCK: 431 423 case BPF_TIMER: 432 424 case BPF_WORKQUEUE: 433 425 case BPF_KPTR_UNREF:
+16 -3
include/linux/bpf_verifier.h
··· 115 115 int depth:30; 116 116 } iter; 117 117 118 + /* For irq stack slots */ 119 + struct { 120 + enum { 121 + IRQ_NATIVE_KFUNC, 122 + IRQ_LOCK_KFUNC, 123 + } kfunc_class; 124 + } irq; 125 + 118 126 /* Max size from any of the above. */ 119 127 struct { 120 128 unsigned long raw1; ··· 263 255 * default to pointer reference on zero initialization of a state. 264 256 */ 265 257 enum ref_state_type { 266 - REF_TYPE_PTR = 1, 267 - REF_TYPE_IRQ = 2, 268 - REF_TYPE_LOCK = 3, 258 + REF_TYPE_PTR = (1 << 1), 259 + REF_TYPE_IRQ = (1 << 2), 260 + REF_TYPE_LOCK = (1 << 3), 261 + REF_TYPE_RES_LOCK = (1 << 4), 262 + REF_TYPE_RES_LOCK_IRQ = (1 << 5), 263 + REF_TYPE_LOCK_MASK = REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, 269 264 } type; 270 265 /* Track each reference created with a unique id, even if the same 271 266 * instruction creates the reference multiple times (eg, via CALL). ··· 435 424 u32 active_locks; 436 425 u32 active_preempt_locks; 437 426 u32 active_irq_id; 427 + u32 active_lock_id; 428 + void *active_lock_ptr; 438 429 bool active_rcu_lock; 439 430 440 431 bool speculative;
+1 -1
kernel/bpf/Makefile
··· 14 14 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o 15 15 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o 16 16 obj-$(CONFIG_BPF_JIT) += trampoline.o 17 - obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o 17 + obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o 18 18 ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) 19 19 obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o 20 20 endif
+24 -2
kernel/bpf/btf.c
··· 3481 3481 goto end; 3482 3482 } 3483 3483 } 3484 + if (field_mask & BPF_RES_SPIN_LOCK) { 3485 + if (!strcmp(name, "bpf_res_spin_lock")) { 3486 + if (*seen_mask & BPF_RES_SPIN_LOCK) 3487 + return -E2BIG; 3488 + *seen_mask |= BPF_RES_SPIN_LOCK; 3489 + type = BPF_RES_SPIN_LOCK; 3490 + goto end; 3491 + } 3492 + } 3484 3493 if (field_mask & BPF_TIMER) { 3485 3494 if (!strcmp(name, "bpf_timer")) { 3486 3495 if (*seen_mask & BPF_TIMER) ··· 3668 3659 3669 3660 switch (field_type) { 3670 3661 case BPF_SPIN_LOCK: 3662 + case BPF_RES_SPIN_LOCK: 3671 3663 case BPF_TIMER: 3672 3664 case BPF_WORKQUEUE: 3673 3665 case BPF_LIST_NODE: ··· 3962 3952 return ERR_PTR(-ENOMEM); 3963 3953 3964 3954 rec->spin_lock_off = -EINVAL; 3955 + rec->res_spin_lock_off = -EINVAL; 3965 3956 rec->timer_off = -EINVAL; 3966 3957 rec->wq_off = -EINVAL; 3967 3958 rec->refcount_off = -EINVAL; ··· 3989 3978 WARN_ON_ONCE(rec->spin_lock_off >= 0); 3990 3979 /* Cache offset for faster lookup at runtime */ 3991 3980 rec->spin_lock_off = rec->fields[i].offset; 3981 + break; 3982 + case BPF_RES_SPIN_LOCK: 3983 + WARN_ON_ONCE(rec->spin_lock_off >= 0); 3984 + /* Cache offset for faster lookup at runtime */ 3985 + rec->res_spin_lock_off = rec->fields[i].offset; 3992 3986 break; 3993 3987 case BPF_TIMER: 3994 3988 WARN_ON_ONCE(rec->timer_off >= 0); ··· 4038 4022 rec->cnt++; 4039 4023 } 4040 4024 4025 + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { 4026 + ret = -EINVAL; 4027 + goto end; 4028 + } 4029 + 4041 4030 /* bpf_{list_head, rb_node} require bpf_spin_lock */ 4042 4031 if ((btf_record_has_field(rec, BPF_LIST_HEAD) || 4043 - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { 4032 + btf_record_has_field(rec, BPF_RB_ROOT)) && 4033 + (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { 4044 4034 ret = -EINVAL; 4045 4035 goto end; 4046 4036 } ··· 5659 5637 5660 5638 type = &tab->types[tab->cnt]; 5661 5639 type->btf_id = i; 5662 - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | 5640 + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | 5663 5641 BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | 5664 5642 BPF_KPTR, t->size); 5665 5643 /* The record cannot be unset, treat it as an error if so */
+32 -70
kernel/bpf/hashtab.c
··· 16 16 #include "bpf_lru_list.h" 17 17 #include "map_in_map.h" 18 18 #include <linux/bpf_mem_alloc.h> 19 + #include <asm/rqspinlock.h> 19 20 20 21 #define HTAB_CREATE_FLAG_MASK \ 21 22 (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ ··· 79 78 */ 80 79 struct bucket { 81 80 struct hlist_nulls_head head; 82 - raw_spinlock_t raw_lock; 81 + rqspinlock_t raw_lock; 83 82 }; 84 83 85 84 #define HASHTAB_MAP_LOCK_COUNT 8 ··· 105 104 u32 n_buckets; /* number of hash buckets */ 106 105 u32 elem_size; /* size of each element in bytes */ 107 106 u32 hashrnd; 108 - struct lock_class_key lockdep_key; 109 - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; 110 107 }; 111 108 112 109 /* each htab element is struct htab_elem + key + value */ ··· 139 140 140 141 for (i = 0; i < htab->n_buckets; i++) { 141 142 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); 142 - raw_spin_lock_init(&htab->buckets[i].raw_lock); 143 - lockdep_set_class(&htab->buckets[i].raw_lock, 144 - &htab->lockdep_key); 143 + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); 145 144 cond_resched(); 146 145 } 147 146 } 148 147 149 - static inline int htab_lock_bucket(const struct bpf_htab *htab, 150 - struct bucket *b, u32 hash, 151 - unsigned long *pflags) 148 + static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) 152 149 { 153 150 unsigned long flags; 151 + int ret; 154 152 155 - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); 156 - 157 - preempt_disable(); 158 - local_irq_save(flags); 159 - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { 160 - __this_cpu_dec(*(htab->map_locked[hash])); 161 - local_irq_restore(flags); 162 - preempt_enable(); 163 - return -EBUSY; 164 - } 165 - 166 - raw_spin_lock(&b->raw_lock); 153 + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); 154 + if (ret) 155 + return ret; 167 156 *pflags = flags; 168 - 169 157 return 0; 170 158 } 171 159 172 - static inline void htab_unlock_bucket(const struct bpf_htab *htab, 173 - struct bucket *b, u32 hash, 174 - unsigned long flags) 160 + static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) 175 161 { 176 - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); 177 - raw_spin_unlock(&b->raw_lock); 178 - __this_cpu_dec(*(htab->map_locked[hash])); 179 - local_irq_restore(flags); 180 - preempt_enable(); 162 + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); 181 163 } 182 164 183 165 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); ··· 463 483 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); 464 484 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); 465 485 struct bpf_htab *htab; 466 - int err, i; 486 + int err; 467 487 468 488 htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); 469 489 if (!htab) 470 490 return ERR_PTR(-ENOMEM); 471 - 472 - lockdep_register_key(&htab->lockdep_key); 473 491 474 492 bpf_map_init_from_attr(&htab->map, attr); 475 493 ··· 513 535 htab->map.numa_node); 514 536 if (!htab->buckets) 515 537 goto free_elem_count; 516 - 517 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { 518 - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, 519 - sizeof(int), 520 - sizeof(int), 521 - GFP_USER); 522 - if (!htab->map_locked[i]) 523 - goto free_map_locked; 524 - } 525 538 526 539 if (htab->map.map_flags & BPF_F_ZERO_SEED) 527 540 htab->hashrnd = 0; ··· 576 607 free_map_locked: 577 608 if (htab->use_percpu_counter) 578 609 percpu_counter_destroy(&htab->pcount); 579 - for (i = 0; i < 
HASHTAB_MAP_LOCK_COUNT; i++) 580 - free_percpu(htab->map_locked[i]); 581 610 bpf_map_area_free(htab->buckets); 582 611 bpf_mem_alloc_destroy(&htab->pcpu_ma); 583 612 bpf_mem_alloc_destroy(&htab->ma); 584 613 free_elem_count: 585 614 bpf_map_free_elem_count(&htab->map); 586 615 free_htab: 587 - lockdep_unregister_key(&htab->lockdep_key); 588 616 bpf_map_area_free(htab); 589 617 return ERR_PTR(err); 590 618 } ··· 786 820 b = __select_bucket(htab, tgt_l->hash); 787 821 head = &b->head; 788 822 789 - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); 823 + ret = htab_lock_bucket(b, &flags); 790 824 if (ret) 791 825 return false; 792 826 ··· 797 831 break; 798 832 } 799 833 800 - htab_unlock_bucket(htab, b, tgt_l->hash, flags); 834 + htab_unlock_bucket(b, flags); 801 835 802 836 if (l == tgt_l) 803 837 check_and_free_fields(htab, l); ··· 1116 1150 */ 1117 1151 } 1118 1152 1119 - ret = htab_lock_bucket(htab, b, hash, &flags); 1153 + ret = htab_lock_bucket(b, &flags); 1120 1154 if (ret) 1121 1155 return ret; 1122 1156 ··· 1167 1201 check_and_free_fields(htab, l_old); 1168 1202 } 1169 1203 } 1170 - htab_unlock_bucket(htab, b, hash, flags); 1204 + htab_unlock_bucket(b, flags); 1171 1205 if (l_old) { 1172 1206 if (old_map_ptr) 1173 1207 map->ops->map_fd_put_ptr(map, old_map_ptr, true); ··· 1176 1210 } 1177 1211 return 0; 1178 1212 err: 1179 - htab_unlock_bucket(htab, b, hash, flags); 1213 + htab_unlock_bucket(b, flags); 1180 1214 return ret; 1181 1215 } 1182 1216 ··· 1223 1257 copy_map_value(&htab->map, 1224 1258 l_new->key + round_up(map->key_size, 8), value); 1225 1259 1226 - ret = htab_lock_bucket(htab, b, hash, &flags); 1260 + ret = htab_lock_bucket(b, &flags); 1227 1261 if (ret) 1228 1262 goto err_lock_bucket; 1229 1263 ··· 1244 1278 ret = 0; 1245 1279 1246 1280 err: 1247 - htab_unlock_bucket(htab, b, hash, flags); 1281 + htab_unlock_bucket(b, flags); 1248 1282 1249 1283 err_lock_bucket: 1250 1284 if (ret) ··· 1281 1315 b = __select_bucket(htab, hash); 1282 1316 head = &b->head; 1283 1317 1284 - ret = htab_lock_bucket(htab, b, hash, &flags); 1318 + ret = htab_lock_bucket(b, &flags); 1285 1319 if (ret) 1286 1320 return ret; 1287 1321 ··· 1306 1340 } 1307 1341 ret = 0; 1308 1342 err: 1309 - htab_unlock_bucket(htab, b, hash, flags); 1343 + htab_unlock_bucket(b, flags); 1310 1344 return ret; 1311 1345 } 1312 1346 ··· 1347 1381 return -ENOMEM; 1348 1382 } 1349 1383 1350 - ret = htab_lock_bucket(htab, b, hash, &flags); 1384 + ret = htab_lock_bucket(b, &flags); 1351 1385 if (ret) 1352 1386 goto err_lock_bucket; 1353 1387 ··· 1371 1405 } 1372 1406 ret = 0; 1373 1407 err: 1374 - htab_unlock_bucket(htab, b, hash, flags); 1408 + htab_unlock_bucket(b, flags); 1375 1409 err_lock_bucket: 1376 1410 if (l_new) { 1377 1411 bpf_map_dec_elem_count(&htab->map); ··· 1413 1447 b = __select_bucket(htab, hash); 1414 1448 head = &b->head; 1415 1449 1416 - ret = htab_lock_bucket(htab, b, hash, &flags); 1450 + ret = htab_lock_bucket(b, &flags); 1417 1451 if (ret) 1418 1452 return ret; 1419 1453 ··· 1423 1457 else 1424 1458 ret = -ENOENT; 1425 1459 1426 - htab_unlock_bucket(htab, b, hash, flags); 1460 + htab_unlock_bucket(b, flags); 1427 1461 1428 1462 if (l) 1429 1463 free_htab_elem(htab, l); ··· 1449 1483 b = __select_bucket(htab, hash); 1450 1484 head = &b->head; 1451 1485 1452 - ret = htab_lock_bucket(htab, b, hash, &flags); 1486 + ret = htab_lock_bucket(b, &flags); 1453 1487 if (ret) 1454 1488 return ret; 1455 1489 ··· 1460 1494 else 1461 1495 ret = -ENOENT; 1462 1496 1463 - htab_unlock_bucket(htab, b, hash, 
flags); 1497 + htab_unlock_bucket(b, flags); 1464 1498 if (l) 1465 1499 htab_lru_push_free(htab, l); 1466 1500 return ret; ··· 1527 1561 static void htab_map_free(struct bpf_map *map) 1528 1562 { 1529 1563 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1530 - int i; 1531 1564 1532 1565 /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. 1533 1566 * bpf_free_used_maps() is called after bpf prog is no longer executing. ··· 1551 1586 bpf_mem_alloc_destroy(&htab->ma); 1552 1587 if (htab->use_percpu_counter) 1553 1588 percpu_counter_destroy(&htab->pcount); 1554 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 1555 - free_percpu(htab->map_locked[i]); 1556 - lockdep_unregister_key(&htab->lockdep_key); 1557 1589 bpf_map_area_free(htab); 1558 1590 } 1559 1591 ··· 1593 1631 b = __select_bucket(htab, hash); 1594 1632 head = &b->head; 1595 1633 1596 - ret = htab_lock_bucket(htab, b, hash, &bflags); 1634 + ret = htab_lock_bucket(b, &bflags); 1597 1635 if (ret) 1598 1636 return ret; 1599 1637 ··· 1630 1668 hlist_nulls_del_rcu(&l->hash_node); 1631 1669 1632 1670 out_unlock: 1633 - htab_unlock_bucket(htab, b, hash, bflags); 1671 + htab_unlock_bucket(b, bflags); 1634 1672 1635 1673 if (l) { 1636 1674 if (is_lru_map) ··· 1752 1790 head = &b->head; 1753 1791 /* do not grab the lock unless need it (bucket_cnt > 0). */ 1754 1792 if (locked) { 1755 - ret = htab_lock_bucket(htab, b, batch, &flags); 1793 + ret = htab_lock_bucket(b, &flags); 1756 1794 if (ret) { 1757 1795 rcu_read_unlock(); 1758 1796 bpf_enable_instrumentation(); ··· 1775 1813 /* Note that since bucket_cnt > 0 here, it is implicit 1776 1814 * that the locked was grabbed, so release it. 1777 1815 */ 1778 - htab_unlock_bucket(htab, b, batch, flags); 1816 + htab_unlock_bucket(b, flags); 1779 1817 rcu_read_unlock(); 1780 1818 bpf_enable_instrumentation(); 1781 1819 goto after_loop; ··· 1786 1824 /* Note that since bucket_cnt > 0 here, it is implicit 1787 1825 * that the locked was grabbed, so release it. 1788 1826 */ 1789 - htab_unlock_bucket(htab, b, batch, flags); 1827 + htab_unlock_bucket(b, flags); 1790 1828 rcu_read_unlock(); 1791 1829 bpf_enable_instrumentation(); 1792 1830 kvfree(keys); ··· 1849 1887 dst_val += value_size; 1850 1888 } 1851 1889 1852 - htab_unlock_bucket(htab, b, batch, flags); 1890 + htab_unlock_bucket(b, flags); 1853 1891 locked = false; 1854 1892 1855 1893 while (node_to_free) {
+14 -11
kernel/bpf/lpm_trie.c
··· 15 15 #include <net/ipv6.h> 16 16 #include <uapi/linux/btf.h> 17 17 #include <linux/btf_ids.h> 18 + #include <asm/rqspinlock.h> 18 19 #include <linux/bpf_mem_alloc.h> 19 20 20 21 /* Intermediate node */ ··· 37 36 size_t n_entries; 38 37 size_t max_prefixlen; 39 38 size_t data_size; 40 - raw_spinlock_t lock; 39 + rqspinlock_t lock; 41 40 }; 42 41 43 42 /* This trie implements a longest prefix match algorithm that can be used to ··· 343 342 if (!new_node) 344 343 return -ENOMEM; 345 344 346 - raw_spin_lock_irqsave(&trie->lock, irq_flags); 345 + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); 346 + if (ret) 347 + goto out_free; 347 348 348 349 new_node->prefixlen = key->prefixlen; 349 350 RCU_INIT_POINTER(new_node->child[0], NULL); ··· 359 356 */ 360 357 slot = &trie->root; 361 358 362 - while ((node = rcu_dereference_protected(*slot, 363 - lockdep_is_held(&trie->lock)))) { 359 + while ((node = rcu_dereference(*slot))) { 364 360 matchlen = longest_prefix_match(trie, node, key); 365 361 366 362 if (node->prefixlen != matchlen || ··· 444 442 rcu_assign_pointer(*slot, im_node); 445 443 446 444 out: 447 - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 448 - 445 + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); 446 + out_free: 449 447 if (ret) 450 448 bpf_mem_cache_free(&trie->ma, new_node); 451 449 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 469 467 if (key->prefixlen > trie->max_prefixlen) 470 468 return -EINVAL; 471 469 472 - raw_spin_lock_irqsave(&trie->lock, irq_flags); 470 + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); 471 + if (ret) 472 + return ret; 473 473 474 474 /* Walk the tree looking for an exact key/length match and keeping 475 475 * track of the path we traverse. We will need to know the node ··· 482 478 trim = &trie->root; 483 479 trim2 = trim; 484 480 parent = NULL; 485 - while ((node = rcu_dereference_protected( 486 - *trim, lockdep_is_held(&trie->lock)))) { 481 + while ((node = rcu_dereference(*trim))) { 487 482 matchlen = longest_prefix_match(trie, node, key); 488 483 489 484 if (node->prefixlen != matchlen || ··· 546 543 free_node = node; 547 544 548 545 out: 549 - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 546 + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); 550 547 551 548 bpf_mem_cache_free_rcu(&trie->ma, free_parent); 552 549 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 595 592 offsetof(struct bpf_lpm_trie_key_u8, data); 596 593 trie->max_prefixlen = trie->data_size * 8; 597 594 598 - raw_spin_lock_init(&trie->lock); 595 + raw_res_spin_lock_init(&trie->lock); 599 596 600 597 /* Allocate intermediate and leaf nodes from the same allocator */ 601 598 leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+28 -91
kernel/bpf/percpu_freelist.c
··· 14 14 for_each_possible_cpu(cpu) { 15 15 struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); 16 16 17 - raw_spin_lock_init(&head->lock); 17 + raw_res_spin_lock_init(&head->lock); 18 18 head->first = NULL; 19 19 } 20 - raw_spin_lock_init(&s->extralist.lock); 21 - s->extralist.first = NULL; 22 20 return 0; 23 21 } 24 22 ··· 32 34 WRITE_ONCE(head->first, node); 33 35 } 34 36 35 - static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, 37 + static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, 36 38 struct pcpu_freelist_node *node) 37 39 { 38 - raw_spin_lock(&head->lock); 39 - pcpu_freelist_push_node(head, node); 40 - raw_spin_unlock(&head->lock); 41 - } 42 - 43 - static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, 44 - struct pcpu_freelist_node *node) 45 - { 46 - if (!raw_spin_trylock(&s->extralist.lock)) 40 + if (raw_res_spin_lock(&head->lock)) 47 41 return false; 48 - 49 - pcpu_freelist_push_node(&s->extralist, node); 50 - raw_spin_unlock(&s->extralist.lock); 42 + pcpu_freelist_push_node(head, node); 43 + raw_res_spin_unlock(&head->lock); 51 44 return true; 52 - } 53 - 54 - static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, 55 - struct pcpu_freelist_node *node) 56 - { 57 - int cpu, orig_cpu; 58 - 59 - orig_cpu = raw_smp_processor_id(); 60 - while (1) { 61 - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { 62 - struct pcpu_freelist_head *head; 63 - 64 - head = per_cpu_ptr(s->freelist, cpu); 65 - if (raw_spin_trylock(&head->lock)) { 66 - pcpu_freelist_push_node(head, node); 67 - raw_spin_unlock(&head->lock); 68 - return; 69 - } 70 - } 71 - 72 - /* cannot lock any per cpu lock, try extralist */ 73 - if (pcpu_freelist_try_push_extra(s, node)) 74 - return; 75 - } 76 45 } 77 46 78 47 void __pcpu_freelist_push(struct pcpu_freelist *s, 79 48 struct pcpu_freelist_node *node) 80 49 { 81 - if (in_nmi()) 82 - ___pcpu_freelist_push_nmi(s, node); 83 - else 84 - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); 50 + struct pcpu_freelist_head *head; 51 + int cpu; 52 + 53 + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) 54 + return; 55 + 56 + while (true) { 57 + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 58 + if (cpu == raw_smp_processor_id()) 59 + continue; 60 + head = per_cpu_ptr(s->freelist, cpu); 61 + if (raw_res_spin_lock(&head->lock)) 62 + continue; 63 + pcpu_freelist_push_node(head, node); 64 + raw_res_spin_unlock(&head->lock); 65 + return; 66 + } 67 + } 85 68 } 86 69 87 70 void pcpu_freelist_push(struct pcpu_freelist *s, ··· 99 120 100 121 static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) 101 122 { 123 + struct pcpu_freelist_node *node = NULL; 102 124 struct pcpu_freelist_head *head; 103 - struct pcpu_freelist_node *node; 104 125 int cpu; 105 126 106 127 for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 107 128 head = per_cpu_ptr(s->freelist, cpu); 108 129 if (!READ_ONCE(head->first)) 109 130 continue; 110 - raw_spin_lock(&head->lock); 131 + if (raw_res_spin_lock(&head->lock)) 132 + continue; 111 133 node = head->first; 112 134 if (node) { 113 135 WRITE_ONCE(head->first, node->next); 114 - raw_spin_unlock(&head->lock); 136 + raw_res_spin_unlock(&head->lock); 115 137 return node; 116 138 } 117 - raw_spin_unlock(&head->lock); 139 + raw_res_spin_unlock(&head->lock); 118 140 } 119 - 120 - /* per cpu lists are all empty, try extralist */ 121 - if (!READ_ONCE(s->extralist.first)) 122 - return NULL; 123 - 
raw_spin_lock(&s->extralist.lock); 124 - node = s->extralist.first; 125 - if (node) 126 - WRITE_ONCE(s->extralist.first, node->next); 127 - raw_spin_unlock(&s->extralist.lock); 128 - return node; 129 - } 130 - 131 - static struct pcpu_freelist_node * 132 - ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) 133 - { 134 - struct pcpu_freelist_head *head; 135 - struct pcpu_freelist_node *node; 136 - int cpu; 137 - 138 - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 139 - head = per_cpu_ptr(s->freelist, cpu); 140 - if (!READ_ONCE(head->first)) 141 - continue; 142 - if (raw_spin_trylock(&head->lock)) { 143 - node = head->first; 144 - if (node) { 145 - WRITE_ONCE(head->first, node->next); 146 - raw_spin_unlock(&head->lock); 147 - return node; 148 - } 149 - raw_spin_unlock(&head->lock); 150 - } 151 - } 152 - 153 - /* cannot pop from per cpu lists, try extralist */ 154 - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) 155 - return NULL; 156 - node = s->extralist.first; 157 - if (node) 158 - WRITE_ONCE(s->extralist.first, node->next); 159 - raw_spin_unlock(&s->extralist.lock); 160 141 return node; 161 142 } 162 143 163 144 struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) 164 145 { 165 - if (in_nmi()) 166 - return ___pcpu_freelist_pop_nmi(s); 167 146 return ___pcpu_freelist_pop(s); 168 147 } 169 148
+2 -2
kernel/bpf/percpu_freelist.h
··· 5 5 #define __PERCPU_FREELIST_H__ 6 6 #include <linux/spinlock.h> 7 7 #include <linux/percpu.h> 8 + #include <asm/rqspinlock.h> 8 9 9 10 struct pcpu_freelist_head { 10 11 struct pcpu_freelist_node *first; 11 - raw_spinlock_t lock; 12 + rqspinlock_t lock; 12 13 }; 13 14 14 15 struct pcpu_freelist { 15 16 struct pcpu_freelist_head __percpu *freelist; 16 - struct pcpu_freelist_head extralist; 17 17 }; 18 18 19 19 struct pcpu_freelist_node {
+737
kernel/bpf/rqspinlock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Resilient Queued Spin Lock 4 + * 5 + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 6 + * (C) Copyright 2013-2014,2018 Red Hat, Inc. 7 + * (C) Copyright 2015 Intel Corp. 8 + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP 9 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 10 + * 11 + * Authors: Waiman Long <longman@redhat.com> 12 + * Peter Zijlstra <peterz@infradead.org> 13 + * Kumar Kartikeya Dwivedi <memxor@gmail.com> 14 + */ 15 + 16 + #include <linux/smp.h> 17 + #include <linux/bug.h> 18 + #include <linux/bpf.h> 19 + #include <linux/err.h> 20 + #include <linux/cpumask.h> 21 + #include <linux/percpu.h> 22 + #include <linux/hardirq.h> 23 + #include <linux/mutex.h> 24 + #include <linux/prefetch.h> 25 + #include <asm/byteorder.h> 26 + #ifdef CONFIG_QUEUED_SPINLOCKS 27 + #include <asm/qspinlock.h> 28 + #endif 29 + #include <trace/events/lock.h> 30 + #include <asm/rqspinlock.h> 31 + #include <linux/timekeeping.h> 32 + 33 + /* 34 + * Include queued spinlock definitions and statistics code 35 + */ 36 + #ifdef CONFIG_QUEUED_SPINLOCKS 37 + #include "../locking/qspinlock.h" 38 + #include "../locking/lock_events.h" 39 + #include "rqspinlock.h" 40 + #include "../locking/mcs_spinlock.h" 41 + #endif 42 + 43 + /* 44 + * The basic principle of a queue-based spinlock can best be understood 45 + * by studying a classic queue-based spinlock implementation called the 46 + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable 47 + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and 48 + * Scott") is available at 49 + * 50 + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 51 + * 52 + * This queued spinlock implementation is based on the MCS lock, however to 53 + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its 54 + * existing API, we must modify it somehow. 55 + * 56 + * In particular; where the traditional MCS lock consists of a tail pointer 57 + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to 58 + * unlock the next pending (next->locked), we compress both these: {tail, 59 + * next->locked} into a single u32 value. 60 + * 61 + * Since a spinlock disables recursion of its own context and there is a limit 62 + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there 63 + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now 64 + * we can encode the tail by combining the 2-bit nesting level with the cpu 65 + * number. With one byte for the lock value and 3 bytes for the tail, only a 66 + * 32-bit word is now needed. Even though we only need 1 bit for the lock, 67 + * we extend it to a full byte to achieve better performance for architectures 68 + * that support atomic byte write. 69 + * 70 + * We also change the first spinner to spin on the lock bit instead of its 71 + * node; whereby avoiding the need to carry a node from lock to unlock, and 72 + * preserving existing lock API. This also makes the unlock code simpler and 73 + * faster. 74 + * 75 + * N.B. The current implementation only supports architectures that allow 76 + * atomic operations on smaller 8-bit and 16-bit data types. 
77 + * 78 + */ 79 + 80 + struct rqspinlock_timeout { 81 + u64 timeout_end; 82 + u64 duration; 83 + u64 cur; 84 + u16 spin; 85 + }; 86 + 87 + #define RES_TIMEOUT_VAL 2 88 + 89 + DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); 90 + EXPORT_SYMBOL_GPL(rqspinlock_held_locks); 91 + 92 + static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) 93 + { 94 + if (!(atomic_read_acquire(&lock->val) & (mask))) 95 + return true; 96 + return false; 97 + } 98 + 99 + static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, 100 + struct rqspinlock_timeout *ts) 101 + { 102 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 103 + int cnt = min(RES_NR_HELD, rqh->cnt); 104 + 105 + /* 106 + * Return an error if we hold the lock we are attempting to acquire. 107 + * We'll iterate over max 32 locks; no need to do is_lock_released. 108 + */ 109 + for (int i = 0; i < cnt - 1; i++) { 110 + if (rqh->locks[i] == lock) 111 + return -EDEADLK; 112 + } 113 + return 0; 114 + } 115 + 116 + /* 117 + * This focuses on the most common case of ABBA deadlocks (or ABBA involving 118 + * more locks, which reduce to ABBA). This is not exhaustive, and we rely on 119 + * timeouts as the final line of defense. 120 + */ 121 + static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, 122 + struct rqspinlock_timeout *ts) 123 + { 124 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 125 + int rqh_cnt = min(RES_NR_HELD, rqh->cnt); 126 + void *remote_lock; 127 + int cpu; 128 + 129 + /* 130 + * Find the CPU holding the lock that we want to acquire. If there is a 131 + * deadlock scenario, we will read a stable set on the remote CPU and 132 + * find the target. This would be a constant time operation instead of 133 + * O(NR_CPUS) if we could determine the owning CPU from a lock value, but 134 + * that requires increasing the size of the lock word. 135 + */ 136 + for_each_possible_cpu(cpu) { 137 + struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); 138 + int real_cnt = READ_ONCE(rqh_cpu->cnt); 139 + int cnt = min(RES_NR_HELD, real_cnt); 140 + 141 + /* 142 + * Let's ensure to break out of this loop if the lock is available for 143 + * us to potentially acquire. 144 + */ 145 + if (is_lock_released(lock, mask, ts)) 146 + return 0; 147 + 148 + /* 149 + * Skip ourselves, and CPUs whose count is less than 2, as they need at 150 + * least one held lock and one acquisition attempt (reflected as top 151 + * most entry) to participate in an ABBA deadlock. 152 + * 153 + * If cnt is more than RES_NR_HELD, it means the current lock being 154 + * acquired won't appear in the table, and other locks in the table are 155 + * already held, so we can't determine ABBA. 156 + */ 157 + if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) 158 + continue; 159 + 160 + /* 161 + * Obtain the entry at the top, this corresponds to the lock the 162 + * remote CPU is attempting to acquire in a deadlock situation, 163 + * and would be one of the locks we hold on the current CPU. 164 + */ 165 + remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); 166 + /* 167 + * If it is NULL, we've raced and cannot determine a deadlock 168 + * conclusively, skip this CPU. 169 + */ 170 + if (!remote_lock) 171 + continue; 172 + /* 173 + * Find if the lock we're attempting to acquire is held by this CPU. 174 + * Don't consider the topmost entry, as that must be the latest lock 175 + * being held or acquired. 
For a deadlock, the target CPU must also 176 + * attempt to acquire a lock we hold, so for this search only 'cnt - 1' 177 + * entries are important. 178 + */ 179 + for (int i = 0; i < cnt - 1; i++) { 180 + if (READ_ONCE(rqh_cpu->locks[i]) != lock) 181 + continue; 182 + /* 183 + * We found our lock as held on the remote CPU. Is the 184 + * acquisition attempt on the remote CPU for a lock held 185 + * by us? If so, we have a deadlock situation, and need 186 + * to recover. 187 + */ 188 + for (int i = 0; i < rqh_cnt - 1; i++) { 189 + if (rqh->locks[i] == remote_lock) 190 + return -EDEADLK; 191 + } 192 + /* 193 + * Inconclusive; retry again later. 194 + */ 195 + return 0; 196 + } 197 + } 198 + return 0; 199 + } 200 + 201 + static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, 202 + struct rqspinlock_timeout *ts) 203 + { 204 + int ret; 205 + 206 + ret = check_deadlock_AA(lock, mask, ts); 207 + if (ret) 208 + return ret; 209 + ret = check_deadlock_ABBA(lock, mask, ts); 210 + if (ret) 211 + return ret; 212 + 213 + return 0; 214 + } 215 + 216 + static noinline int check_timeout(rqspinlock_t *lock, u32 mask, 217 + struct rqspinlock_timeout *ts) 218 + { 219 + u64 time = ktime_get_mono_fast_ns(); 220 + u64 prev = ts->cur; 221 + 222 + if (!ts->timeout_end) { 223 + ts->cur = time; 224 + ts->timeout_end = time + ts->duration; 225 + return 0; 226 + } 227 + 228 + if (time > ts->timeout_end) 229 + return -ETIMEDOUT; 230 + 231 + /* 232 + * A millisecond interval passed from last time? Trigger deadlock 233 + * checks. 234 + */ 235 + if (prev + NSEC_PER_MSEC < time) { 236 + ts->cur = time; 237 + return check_deadlock(lock, mask, ts); 238 + } 239 + 240 + return 0; 241 + } 242 + 243 + /* 244 + * Do not amortize with spins when res_smp_cond_load_acquire is defined, 245 + * as the macro does internal amortization for us. 246 + */ 247 + #ifndef res_smp_cond_load_acquire 248 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 249 + ({ \ 250 + if (!(ts).spin++) \ 251 + (ret) = check_timeout((lock), (mask), &(ts)); \ 252 + (ret); \ 253 + }) 254 + #else 255 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 256 + ({ (ret) = check_timeout(&(ts)); }) 257 + #endif 258 + 259 + /* 260 + * Initialize the 'spin' member. 261 + * Set spin member to 0 to trigger AA/ABBA checks immediately. 262 + */ 263 + #define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) 264 + 265 + /* 266 + * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. 267 + * Duration is defined for each spin attempt, so set it here. 268 + */ 269 + #define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) 270 + 271 + /* 272 + * Provide a test-and-set fallback for cases when queued spin lock support is 273 + * absent from the architecture. 274 + */ 275 + int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) 276 + { 277 + struct rqspinlock_timeout ts; 278 + int val, ret = 0; 279 + 280 + RES_INIT_TIMEOUT(ts); 281 + grab_held_lock_entry(lock); 282 + 283 + /* 284 + * Since the waiting loop's time is dependent on the amount of 285 + * contention, a short timeout unlike rqspinlock waiting loops 286 + * isn't enough. Choose a second as the timeout value. 
287 + */ 288 + RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); 289 + retry: 290 + val = atomic_read(&lock->val); 291 + 292 + if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { 293 + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) 294 + goto out; 295 + cpu_relax(); 296 + goto retry; 297 + } 298 + 299 + return 0; 300 + out: 301 + release_held_lock_entry(); 302 + return ret; 303 + } 304 + EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); 305 + 306 + #ifdef CONFIG_QUEUED_SPINLOCKS 307 + 308 + /* 309 + * Per-CPU queue node structures; we can never have more than 4 nested 310 + * contexts: task, softirq, hardirq, nmi. 311 + * 312 + * Exactly fits one 64-byte cacheline on a 64-bit architecture. 313 + */ 314 + static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); 315 + 316 + #ifndef res_smp_cond_load_acquire 317 + #define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) 318 + #endif 319 + 320 + #define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) 321 + 322 + /** 323 + * resilient_queued_spin_lock_slowpath - acquire the queued spinlock 324 + * @lock: Pointer to queued spinlock structure 325 + * @val: Current value of the queued spinlock 32-bit word 326 + * 327 + * Return: 328 + * * 0 - Lock was acquired successfully. 329 + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. 330 + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 331 + * 332 + * (queue tail, pending bit, lock value) 333 + * 334 + * fast : slow : unlock 335 + * : : 336 + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) 337 + * : | ^--------.------. / : 338 + * : v \ \ | : 339 + * pending : (0,1,1) +--> (0,1,0) \ | : 340 + * : | ^--' | | : 341 + * : v | | : 342 + * uncontended : (n,x,y) +--> (n,0,0) --' | : 343 + * queue : | ^--' | : 344 + * : v | : 345 + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : 346 + * queue : ^--' : 347 + */ 348 + int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) 349 + { 350 + struct mcs_spinlock *prev, *next, *node; 351 + struct rqspinlock_timeout ts; 352 + int idx, ret = 0; 353 + u32 old, tail; 354 + 355 + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); 356 + 357 + if (resilient_virt_spin_lock_enabled()) 358 + return resilient_virt_spin_lock(lock); 359 + 360 + RES_INIT_TIMEOUT(ts); 361 + 362 + /* 363 + * Wait for in-progress pending->locked hand-overs with a bounded 364 + * number of spins so that we guarantee forward progress. 365 + * 366 + * 0,1,0 -> 0,0,1 367 + */ 368 + if (val == _Q_PENDING_VAL) { 369 + int cnt = _Q_PENDING_LOOPS; 370 + val = atomic_cond_read_relaxed(&lock->val, 371 + (VAL != _Q_PENDING_VAL) || !cnt--); 372 + } 373 + 374 + /* 375 + * If we observe any contention; queue. 376 + */ 377 + if (val & ~_Q_LOCKED_MASK) 378 + goto queue; 379 + 380 + /* 381 + * trylock || pending 382 + * 383 + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock 384 + */ 385 + val = queued_fetch_set_pending_acquire(lock); 386 + 387 + /* 388 + * If we observe contention, there is a concurrent locker. 389 + * 390 + * Undo and queue; our setting of PENDING might have made the 391 + * n,0,0 -> 0,0,0 transition fail and it will now be waiting 392 + * on @next to become !NULL. 393 + */ 394 + if (unlikely(val & ~_Q_LOCKED_MASK)) { 395 + 396 + /* Undo PENDING if we set it. */ 397 + if (!(val & _Q_PENDING_MASK)) 398 + clear_pending(lock); 399 + 400 + goto queue; 401 + } 402 + 403 + /* 404 + * Grab an entry in the held locks array, to enable deadlock detection. 
405 + */ 406 + grab_held_lock_entry(lock); 407 + 408 + /* 409 + * We're pending, wait for the owner to go away. 410 + * 411 + * 0,1,1 -> *,1,0 412 + * 413 + * this wait loop must be a load-acquire such that we match the 414 + * store-release that clears the locked bit and create lock 415 + * sequentiality; this is because not all 416 + * clear_pending_set_locked() implementations imply full 417 + * barriers. 418 + */ 419 + if (val & _Q_LOCKED_MASK) { 420 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); 421 + res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); 422 + } 423 + 424 + if (ret) { 425 + /* 426 + * We waited for the locked bit to go back to 0, as the pending 427 + * waiter, but timed out. We need to clear the pending bit since 428 + * we own it. Once a stuck owner has been recovered, the lock 429 + * must be restored to a valid state, hence removing the pending 430 + * bit is necessary. 431 + * 432 + * *,1,* -> *,0,* 433 + */ 434 + clear_pending(lock); 435 + lockevent_inc(rqspinlock_lock_timeout); 436 + goto err_release_entry; 437 + } 438 + 439 + /* 440 + * take ownership and clear the pending bit. 441 + * 442 + * 0,1,0 -> 0,0,1 443 + */ 444 + clear_pending_set_locked(lock); 445 + lockevent_inc(lock_pending); 446 + return 0; 447 + 448 + /* 449 + * End of pending bit optimistic spinning and beginning of MCS 450 + * queuing. 451 + */ 452 + queue: 453 + lockevent_inc(lock_slowpath); 454 + /* 455 + * Grab deadlock detection entry for the queue path. 456 + */ 457 + grab_held_lock_entry(lock); 458 + 459 + node = this_cpu_ptr(&rqnodes[0].mcs); 460 + idx = node->count++; 461 + tail = encode_tail(smp_processor_id(), idx); 462 + 463 + trace_contention_begin(lock, LCB_F_SPIN); 464 + 465 + /* 466 + * 4 nodes are allocated based on the assumption that there will 467 + * not be nested NMIs taking spinlocks. That may not be true in 468 + * some architectures even though the chance of needing more than 469 + * 4 nodes will still be extremely unlikely. When that happens, 470 + * we fall back to spinning on the lock directly without using 471 + * any MCS node. This is not the most elegant solution, but is 472 + * simple enough. 473 + */ 474 + if (unlikely(idx >= _Q_MAX_NODES)) { 475 + lockevent_inc(lock_no_node); 476 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); 477 + while (!queued_spin_trylock(lock)) { 478 + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { 479 + lockevent_inc(rqspinlock_lock_timeout); 480 + goto err_release_node; 481 + } 482 + cpu_relax(); 483 + } 484 + goto release; 485 + } 486 + 487 + node = grab_mcs_node(node, idx); 488 + 489 + /* 490 + * Keep counts of non-zero index values: 491 + */ 492 + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); 493 + 494 + /* 495 + * Ensure that we increment the head node->count before initialising 496 + * the actual node. If the compiler is kind enough to reorder these 497 + * stores, then an IRQ could overwrite our assignments. 498 + */ 499 + barrier(); 500 + 501 + node->locked = 0; 502 + node->next = NULL; 503 + 504 + /* 505 + * We touched a (possibly) cold cacheline in the per-cpu queue node; 506 + * attempt the trylock once more in the hope someone let go while we 507 + * weren't watching. 508 + */ 509 + if (queued_spin_trylock(lock)) 510 + goto release; 511 + 512 + /* 513 + * Ensure that the initialisation of @node is complete before we 514 + * publish the updated tail via xchg_tail() and potentially link 515 + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. 
516 + */ 517 + smp_wmb(); 518 + 519 + /* 520 + * Publish the updated tail. 521 + * We have already touched the queueing cacheline; don't bother with 522 + * pending stuff. 523 + * 524 + * p,*,* -> n,*,* 525 + */ 526 + old = xchg_tail(lock, tail); 527 + next = NULL; 528 + 529 + /* 530 + * if there was a previous node; link it and wait until reaching the 531 + * head of the waitqueue. 532 + */ 533 + if (old & _Q_TAIL_MASK) { 534 + int val; 535 + 536 + prev = decode_tail(old, rqnodes); 537 + 538 + /* Link @node into the waitqueue. */ 539 + WRITE_ONCE(prev->next, node); 540 + 541 + val = arch_mcs_spin_lock_contended(&node->locked); 542 + if (val == RES_TIMEOUT_VAL) { 543 + ret = -EDEADLK; 544 + goto waitq_timeout; 545 + } 546 + 547 + /* 548 + * While waiting for the MCS lock, the next pointer may have 549 + * been set by another lock waiter. We optimistically load 550 + * the next pointer & prefetch the cacheline for writing 551 + * to reduce latency in the upcoming MCS unlock operation. 552 + */ 553 + next = READ_ONCE(node->next); 554 + if (next) 555 + prefetchw(next); 556 + } 557 + 558 + /* 559 + * we're at the head of the waitqueue, wait for the owner & pending to 560 + * go away. 561 + * 562 + * *,x,y -> *,0,0 563 + * 564 + * this wait loop must use a load-acquire such that we match the 565 + * store-release that clears the locked bit and create lock 566 + * sequentiality; this is because the set_locked() function below 567 + * does not imply a full barrier. 568 + * 569 + * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is 570 + * meant to span maximum allowed time per critical section, and we may 571 + * have both the owner of the lock and the pending bit waiter ahead of 572 + * us. 573 + */ 574 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); 575 + val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || 576 + RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); 577 + 578 + waitq_timeout: 579 + if (ret) { 580 + /* 581 + * If the tail is still pointing to us, then we are the final waiter, 582 + * and are responsible for resetting the tail back to 0. Otherwise, if 583 + * the cmpxchg operation fails, we signal the next waiter to take exit 584 + * and try the same. For a waiter with tail node 'n': 585 + * 586 + * n,*,* -> 0,*,* 587 + * 588 + * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is 589 + * possible locked/pending bits keep changing and we see failures even 590 + * when we remain the head of wait queue. However, eventually, 591 + * pending bit owner will unset the pending bit, and new waiters 592 + * will queue behind us. This will leave the lock owner in 593 + * charge, and it will eventually either set locked bit to 0, or 594 + * leave it as 1, allowing us to make progress. 595 + * 596 + * We terminate the whole wait queue for two reasons. Firstly, 597 + * we eschew per-waiter timeouts with one applied at the head of 598 + * the wait queue. This allows everyone to break out faster 599 + * once we've seen the owner / pending waiter not responding for 600 + * the timeout duration from the head. Secondly, it avoids 601 + * complicated synchronization, because when not leaving in FIFO 602 + * order, prev's next pointer needs to be fixed up etc. 
603 + */ 604 + if (!try_cmpxchg_tail(lock, tail, 0)) { 605 + next = smp_cond_load_relaxed(&node->next, VAL); 606 + WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); 607 + } 608 + lockevent_inc(rqspinlock_lock_timeout); 609 + goto err_release_node; 610 + } 611 + 612 + /* 613 + * claim the lock: 614 + * 615 + * n,0,0 -> 0,0,1 : lock, uncontended 616 + * *,*,0 -> *,*,1 : lock, contended 617 + * 618 + * If the queue head is the only one in the queue (lock value == tail) 619 + * and nobody is pending, clear the tail code and grab the lock. 620 + * Otherwise, we only need to grab the lock. 621 + */ 622 + 623 + /* 624 + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the 625 + * above wait condition, therefore any concurrent setting of 626 + * PENDING will make the uncontended transition fail. 627 + */ 628 + if ((val & _Q_TAIL_MASK) == tail) { 629 + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) 630 + goto release; /* No contention */ 631 + } 632 + 633 + /* 634 + * Either somebody is queued behind us or _Q_PENDING_VAL got set 635 + * which will then detect the remaining tail and queue behind us 636 + * ensuring we'll see a @next. 637 + */ 638 + set_locked(lock); 639 + 640 + /* 641 + * contended path; wait for next if not observed yet, release. 642 + */ 643 + if (!next) 644 + next = smp_cond_load_relaxed(&node->next, (VAL)); 645 + 646 + arch_mcs_spin_unlock_contended(&next->locked); 647 + 648 + release: 649 + trace_contention_end(lock, 0); 650 + 651 + /* 652 + * release the node 653 + */ 654 + __this_cpu_dec(rqnodes[0].mcs.count); 655 + return ret; 656 + err_release_node: 657 + trace_contention_end(lock, ret); 658 + __this_cpu_dec(rqnodes[0].mcs.count); 659 + err_release_entry: 660 + release_held_lock_entry(); 661 + return ret; 662 + } 663 + EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); 664 + 665 + #endif /* CONFIG_QUEUED_SPINLOCKS */ 666 + 667 + __bpf_kfunc_start_defs(); 668 + 669 + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) 670 + { 671 + int ret; 672 + 673 + BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); 674 + BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); 675 + 676 + preempt_disable(); 677 + ret = res_spin_lock((rqspinlock_t *)lock); 678 + if (unlikely(ret)) { 679 + preempt_enable(); 680 + return ret; 681 + } 682 + return 0; 683 + } 684 + 685 + __bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) 686 + { 687 + res_spin_unlock((rqspinlock_t *)lock); 688 + preempt_enable(); 689 + } 690 + 691 + __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 692 + { 693 + u64 *ptr = (u64 *)flags__irq_flag; 694 + unsigned long flags; 695 + int ret; 696 + 697 + preempt_disable(); 698 + local_irq_save(flags); 699 + ret = res_spin_lock((rqspinlock_t *)lock); 700 + if (unlikely(ret)) { 701 + local_irq_restore(flags); 702 + preempt_enable(); 703 + return ret; 704 + } 705 + *ptr = flags; 706 + return 0; 707 + } 708 + 709 + __bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 710 + { 711 + u64 *ptr = (u64 *)flags__irq_flag; 712 + unsigned long flags = *ptr; 713 + 714 + res_spin_unlock((rqspinlock_t *)lock); 715 + local_irq_restore(flags); 716 + preempt_enable(); 717 + } 718 + 719 + __bpf_kfunc_end_defs(); 720 + 721 + BTF_KFUNCS_START(rqspinlock_kfunc_ids) 722 + BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) 723 + BTF_ID_FLAGS(func, bpf_res_spin_unlock) 724 + 
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) 725 + BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore) 726 + BTF_KFUNCS_END(rqspinlock_kfunc_ids) 727 + 728 + static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { 729 + .owner = THIS_MODULE, 730 + .set = &rqspinlock_kfunc_ids, 731 + }; 732 + 733 + static __init int rqspinlock_register_kfuncs(void) 734 + { 735 + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); 736 + } 737 + late_initcall(rqspinlock_register_kfuncs);
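For reference, a kernel-side caller is expected to follow the same pattern as the kfunc wrappers above: the lock call can fail with -EDEADLK or -ETIMEDOUT, so the error must be propagated instead of assuming the lock was taken. A minimal sketch, with the bucket structure and function purely illustrative and not part of this series:

        #include <asm/rqspinlock.h>

        struct example_bucket {         /* illustrative container */
                rqspinlock_t lock;
        };

        static int example_update(struct example_bucket *b)
        {
                int ret;

                preempt_disable();
                ret = res_spin_lock(&b->lock);
                if (ret) {
                        /* -EDEADLK or -ETIMEDOUT: back off instead of spinning forever */
                        preempt_enable();
                        return ret;
                }
                /* ... critical section ... */
                res_spin_unlock(&b->lock);
                preempt_enable();
                return 0;
        }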
+48
kernel/bpf/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Resilient Queued Spin Lock defines 4 + * 5 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 6 + * 7 + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> 8 + */ 9 + #ifndef __LINUX_RQSPINLOCK_H 10 + #define __LINUX_RQSPINLOCK_H 11 + 12 + #include "../locking/qspinlock.h" 13 + 14 + /* 15 + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value 16 + * @lock: Pointer to queued spinlock structure 17 + * @tail: The tail to compare against 18 + * @new_tail: The new queue tail code word 19 + * Return: Bool to indicate whether the cmpxchg operation succeeded 20 + * 21 + * This is used by the head of the wait queue to clean up the queue. 22 + * Provides relaxed ordering, since observers only rely on initialized 23 + * state of the node which was made visible through the xchg_tail operation, 24 + * i.e. through the smp_wmb preceding xchg_tail. 25 + * 26 + * We avoid using 16-bit cmpxchg, which is not available on all architectures. 27 + */ 28 + static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) 29 + { 30 + u32 old, new; 31 + 32 + old = atomic_read(&lock->val); 33 + do { 34 + /* 35 + * Is the tail part we compare to already stale? Fail. 36 + */ 37 + if ((old & _Q_TAIL_MASK) != tail) 38 + return false; 39 + /* 40 + * Encode latest locked/pending state for new tail. 41 + */ 42 + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; 43 + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 44 + 45 + return true; 46 + } 47 + 48 + #endif /* __LINUX_RQSPINLOCK_H */
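The queue head calls this helper when it gives up waiting: if the tail still points to it, the tail is reset to zero, otherwise the next waiter is signalled to bail out, exactly as the slow path above does:

        if (!try_cmpxchg_tail(lock, tail, 0)) {
                next = smp_cond_load_relaxed(&node->next, VAL);
                WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
        }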
+5 -1
kernel/bpf/syscall.c
··· 648 648 case BPF_RB_ROOT: 649 649 case BPF_RB_NODE: 650 650 case BPF_SPIN_LOCK: 651 + case BPF_RES_SPIN_LOCK: 651 652 case BPF_TIMER: 652 653 case BPF_REFCOUNT: 653 654 case BPF_WORKQUEUE: ··· 701 700 case BPF_RB_ROOT: 702 701 case BPF_RB_NODE: 703 702 case BPF_SPIN_LOCK: 703 + case BPF_RES_SPIN_LOCK: 704 704 case BPF_TIMER: 705 705 case BPF_REFCOUNT: 706 706 case BPF_WORKQUEUE: ··· 779 777 780 778 switch (fields[i].type) { 781 779 case BPF_SPIN_LOCK: 780 + case BPF_RES_SPIN_LOCK: 782 781 break; 783 782 case BPF_TIMER: 784 783 bpf_timer_cancel_and_free(field_ptr); ··· 1215 1212 return -EINVAL; 1216 1213 1217 1214 map->record = btf_parse_fields(btf, value_type, 1218 - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1215 + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1219 1216 BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, 1220 1217 map->value_size); 1221 1218 if (!IS_ERR_OR_NULL(map->record)) { ··· 1234 1231 case 0: 1235 1232 continue; 1236 1233 case BPF_SPIN_LOCK: 1234 + case BPF_RES_SPIN_LOCK: 1237 1235 if (map->map_type != BPF_MAP_TYPE_HASH && 1238 1236 map->map_type != BPF_MAP_TYPE_ARRAY && 1239 1237 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
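With BPF_RES_SPIN_LOCK now handled alongside BPF_SPIN_LOCK, a map value can embed the new lock type and have btf_parse_fields() record it. A minimal BPF-side declaration, mirroring the selftests further down (the names are illustrative):

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        struct arr_elem {
                struct bpf_res_spin_lock lock;  /* recorded as a BPF_RES_SPIN_LOCK field */
        };

        struct {
                __uint(type, BPF_MAP_TYPE_ARRAY);
                __uint(max_entries, 1);
                __type(key, int);
                __type(value, struct arr_elem);
        } arrmap SEC(".maps");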
+206 -42
kernel/bpf/verifier.c
··· 456 456 457 457 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) 458 458 { 459 - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); 459 + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); 460 460 } 461 461 462 462 static bool type_is_rdonly_mem(u32 type) ··· 1155 1155 1156 1156 static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, 1157 1157 struct bpf_kfunc_call_arg_meta *meta, 1158 - struct bpf_reg_state *reg, int insn_idx) 1158 + struct bpf_reg_state *reg, int insn_idx, 1159 + int kfunc_class) 1159 1160 { 1160 1161 struct bpf_func_state *state = func(env, reg); 1161 1162 struct bpf_stack_state *slot; ··· 1178 1177 st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ 1179 1178 st->live |= REG_LIVE_WRITTEN; 1180 1179 st->ref_obj_id = id; 1180 + st->irq.kfunc_class = kfunc_class; 1181 1181 1182 1182 for (i = 0; i < BPF_REG_SIZE; i++) 1183 1183 slot->slot_type[i] = STACK_IRQ_FLAG; ··· 1187 1185 return 0; 1188 1186 } 1189 1187 1190 - static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) 1188 + static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, 1189 + int kfunc_class) 1191 1190 { 1192 1191 struct bpf_func_state *state = func(env, reg); 1193 1192 struct bpf_stack_state *slot; ··· 1201 1198 1202 1199 slot = &state->stack[spi]; 1203 1200 st = &slot->spilled_ptr; 1201 + 1202 + if (st->irq.kfunc_class != kfunc_class) { 1203 + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; 1204 + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; 1205 + 1206 + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", 1207 + flag_kfunc, used_kfunc); 1208 + return -EINVAL; 1209 + } 1204 1210 1205 1211 err = release_irq_state(env->cur_state, st->ref_obj_id); 1206 1212 WARN_ON_ONCE(err && err != -EACCES); ··· 1428 1416 dst->active_preempt_locks = src->active_preempt_locks; 1429 1417 dst->active_rcu_lock = src->active_rcu_lock; 1430 1418 dst->active_irq_id = src->active_irq_id; 1419 + dst->active_lock_id = src->active_lock_id; 1420 + dst->active_lock_ptr = src->active_lock_ptr; 1431 1421 return 0; 1432 1422 } 1433 1423 ··· 1529 1515 s->ptr = ptr; 1530 1516 1531 1517 state->active_locks++; 1518 + state->active_lock_id = id; 1519 + state->active_lock_ptr = ptr; 1532 1520 return 0; 1533 1521 } 1534 1522 ··· 1581 1565 1582 1566 static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) 1583 1567 { 1568 + void *prev_ptr = NULL; 1569 + u32 prev_id = 0; 1584 1570 int i; 1585 1571 1586 1572 for (i = 0; i < state->acquired_refs; i++) { 1587 - if (state->refs[i].type != type) 1588 - continue; 1589 - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { 1573 + if (state->refs[i].type == type && state->refs[i].id == id && 1574 + state->refs[i].ptr == ptr) { 1590 1575 release_reference_state(state, i); 1591 1576 state->active_locks--; 1577 + /* Reassign active lock (id, ptr). 
*/ 1578 + state->active_lock_id = prev_id; 1579 + state->active_lock_ptr = prev_ptr; 1592 1580 return 0; 1581 + } 1582 + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { 1583 + prev_id = state->refs[i].id; 1584 + prev_ptr = state->refs[i].ptr; 1593 1585 } 1594 1586 } 1595 1587 return -EINVAL; ··· 1633 1609 for (i = 0; i < state->acquired_refs; i++) { 1634 1610 struct bpf_reference_state *s = &state->refs[i]; 1635 1611 1636 - if (s->type != type) 1612 + if (!(s->type & type)) 1637 1613 continue; 1638 1614 1639 1615 if (s->id == id && s->ptr == ptr) ··· 8240 8216 return err; 8241 8217 } 8242 8218 8219 + enum { 8220 + PROCESS_SPIN_LOCK = (1 << 0), 8221 + PROCESS_RES_LOCK = (1 << 1), 8222 + PROCESS_LOCK_IRQ = (1 << 2), 8223 + }; 8224 + 8243 8225 /* Implementation details: 8244 8226 * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. 8245 8227 * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. ··· 8268 8238 * env->cur_state->active_locks remembers which map value element or allocated 8269 8239 * object got locked and clears it after bpf_spin_unlock. 8270 8240 */ 8271 - static int process_spin_lock(struct bpf_verifier_env *env, int regno, 8272 - bool is_lock) 8241 + static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) 8273 8242 { 8243 + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; 8244 + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; 8274 8245 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8275 8246 struct bpf_verifier_state *cur = env->cur_state; 8276 8247 bool is_const = tnum_is_const(reg->var_off); 8248 + bool is_irq = flags & PROCESS_LOCK_IRQ; 8277 8249 u64 val = reg->var_off.value; 8278 8250 struct bpf_map *map = NULL; 8279 8251 struct btf *btf = NULL; 8280 8252 struct btf_record *rec; 8253 + u32 spin_lock_off; 8281 8254 int err; 8282 8255 8283 8256 if (!is_const) { 8284 8257 verbose(env, 8285 - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", 8286 - regno); 8258 + "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", 8259 + regno, lock_str); 8287 8260 return -EINVAL; 8288 8261 } 8289 8262 if (reg->type == PTR_TO_MAP_VALUE) { 8290 8263 map = reg->map_ptr; 8291 8264 if (!map->btf) { 8292 8265 verbose(env, 8293 - "map '%s' has to have BTF in order to use bpf_spin_lock\n", 8294 - map->name); 8266 + "map '%s' has to have BTF in order to use %s_lock\n", 8267 + map->name, lock_str); 8295 8268 return -EINVAL; 8296 8269 } 8297 8270 } else { ··· 8302 8269 } 8303 8270 8304 8271 rec = reg_btf_record(reg); 8305 - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { 8306 - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", 8307 - map ? map->name : "kptr"); 8272 + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { 8273 + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", 8274 + map ? map->name : "kptr", lock_str); 8308 8275 return -EINVAL; 8309 8276 } 8310 - if (rec->spin_lock_off != val + reg->off) { 8311 - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", 8312 - val + reg->off, rec->spin_lock_off); 8277 + spin_lock_off = is_res_lock ? 
rec->res_spin_lock_off : rec->spin_lock_off; 8278 + if (spin_lock_off != val + reg->off) { 8279 + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", 8280 + val + reg->off, lock_str, spin_lock_off); 8313 8281 return -EINVAL; 8314 8282 } 8315 8283 if (is_lock) { 8316 8284 void *ptr; 8285 + int type; 8317 8286 8318 8287 if (map) 8319 8288 ptr = map; 8320 8289 else 8321 8290 ptr = btf; 8322 8291 8323 - if (cur->active_locks) { 8324 - verbose(env, 8325 - "Locking two bpf_spin_locks are not allowed\n"); 8326 - return -EINVAL; 8292 + if (!is_res_lock && cur->active_locks) { 8293 + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { 8294 + verbose(env, 8295 + "Locking two bpf_spin_locks are not allowed\n"); 8296 + return -EINVAL; 8297 + } 8298 + } else if (is_res_lock && cur->active_locks) { 8299 + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { 8300 + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); 8301 + return -EINVAL; 8302 + } 8327 8303 } 8328 - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); 8304 + 8305 + if (is_res_lock && is_irq) 8306 + type = REF_TYPE_RES_LOCK_IRQ; 8307 + else if (is_res_lock) 8308 + type = REF_TYPE_RES_LOCK; 8309 + else 8310 + type = REF_TYPE_LOCK; 8311 + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); 8329 8312 if (err < 0) { 8330 8313 verbose(env, "Failed to acquire lock state\n"); 8331 8314 return err; 8332 8315 } 8333 8316 } else { 8334 8317 void *ptr; 8318 + int type; 8335 8319 8336 8320 if (map) 8337 8321 ptr = map; ··· 8356 8306 ptr = btf; 8357 8307 8358 8308 if (!cur->active_locks) { 8359 - verbose(env, "bpf_spin_unlock without taking a lock\n"); 8309 + verbose(env, "%s_unlock without taking a lock\n", lock_str); 8360 8310 return -EINVAL; 8361 8311 } 8362 8312 8363 - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { 8364 - verbose(env, "bpf_spin_unlock of different lock\n"); 8313 + if (is_res_lock && is_irq) 8314 + type = REF_TYPE_RES_LOCK_IRQ; 8315 + else if (is_res_lock) 8316 + type = REF_TYPE_RES_LOCK; 8317 + else 8318 + type = REF_TYPE_LOCK; 8319 + if (!find_lock_state(cur, type, reg->id, ptr)) { 8320 + verbose(env, "%s_unlock of different lock\n", lock_str); 8321 + return -EINVAL; 8322 + } 8323 + if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) { 8324 + verbose(env, "%s_unlock cannot be out of order\n", lock_str); 8325 + return -EINVAL; 8326 + } 8327 + if (release_lock_state(cur, type, reg->id, ptr)) { 8328 + verbose(env, "%s_unlock of different lock\n", lock_str); 8365 8329 return -EINVAL; 8366 8330 } 8367 8331 ··· 9701 9637 return -EACCES; 9702 9638 } 9703 9639 if (meta->func_id == BPF_FUNC_spin_lock) { 9704 - err = process_spin_lock(env, regno, true); 9640 + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); 9705 9641 if (err) 9706 9642 return err; 9707 9643 } else if (meta->func_id == BPF_FUNC_spin_unlock) { 9708 - err = process_spin_lock(env, regno, false); 9644 + err = process_spin_lock(env, regno, 0); 9709 9645 if (err) 9710 9646 return err; 9711 9647 } else { ··· 11587 11523 regs[BPF_REG_0].map_uid = meta.map_uid; 11588 11524 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; 11589 11525 if (!type_may_be_null(ret_flag) && 11590 - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { 11526 + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { 11591 11527 regs[BPF_REG_0].id = ++env->id_gen; 11592 11528 } 11593 11529 break; 
··· 11759 11695 /* mark_btf_func_reg_size() is used when the reg size is determined by 11760 11696 * the BTF func_proto's return value size and argument. 11761 11697 */ 11762 - static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 11763 - size_t reg_size) 11698 + static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, 11699 + u32 regno, size_t reg_size) 11764 11700 { 11765 - struct bpf_reg_state *reg = &cur_regs(env)[regno]; 11701 + struct bpf_reg_state *reg = &regs[regno]; 11766 11702 11767 11703 if (regno == BPF_REG_0) { 11768 11704 /* Function return value */ ··· 11778 11714 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); 11779 11715 } 11780 11716 } 11717 + } 11718 + 11719 + static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 11720 + size_t reg_size) 11721 + { 11722 + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); 11781 11723 } 11782 11724 11783 11725 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) ··· 11923 11853 KF_ARG_RB_ROOT_ID, 11924 11854 KF_ARG_RB_NODE_ID, 11925 11855 KF_ARG_WORKQUEUE_ID, 11856 + KF_ARG_RES_SPIN_LOCK_ID, 11926 11857 }; 11927 11858 11928 11859 BTF_ID_LIST(kf_arg_btf_ids) ··· 11933 11862 BTF_ID(struct, bpf_rb_root) 11934 11863 BTF_ID(struct, bpf_rb_node) 11935 11864 BTF_ID(struct, bpf_wq) 11865 + BTF_ID(struct, bpf_res_spin_lock) 11936 11866 11937 11867 static bool __is_kfunc_ptr_arg_type(const struct btf *btf, 11938 11868 const struct btf_param *arg, int type) ··· 11980 11908 static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) 11981 11909 { 11982 11910 return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); 11911 + } 11912 + 11913 + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) 11914 + { 11915 + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); 11983 11916 } 11984 11917 11985 11918 static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, ··· 12058 11981 KF_ARG_PTR_TO_MAP, 12059 11982 KF_ARG_PTR_TO_WORKQUEUE, 12060 11983 KF_ARG_PTR_TO_IRQ_FLAG, 11984 + KF_ARG_PTR_TO_RES_SPIN_LOCK, 12061 11985 }; 12062 11986 12063 11987 enum special_kfunc_type { ··· 12097 12019 KF_bpf_iter_num_destroy, 12098 12020 KF_bpf_set_dentry_xattr, 12099 12021 KF_bpf_remove_dentry_xattr, 12022 + KF_bpf_res_spin_lock, 12023 + KF_bpf_res_spin_unlock, 12024 + KF_bpf_res_spin_lock_irqsave, 12025 + KF_bpf_res_spin_unlock_irqrestore, 12100 12026 }; 12101 12027 12102 12028 BTF_SET_START(special_kfunc_set) ··· 12190 12108 BTF_ID_UNUSED 12191 12109 BTF_ID_UNUSED 12192 12110 #endif 12111 + BTF_ID(func, bpf_res_spin_lock) 12112 + BTF_ID(func, bpf_res_spin_unlock) 12113 + BTF_ID(func, bpf_res_spin_lock_irqsave) 12114 + BTF_ID(func, bpf_res_spin_unlock_irqrestore) 12193 12115 12194 12116 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) 12195 12117 { ··· 12286 12200 12287 12201 if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) 12288 12202 return KF_ARG_PTR_TO_IRQ_FLAG; 12203 + 12204 + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) 12205 + return KF_ARG_PTR_TO_RES_SPIN_LOCK; 12289 12206 12290 12207 if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { 12291 12208 if (!btf_type_is_struct(ref_t)) { ··· 12397 12308 struct bpf_kfunc_call_arg_meta *meta) 12398 12309 { 12399 12310 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 12311 + int err, kfunc_class = IRQ_NATIVE_KFUNC; 
12400 12312 bool irq_save; 12401 - int err; 12402 12313 12403 - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { 12314 + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || 12315 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { 12404 12316 irq_save = true; 12405 - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { 12317 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) 12318 + kfunc_class = IRQ_LOCK_KFUNC; 12319 + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || 12320 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { 12406 12321 irq_save = false; 12322 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) 12323 + kfunc_class = IRQ_LOCK_KFUNC; 12407 12324 } else { 12408 12325 verbose(env, "verifier internal error: unknown irq flags kfunc\n"); 12409 12326 return -EFAULT; ··· 12425 12330 if (err) 12426 12331 return err; 12427 12332 12428 - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); 12333 + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); 12429 12334 if (err) 12430 12335 return err; 12431 12336 } else { ··· 12439 12344 if (err) 12440 12345 return err; 12441 12346 12442 - err = unmark_stack_slot_irq_flag(env, reg); 12347 + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); 12443 12348 if (err) 12444 12349 return err; 12445 12350 } ··· 12566 12471 12567 12472 if (!env->cur_state->active_locks) 12568 12473 return -EINVAL; 12569 - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); 12474 + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); 12570 12475 if (!s) { 12571 12476 verbose(env, "held lock and object are not in the same allocation\n"); 12572 12477 return -EINVAL; ··· 12602 12507 btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; 12603 12508 } 12604 12509 12510 + static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) 12511 + { 12512 + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || 12513 + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || 12514 + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || 12515 + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; 12516 + } 12517 + 12605 12518 static bool kfunc_spin_allowed(u32 btf_id) 12606 12519 { 12607 - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); 12520 + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || 12521 + is_bpf_res_spin_lock_kfunc(btf_id); 12608 12522 } 12609 12523 12610 12524 static bool is_sync_callback_calling_kfunc(u32 btf_id) ··· 13045 12941 case KF_ARG_PTR_TO_CONST_STR: 13046 12942 case KF_ARG_PTR_TO_WORKQUEUE: 13047 12943 case KF_ARG_PTR_TO_IRQ_FLAG: 12944 + case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13048 12945 break; 13049 12946 default: 13050 12947 WARN_ON_ONCE(1); ··· 13344 13239 if (ret < 0) 13345 13240 return ret; 13346 13241 break; 13242 + case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13243 + { 13244 + int flags = PROCESS_RES_LOCK; 13245 + 13246 + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { 13247 + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); 13248 + return -EINVAL; 13249 + } 13250 + 13251 + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) 13252 + return -EFAULT; 13253 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || 13254 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) 13255 + flags |= 
PROCESS_SPIN_LOCK; 13256 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || 13257 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) 13258 + flags |= PROCESS_LOCK_IRQ; 13259 + ret = process_spin_lock(env, regno, flags); 13260 + if (ret < 0) 13261 + return ret; 13262 + break; 13263 + } 13347 13264 } 13348 13265 } 13349 13266 ··· 13450 13323 insn_aux = &env->insn_aux_data[insn_idx]; 13451 13324 13452 13325 insn_aux->is_iter_next = is_iter_next_kfunc(&meta); 13326 + 13327 + if (!insn->off && 13328 + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || 13329 + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { 13330 + struct bpf_verifier_state *branch; 13331 + struct bpf_reg_state *regs; 13332 + 13333 + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); 13334 + if (!branch) { 13335 + verbose(env, "failed to push state for failed lock acquisition\n"); 13336 + return -ENOMEM; 13337 + } 13338 + 13339 + regs = branch->frame[branch->curframe]->regs; 13340 + 13341 + /* Clear r0-r5 registers in forked state */ 13342 + for (i = 0; i < CALLER_SAVED_REGS; i++) 13343 + mark_reg_not_init(env, regs, caller_saved[i]); 13344 + 13345 + mark_reg_unknown(env, regs, BPF_REG_0); 13346 + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); 13347 + if (err) { 13348 + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); 13349 + return err; 13350 + } 13351 + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); 13352 + } 13453 13353 13454 13354 if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { 13455 13355 verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); ··· 13648 13494 13649 13495 if (btf_type_is_scalar(t)) { 13650 13496 mark_reg_unknown(env, regs, BPF_REG_0); 13497 + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || 13498 + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) 13499 + __mark_reg_const_zero(env, &regs[BPF_REG_0]); 13651 13500 mark_btf_func_reg_size(env, BPF_REG_0, t->size); 13652 13501 } else if (btf_type_is_ptr(t)) { 13653 13502 ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); ··· 18586 18429 case STACK_IRQ_FLAG: 18587 18430 old_reg = &old->stack[spi].spilled_ptr; 18588 18431 cur_reg = &cur->stack[spi].spilled_ptr; 18589 - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) 18432 + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || 18433 + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) 18590 18434 return false; 18591 18435 break; 18592 18436 case STACK_MISC: ··· 18622 18464 if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) 18623 18465 return false; 18624 18466 18467 + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || 18468 + old->active_lock_ptr != cur->active_lock_ptr) 18469 + return false; 18470 + 18625 18471 for (i = 0; i < old->acquired_refs; i++) { 18626 18472 if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || 18627 18473 old->refs[i].type != cur->refs[i].type) ··· 18635 18473 case REF_TYPE_IRQ: 18636 18474 break; 18637 18475 case REF_TYPE_LOCK: 18476 + case REF_TYPE_RES_LOCK: 18477 + case REF_TYPE_RES_LOCK_IRQ: 18638 18478 if (old->refs[i].ptr != cur->refs[i].ptr) 18639 18479 return false; 18640 18480 break; ··· 19922 19758 } 19923 19759 } 19924 19760 19925 - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 19761 + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | 
BPF_RES_SPIN_LOCK)) { 19926 19762 if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { 19927 19763 verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); 19928 19764 return -EINVAL;
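At each res_spin_lock kfunc call site the verifier pushes a forked state that resumes after the call with a negative errno in r0 and no lock held, while the fall-through state continues with r0 == 0 and the lock acquired, so programs must branch on the return value before entering the critical section. A sketch of the accepted pattern, reusing the map declaration above (it mirrors the selftests below):

        SEC("tc")
        int use_res_lock(struct __sk_buff *ctx)
        {
                struct arr_elem *elem;

                elem = bpf_map_lookup_elem(&arrmap, &(int){0});
                if (!elem)
                        return 0;
                if (bpf_res_spin_lock(&elem->lock))     /* may fail: -EDEADLK or -ETIMEDOUT */
                        return 0;                       /* failure path: lock is not held */
                /* success path: lock is held and must be released before exit */
                bpf_res_spin_unlock(&elem->lock);
                return 0;
        }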
+5
kernel/locking/lock_events_list.h
··· 50 50 #endif /* CONFIG_QUEUED_SPINLOCKS */ 51 51 52 52 /* 53 + * Locking events for Resilient Queued Spin Lock 54 + */ 55 + LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ 56 + 57 + /* 53 58 * Locking events for rwsem 54 59 */ 55 60 LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+57
kernel/locking/locktorture.c
··· 362 362 .name = "raw_spin_lock_irq" 363 363 }; 364 364 365 + #ifdef CONFIG_BPF_SYSCALL 366 + 367 + #include <asm/rqspinlock.h> 368 + static rqspinlock_t rqspinlock; 369 + 370 + static int torture_raw_res_spin_write_lock(int tid __maybe_unused) 371 + { 372 + raw_res_spin_lock(&rqspinlock); 373 + return 0; 374 + } 375 + 376 + static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) 377 + { 378 + raw_res_spin_unlock(&rqspinlock); 379 + } 380 + 381 + static struct lock_torture_ops raw_res_spin_lock_ops = { 382 + .writelock = torture_raw_res_spin_write_lock, 383 + .write_delay = torture_spin_lock_write_delay, 384 + .task_boost = torture_rt_boost, 385 + .writeunlock = torture_raw_res_spin_write_unlock, 386 + .readlock = NULL, 387 + .read_delay = NULL, 388 + .readunlock = NULL, 389 + .name = "raw_res_spin_lock" 390 + }; 391 + 392 + static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) 393 + { 394 + unsigned long flags; 395 + 396 + raw_res_spin_lock_irqsave(&rqspinlock, flags); 397 + cxt.cur_ops->flags = flags; 398 + return 0; 399 + } 400 + 401 + static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) 402 + { 403 + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); 404 + } 405 + 406 + static struct lock_torture_ops raw_res_spin_lock_irq_ops = { 407 + .writelock = torture_raw_res_spin_write_lock_irq, 408 + .write_delay = torture_spin_lock_write_delay, 409 + .task_boost = torture_rt_boost, 410 + .writeunlock = torture_raw_res_spin_write_unlock_irq, 411 + .readlock = NULL, 412 + .read_delay = NULL, 413 + .readunlock = NULL, 414 + .name = "raw_res_spin_lock_irq" 415 + }; 416 + 417 + #endif 418 + 365 419 static DEFINE_RWLOCK(torture_rwlock); 366 420 367 421 static int torture_rwlock_write_lock(int tid __maybe_unused) ··· 1222 1168 &lock_busted_ops, 1223 1169 &spin_lock_ops, &spin_lock_irq_ops, 1224 1170 &raw_spin_lock_ops, &raw_spin_lock_irq_ops, 1171 + #ifdef CONFIG_BPF_SYSCALL 1172 + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, 1173 + #endif 1225 1174 &rw_lock_ops, &rw_lock_irq_ops, 1226 1175 &mutex_lock_ops, 1227 1176 &ww_mutex_lock_ops,
+1 -9
kernel/locking/mcs_spinlock.h
··· 15 15 16 16 #include <asm/mcs_spinlock.h> 17 17 18 - struct mcs_spinlock { 19 - struct mcs_spinlock *next; 20 - int locked; /* 1 if lock acquired */ 21 - int count; /* nesting count, see qspinlock.c */ 22 - }; 23 - 24 18 #ifndef arch_mcs_spin_lock_contended 25 19 /* 26 20 * Using smp_cond_load_acquire() provides the acquire semantics ··· 24 30 * spinning, and smp_cond_load_acquire() provides that behavior. 25 31 */ 26 32 #define arch_mcs_spin_lock_contended(l) \ 27 - do { \ 28 - smp_cond_load_acquire(l, VAL); \ 29 - } while (0) 33 + smp_cond_load_acquire(l, VAL) 30 34 #endif 31 35 32 36 #ifndef arch_mcs_spin_unlock_contended
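arch_mcs_spin_lock_contended() becomes an expression so that it yields the value which terminated the wait; the rqspinlock slow path relies on this to tell a normal hand-off apart from a timeout signalled by the waiter ahead of it, as in the check it performs:

        val = arch_mcs_spin_lock_contended(&node->locked);
        if (val == RES_TIMEOUT_VAL) {   /* previous waiter gave up and told us to exit */
                ret = -EDEADLK;
                goto waitq_timeout;
        }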
+5 -188
kernel/locking/qspinlock.c
··· 25 25 #include <trace/events/lock.h> 26 26 27 27 /* 28 - * Include queued spinlock statistics code 28 + * Include queued spinlock definitions and statistics code 29 29 */ 30 + #include "qspinlock.h" 30 31 #include "qspinlock_stat.h" 31 32 32 33 /* ··· 68 67 */ 69 68 70 69 #include "mcs_spinlock.h" 71 - #define MAX_NODES 4 72 - 73 - /* 74 - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in 75 - * size and four of them will fit nicely in one 64-byte cacheline. For 76 - * pvqspinlock, however, we need more space for extra data. To accommodate 77 - * that, we insert two more long words to pad it up to 32 bytes. IOW, only 78 - * two of them can fit in a cacheline in this case. That is OK as it is rare 79 - * to have more than 2 levels of slowpath nesting in actual use. We don't 80 - * want to penalize pvqspinlocks to optimize for a rare case in native 81 - * qspinlocks. 82 - */ 83 - struct qnode { 84 - struct mcs_spinlock mcs; 85 - #ifdef CONFIG_PARAVIRT_SPINLOCKS 86 - long reserved[2]; 87 - #endif 88 - }; 89 - 90 - /* 91 - * The pending bit spinning loop count. 92 - * This heuristic is used to limit the number of lockword accesses 93 - * made by atomic_cond_read_relaxed when waiting for the lock to 94 - * transition out of the "== _Q_PENDING_VAL" state. We don't spin 95 - * indefinitely because there's no guarantee that we'll make forward 96 - * progress. 97 - */ 98 - #ifndef _Q_PENDING_LOOPS 99 - #define _Q_PENDING_LOOPS 1 100 - #endif 101 70 102 71 /* 103 72 * Per-CPU queue node structures; we can never have more than 4 nested ··· 77 106 * 78 107 * PV doubles the storage and uses the second cacheline for PV state. 79 108 */ 80 - static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); 81 - 82 - /* 83 - * We must be able to distinguish between no-tail and the tail at 0:0, 84 - * therefore increment the cpu number by one. 85 - */ 86 - 87 - static inline __pure u32 encode_tail(int cpu, int idx) 88 - { 89 - u32 tail; 90 - 91 - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 92 - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 93 - 94 - return tail; 95 - } 96 - 97 - static inline __pure struct mcs_spinlock *decode_tail(u32 tail) 98 - { 99 - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 100 - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 101 - 102 - return per_cpu_ptr(&qnodes[idx].mcs, cpu); 103 - } 104 - 105 - static inline __pure 106 - struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) 107 - { 108 - return &((struct qnode *)base + idx)->mcs; 109 - } 110 - 111 - #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 112 - 113 - #if _Q_PENDING_BITS == 8 114 - /** 115 - * clear_pending - clear the pending bit. 116 - * @lock: Pointer to queued spinlock structure 117 - * 118 - * *,1,* -> *,0,* 119 - */ 120 - static __always_inline void clear_pending(struct qspinlock *lock) 121 - { 122 - WRITE_ONCE(lock->pending, 0); 123 - } 124 - 125 - /** 126 - * clear_pending_set_locked - take ownership and clear the pending bit. 127 - * @lock: Pointer to queued spinlock structure 128 - * 129 - * *,1,0 -> *,0,1 130 - * 131 - * Lock stealing is not allowed if this function is used. 
132 - */ 133 - static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 134 - { 135 - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 136 - } 137 - 138 - /* 139 - * xchg_tail - Put in the new queue tail code word & retrieve previous one 140 - * @lock : Pointer to queued spinlock structure 141 - * @tail : The new queue tail code word 142 - * Return: The previous queue tail code word 143 - * 144 - * xchg(lock, tail), which heads an address dependency 145 - * 146 - * p,*,* -> n,*,* ; prev = xchg(lock, node) 147 - */ 148 - static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 149 - { 150 - /* 151 - * We can use relaxed semantics since the caller ensures that the 152 - * MCS node is properly initialized before updating the tail. 153 - */ 154 - return (u32)xchg_relaxed(&lock->tail, 155 - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 156 - } 157 - 158 - #else /* _Q_PENDING_BITS == 8 */ 159 - 160 - /** 161 - * clear_pending - clear the pending bit. 162 - * @lock: Pointer to queued spinlock structure 163 - * 164 - * *,1,* -> *,0,* 165 - */ 166 - static __always_inline void clear_pending(struct qspinlock *lock) 167 - { 168 - atomic_andnot(_Q_PENDING_VAL, &lock->val); 169 - } 170 - 171 - /** 172 - * clear_pending_set_locked - take ownership and clear the pending bit. 173 - * @lock: Pointer to queued spinlock structure 174 - * 175 - * *,1,0 -> *,0,1 176 - */ 177 - static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 178 - { 179 - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); 180 - } 181 - 182 - /** 183 - * xchg_tail - Put in the new queue tail code word & retrieve previous one 184 - * @lock : Pointer to queued spinlock structure 185 - * @tail : The new queue tail code word 186 - * Return: The previous queue tail code word 187 - * 188 - * xchg(lock, tail) 189 - * 190 - * p,*,* -> n,*,* ; prev = xchg(lock, node) 191 - */ 192 - static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 193 - { 194 - u32 old, new; 195 - 196 - old = atomic_read(&lock->val); 197 - do { 198 - new = (old & _Q_LOCKED_PENDING_MASK) | tail; 199 - /* 200 - * We can use relaxed semantics since the caller ensures that 201 - * the MCS node is properly initialized before updating the 202 - * tail. 203 - */ 204 - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 205 - 206 - return old; 207 - } 208 - #endif /* _Q_PENDING_BITS == 8 */ 209 - 210 - /** 211 - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending 212 - * @lock : Pointer to queued spinlock structure 213 - * Return: The previous lock value 214 - * 215 - * *,*,* -> *,1,* 216 - */ 217 - #ifndef queued_fetch_set_pending_acquire 218 - static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) 219 - { 220 - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); 221 - } 222 - #endif 223 - 224 - /** 225 - * set_locked - Set the lock bit and own the lock 226 - * @lock: Pointer to queued spinlock structure 227 - * 228 - * *,*,0 -> *,0,1 229 - */ 230 - static __always_inline void set_locked(struct qspinlock *lock) 231 - { 232 - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 233 - } 234 - 109 + static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); 235 110 236 111 /* 237 112 * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for ··· 227 410 * any MCS node. This is not the most elegant solution, but is 228 411 * simple enough. 
229 412 */ 230 - if (unlikely(idx >= MAX_NODES)) { 413 + if (unlikely(idx >= _Q_MAX_NODES)) { 231 414 lockevent_inc(lock_no_node); 232 415 while (!queued_spin_trylock(lock)) 233 416 cpu_relax(); ··· 282 465 * head of the waitqueue. 283 466 */ 284 467 if (old & _Q_TAIL_MASK) { 285 - prev = decode_tail(old); 468 + prev = decode_tail(old, qnodes); 286 469 287 470 /* Link @node into the waitqueue. */ 288 471 WRITE_ONCE(prev->next, node);
+201
kernel/locking/qspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Queued spinlock defines 4 + * 5 + * This file contains macro definitions and functions shared between different 6 + * qspinlock slow path implementations. 7 + */ 8 + #ifndef __LINUX_QSPINLOCK_H 9 + #define __LINUX_QSPINLOCK_H 10 + 11 + #include <asm-generic/percpu.h> 12 + #include <linux/percpu-defs.h> 13 + #include <asm-generic/qspinlock.h> 14 + #include <asm-generic/mcs_spinlock.h> 15 + 16 + #define _Q_MAX_NODES 4 17 + 18 + /* 19 + * The pending bit spinning loop count. 20 + * This heuristic is used to limit the number of lockword accesses 21 + * made by atomic_cond_read_relaxed when waiting for the lock to 22 + * transition out of the "== _Q_PENDING_VAL" state. We don't spin 23 + * indefinitely because there's no guarantee that we'll make forward 24 + * progress. 25 + */ 26 + #ifndef _Q_PENDING_LOOPS 27 + #define _Q_PENDING_LOOPS 1 28 + #endif 29 + 30 + /* 31 + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in 32 + * size and four of them will fit nicely in one 64-byte cacheline. For 33 + * pvqspinlock, however, we need more space for extra data. To accommodate 34 + * that, we insert two more long words to pad it up to 32 bytes. IOW, only 35 + * two of them can fit in a cacheline in this case. That is OK as it is rare 36 + * to have more than 2 levels of slowpath nesting in actual use. We don't 37 + * want to penalize pvqspinlocks to optimize for a rare case in native 38 + * qspinlocks. 39 + */ 40 + struct qnode { 41 + struct mcs_spinlock mcs; 42 + #ifdef CONFIG_PARAVIRT_SPINLOCKS 43 + long reserved[2]; 44 + #endif 45 + }; 46 + 47 + /* 48 + * We must be able to distinguish between no-tail and the tail at 0:0, 49 + * therefore increment the cpu number by one. 50 + */ 51 + 52 + static inline __pure u32 encode_tail(int cpu, int idx) 53 + { 54 + u32 tail; 55 + 56 + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 57 + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 58 + 59 + return tail; 60 + } 61 + 62 + static inline __pure struct mcs_spinlock *decode_tail(u32 tail, 63 + struct qnode __percpu *qnodes) 64 + { 65 + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 66 + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 67 + 68 + return per_cpu_ptr(&qnodes[idx].mcs, cpu); 69 + } 70 + 71 + static inline __pure 72 + struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) 73 + { 74 + return &((struct qnode *)base + idx)->mcs; 75 + } 76 + 77 + #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 78 + 79 + #if _Q_PENDING_BITS == 8 80 + /** 81 + * clear_pending - clear the pending bit. 82 + * @lock: Pointer to queued spinlock structure 83 + * 84 + * *,1,* -> *,0,* 85 + */ 86 + static __always_inline void clear_pending(struct qspinlock *lock) 87 + { 88 + WRITE_ONCE(lock->pending, 0); 89 + } 90 + 91 + /** 92 + * clear_pending_set_locked - take ownership and clear the pending bit. 93 + * @lock: Pointer to queued spinlock structure 94 + * 95 + * *,1,0 -> *,0,1 96 + * 97 + * Lock stealing is not allowed if this function is used. 
98 + */ 99 + static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 100 + { 101 + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 102 + } 103 + 104 + /* 105 + * xchg_tail - Put in the new queue tail code word & retrieve previous one 106 + * @lock : Pointer to queued spinlock structure 107 + * @tail : The new queue tail code word 108 + * Return: The previous queue tail code word 109 + * 110 + * xchg(lock, tail), which heads an address dependency 111 + * 112 + * p,*,* -> n,*,* ; prev = xchg(lock, node) 113 + */ 114 + static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 115 + { 116 + /* 117 + * We can use relaxed semantics since the caller ensures that the 118 + * MCS node is properly initialized before updating the tail. 119 + */ 120 + return (u32)xchg_relaxed(&lock->tail, 121 + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 122 + } 123 + 124 + #else /* _Q_PENDING_BITS == 8 */ 125 + 126 + /** 127 + * clear_pending - clear the pending bit. 128 + * @lock: Pointer to queued spinlock structure 129 + * 130 + * *,1,* -> *,0,* 131 + */ 132 + static __always_inline void clear_pending(struct qspinlock *lock) 133 + { 134 + atomic_andnot(_Q_PENDING_VAL, &lock->val); 135 + } 136 + 137 + /** 138 + * clear_pending_set_locked - take ownership and clear the pending bit. 139 + * @lock: Pointer to queued spinlock structure 140 + * 141 + * *,1,0 -> *,0,1 142 + */ 143 + static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 144 + { 145 + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); 146 + } 147 + 148 + /** 149 + * xchg_tail - Put in the new queue tail code word & retrieve previous one 150 + * @lock : Pointer to queued spinlock structure 151 + * @tail : The new queue tail code word 152 + * Return: The previous queue tail code word 153 + * 154 + * xchg(lock, tail) 155 + * 156 + * p,*,* -> n,*,* ; prev = xchg(lock, node) 157 + */ 158 + static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 159 + { 160 + u32 old, new; 161 + 162 + old = atomic_read(&lock->val); 163 + do { 164 + new = (old & _Q_LOCKED_PENDING_MASK) | tail; 165 + /* 166 + * We can use relaxed semantics since the caller ensures that 167 + * the MCS node is properly initialized before updating the 168 + * tail. 169 + */ 170 + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 171 + 172 + return old; 173 + } 174 + #endif /* _Q_PENDING_BITS == 8 */ 175 + 176 + /** 177 + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending 178 + * @lock : Pointer to queued spinlock structure 179 + * Return: The previous lock value 180 + * 181 + * *,*,* -> *,1,* 182 + */ 183 + #ifndef queued_fetch_set_pending_acquire 184 + static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) 185 + { 186 + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); 187 + } 188 + #endif 189 + 190 + /** 191 + * set_locked - Set the lock bit and own the lock 192 + * @lock: Pointer to queued spinlock structure 193 + * 194 + * *,*,0 -> *,0,1 195 + */ 196 + static __always_inline void set_locked(struct qspinlock *lock) 197 + { 198 + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 199 + } 200 + 201 + #endif /* __LINUX_QSPINLOCK_H */
+98
tools/testing/selftests/bpf/prog_tests/res_spin_lock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <network_helpers.h> 5 + #include <sys/sysinfo.h> 6 + 7 + #include "res_spin_lock.skel.h" 8 + #include "res_spin_lock_fail.skel.h" 9 + 10 + void test_res_spin_lock_failure(void) 11 + { 12 + RUN_TESTS(res_spin_lock_fail); 13 + } 14 + 15 + static volatile int skip; 16 + 17 + static void *spin_lock_thread(void *arg) 18 + { 19 + int err, prog_fd = *(u32 *) arg; 20 + LIBBPF_OPTS(bpf_test_run_opts, topts, 21 + .data_in = &pkt_v4, 22 + .data_size_in = sizeof(pkt_v4), 23 + .repeat = 10000, 24 + ); 25 + 26 + while (!READ_ONCE(skip)) { 27 + err = bpf_prog_test_run_opts(prog_fd, &topts); 28 + ASSERT_OK(err, "test_run"); 29 + ASSERT_OK(topts.retval, "test_run retval"); 30 + } 31 + pthread_exit(arg); 32 + } 33 + 34 + void test_res_spin_lock_success(void) 35 + { 36 + LIBBPF_OPTS(bpf_test_run_opts, topts, 37 + .data_in = &pkt_v4, 38 + .data_size_in = sizeof(pkt_v4), 39 + .repeat = 1, 40 + ); 41 + struct res_spin_lock *skel; 42 + pthread_t thread_id[16]; 43 + int prog_fd, i, err; 44 + void *ret; 45 + 46 + if (get_nprocs() < 2) { 47 + test__skip(); 48 + return; 49 + } 50 + 51 + skel = res_spin_lock__open_and_load(); 52 + if (!ASSERT_OK_PTR(skel, "res_spin_lock__open_and_load")) 53 + return; 54 + /* AA deadlock */ 55 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test); 56 + err = bpf_prog_test_run_opts(prog_fd, &topts); 57 + ASSERT_OK(err, "error"); 58 + ASSERT_OK(topts.retval, "retval"); 59 + 60 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test_held_lock_max); 61 + err = bpf_prog_test_run_opts(prog_fd, &topts); 62 + ASSERT_OK(err, "error"); 63 + ASSERT_OK(topts.retval, "retval"); 64 + 65 + /* Multi-threaded ABBA deadlock. */ 66 + 67 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test_AB); 68 + for (i = 0; i < 16; i++) { 69 + int err; 70 + 71 + err = pthread_create(&thread_id[i], NULL, &spin_lock_thread, &prog_fd); 72 + if (!ASSERT_OK(err, "pthread_create")) 73 + goto end; 74 + } 75 + 76 + topts.retval = 0; 77 + topts.repeat = 1000; 78 + int fd = bpf_program__fd(skel->progs.res_spin_lock_test_BA); 79 + while (!topts.retval && !err && !READ_ONCE(skel->bss->err)) { 80 + err = bpf_prog_test_run_opts(fd, &topts); 81 + } 82 + 83 + WRITE_ONCE(skip, true); 84 + 85 + for (i = 0; i < 16; i++) { 86 + if (!ASSERT_OK(pthread_join(thread_id[i], &ret), "pthread_join")) 87 + goto end; 88 + if (!ASSERT_EQ(ret, &prog_fd, "ret == prog_fd")) 89 + goto end; 90 + } 91 + 92 + ASSERT_EQ(READ_ONCE(skel->bss->err), -EDEADLK, "timeout err"); 93 + ASSERT_OK(err, "err"); 94 + ASSERT_EQ(topts.retval, -EDEADLK, "timeout"); 95 + end: 96 + res_spin_lock__destroy(skel); 97 + return; 98 + }
+53
tools/testing/selftests/bpf/progs/irq.c
··· 11 11 extern void bpf_local_irq_restore(unsigned long *) __weak __ksym; 12 12 extern int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign, u64 flags) __weak __ksym; 13 13 14 + struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); 15 + struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); 16 + 14 17 SEC("?tc") 15 18 __failure __msg("arg#0 doesn't point to an irq flag on stack") 16 19 int irq_save_bad_arg(struct __sk_buff *ctx) ··· 510 507 bpf_local_irq_save(&flags); 511 508 global_subprog_calling_sleepable_global(0); 512 509 bpf_local_irq_restore(&flags); 510 + return 0; 511 + } 512 + 513 + SEC("?tc") 514 + __failure __msg("cannot restore irq state out of order") 515 + int irq_ooo_lock_cond_inv(struct __sk_buff *ctx) 516 + { 517 + unsigned long flags1, flags2; 518 + 519 + if (bpf_res_spin_lock_irqsave(&lockA, &flags1)) 520 + return 0; 521 + if (bpf_res_spin_lock_irqsave(&lockB, &flags2)) { 522 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 523 + return 0; 524 + } 525 + 526 + bpf_res_spin_unlock_irqrestore(&lockB, &flags1); 527 + bpf_res_spin_unlock_irqrestore(&lockA, &flags2); 528 + return 0; 529 + } 530 + 531 + SEC("?tc") 532 + __failure __msg("function calls are not allowed") 533 + int irq_wrong_kfunc_class_1(struct __sk_buff *ctx) 534 + { 535 + unsigned long flags1; 536 + 537 + if (bpf_res_spin_lock_irqsave(&lockA, &flags1)) 538 + return 0; 539 + /* For now, bpf_local_irq_restore is not allowed in critical section, 540 + * but this test ensures error will be caught with kfunc_class when it's 541 + * opened up. Tested by temporarily permitting this kfunc in critical 542 + * section. 543 + */ 544 + bpf_local_irq_restore(&flags1); 545 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 546 + return 0; 547 + } 548 + 549 + SEC("?tc") 550 + __failure __msg("function calls are not allowed") 551 + int irq_wrong_kfunc_class_2(struct __sk_buff *ctx) 552 + { 553 + unsigned long flags1, flags2; 554 + 555 + bpf_local_irq_save(&flags1); 556 + if (bpf_res_spin_lock_irqsave(&lockA, &flags2)) 557 + return 0; 558 + bpf_local_irq_restore(&flags2); 559 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 513 560 return 0; 514 561 } 515 562
+143
tools/testing/selftests/bpf/progs/res_spin_lock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include "bpf_misc.h" 7 + 8 + #define EDEADLK 35 9 + #define ETIMEDOUT 110 10 + 11 + struct arr_elem { 12 + struct bpf_res_spin_lock lock; 13 + }; 14 + 15 + struct { 16 + __uint(type, BPF_MAP_TYPE_ARRAY); 17 + __uint(max_entries, 64); 18 + __type(key, int); 19 + __type(value, struct arr_elem); 20 + } arrmap SEC(".maps"); 21 + 22 + struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); 23 + struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); 24 + 25 + SEC("tc") 26 + int res_spin_lock_test(struct __sk_buff *ctx) 27 + { 28 + struct arr_elem *elem1, *elem2; 29 + int r; 30 + 31 + elem1 = bpf_map_lookup_elem(&arrmap, &(int){0}); 32 + if (!elem1) 33 + return -1; 34 + elem2 = bpf_map_lookup_elem(&arrmap, &(int){0}); 35 + if (!elem2) 36 + return -1; 37 + 38 + r = bpf_res_spin_lock(&elem1->lock); 39 + if (r) 40 + return r; 41 + if (!bpf_res_spin_lock(&elem2->lock)) { 42 + bpf_res_spin_unlock(&elem2->lock); 43 + bpf_res_spin_unlock(&elem1->lock); 44 + return -1; 45 + } 46 + bpf_res_spin_unlock(&elem1->lock); 47 + return 0; 48 + } 49 + 50 + SEC("tc") 51 + int res_spin_lock_test_AB(struct __sk_buff *ctx) 52 + { 53 + int r; 54 + 55 + r = bpf_res_spin_lock(&lockA); 56 + if (r) 57 + return !r; 58 + /* Only unlock if we took the lock. */ 59 + if (!bpf_res_spin_lock(&lockB)) 60 + bpf_res_spin_unlock(&lockB); 61 + bpf_res_spin_unlock(&lockA); 62 + return 0; 63 + } 64 + 65 + int err; 66 + 67 + SEC("tc") 68 + int res_spin_lock_test_BA(struct __sk_buff *ctx) 69 + { 70 + int r; 71 + 72 + r = bpf_res_spin_lock(&lockB); 73 + if (r) 74 + return !r; 75 + if (!bpf_res_spin_lock(&lockA)) 76 + bpf_res_spin_unlock(&lockA); 77 + else 78 + err = -EDEADLK; 79 + bpf_res_spin_unlock(&lockB); 80 + return err ?: 0; 81 + } 82 + 83 + SEC("tc") 84 + int res_spin_lock_test_held_lock_max(struct __sk_buff *ctx) 85 + { 86 + struct bpf_res_spin_lock *locks[48] = {}; 87 + struct arr_elem *e; 88 + u64 time_beg, time; 89 + int ret = 0, i; 90 + 91 + _Static_assert(ARRAY_SIZE(((struct rqspinlock_held){}).locks) == 31, 92 + "RES_NR_HELD assumed to be 31"); 93 + 94 + for (i = 0; i < 34; i++) { 95 + int key = i; 96 + 97 + /* We cannot pass in i as it will get spilled/filled by the compiler and 98 + * loses bounds in verifier state. 99 + */ 100 + e = bpf_map_lookup_elem(&arrmap, &key); 101 + if (!e) 102 + return 1; 103 + locks[i] = &e->lock; 104 + } 105 + 106 + for (; i < 48; i++) { 107 + int key = i - 2; 108 + 109 + /* We cannot pass in i as it will get spilled/filled by the compiler and 110 + * loses bounds in verifier state. 111 + */ 112 + e = bpf_map_lookup_elem(&arrmap, &key); 113 + if (!e) 114 + return 1; 115 + locks[i] = &e->lock; 116 + } 117 + 118 + time_beg = bpf_ktime_get_ns(); 119 + for (i = 0; i < 34; i++) { 120 + if (bpf_res_spin_lock(locks[i])) 121 + goto end; 122 + } 123 + 124 + /* Trigger AA, after exhausting entries in the held lock table. This 125 + * time, only the timeout can save us, as AA detection won't succeed. 126 + */ 127 + if (!bpf_res_spin_lock(locks[34])) { 128 + bpf_res_spin_unlock(locks[34]); 129 + ret = 1; 130 + goto end; 131 + } 132 + 133 + end: 134 + for (i = i - 1; i >= 0; i--) 135 + bpf_res_spin_unlock(locks[i]); 136 + time = bpf_ktime_get_ns() - time_beg; 137 + /* Time spent should be easily above our limit (1/4 s), since AA 138 + * detection won't be expedited due to lack of held lock entry. 
139 + */ 140 + return ret ?: (time > 1000000000 / 4 ? 0 : 1); 141 + } 142 + 143 + char _license[] SEC("license") = "GPL";
+244
tools/testing/selftests/bpf/progs/res_spin_lock_fail.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_core_read.h> 7 + #include "bpf_misc.h" 8 + #include "bpf_experimental.h" 9 + 10 + struct arr_elem { 11 + struct bpf_res_spin_lock lock; 12 + }; 13 + 14 + struct { 15 + __uint(type, BPF_MAP_TYPE_ARRAY); 16 + __uint(max_entries, 1); 17 + __type(key, int); 18 + __type(value, struct arr_elem); 19 + } arrmap SEC(".maps"); 20 + 21 + long value; 22 + 23 + struct bpf_spin_lock lock __hidden SEC(".data.A"); 24 + struct bpf_res_spin_lock res_lock __hidden SEC(".data.B"); 25 + 26 + SEC("?tc") 27 + __failure __msg("point to map value or allocated object") 28 + int res_spin_lock_arg(struct __sk_buff *ctx) 29 + { 30 + struct arr_elem *elem; 31 + 32 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 33 + if (!elem) 34 + return 0; 35 + bpf_res_spin_lock((struct bpf_res_spin_lock *)bpf_core_cast(&elem->lock, struct __sk_buff)); 36 + bpf_res_spin_lock(&elem->lock); 37 + return 0; 38 + } 39 + 40 + SEC("?tc") 41 + __failure __msg("AA deadlock detected") 42 + int res_spin_lock_AA(struct __sk_buff *ctx) 43 + { 44 + struct arr_elem *elem; 45 + 46 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 47 + if (!elem) 48 + return 0; 49 + bpf_res_spin_lock(&elem->lock); 50 + bpf_res_spin_lock(&elem->lock); 51 + return 0; 52 + } 53 + 54 + SEC("?tc") 55 + __failure __msg("AA deadlock detected") 56 + int res_spin_lock_cond_AA(struct __sk_buff *ctx) 57 + { 58 + struct arr_elem *elem; 59 + 60 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 61 + if (!elem) 62 + return 0; 63 + if (bpf_res_spin_lock(&elem->lock)) 64 + return 0; 65 + bpf_res_spin_lock(&elem->lock); 66 + return 0; 67 + } 68 + 69 + SEC("?tc") 70 + __failure __msg("unlock of different lock") 71 + int res_spin_lock_mismatch_1(struct __sk_buff *ctx) 72 + { 73 + struct arr_elem *elem; 74 + 75 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 76 + if (!elem) 77 + return 0; 78 + if (bpf_res_spin_lock(&elem->lock)) 79 + return 0; 80 + bpf_res_spin_unlock(&res_lock); 81 + return 0; 82 + } 83 + 84 + SEC("?tc") 85 + __failure __msg("unlock of different lock") 86 + int res_spin_lock_mismatch_2(struct __sk_buff *ctx) 87 + { 88 + struct arr_elem *elem; 89 + 90 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 91 + if (!elem) 92 + return 0; 93 + if (bpf_res_spin_lock(&res_lock)) 94 + return 0; 95 + bpf_res_spin_unlock(&elem->lock); 96 + return 0; 97 + } 98 + 99 + SEC("?tc") 100 + __failure __msg("unlock of different lock") 101 + int res_spin_lock_irq_mismatch_1(struct __sk_buff *ctx) 102 + { 103 + struct arr_elem *elem; 104 + unsigned long f1; 105 + 106 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 107 + if (!elem) 108 + return 0; 109 + bpf_local_irq_save(&f1); 110 + if (bpf_res_spin_lock(&res_lock)) 111 + return 0; 112 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 113 + return 0; 114 + } 115 + 116 + SEC("?tc") 117 + __failure __msg("unlock of different lock") 118 + int res_spin_lock_irq_mismatch_2(struct __sk_buff *ctx) 119 + { 120 + struct arr_elem *elem; 121 + unsigned long f1; 122 + 123 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 124 + if (!elem) 125 + return 0; 126 + if (bpf_res_spin_lock_irqsave(&res_lock, &f1)) 127 + return 0; 128 + bpf_res_spin_unlock(&res_lock); 129 + return 0; 130 + } 131 + 132 + SEC("?tc") 133 + __success 134 + int res_spin_lock_ooo(struct __sk_buff *ctx) 135 + { 136 + struct arr_elem *elem; 137 + 138 + elem = 
bpf_map_lookup_elem(&arrmap, &(int){0}); 139 + if (!elem) 140 + return 0; 141 + if (bpf_res_spin_lock(&res_lock)) 142 + return 0; 143 + if (bpf_res_spin_lock(&elem->lock)) { 144 + bpf_res_spin_unlock(&res_lock); 145 + return 0; 146 + } 147 + bpf_res_spin_unlock(&elem->lock); 148 + bpf_res_spin_unlock(&res_lock); 149 + return 0; 150 + } 151 + 152 + SEC("?tc") 153 + __success 154 + int res_spin_lock_ooo_irq(struct __sk_buff *ctx) 155 + { 156 + struct arr_elem *elem; 157 + unsigned long f1, f2; 158 + 159 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 160 + if (!elem) 161 + return 0; 162 + if (bpf_res_spin_lock_irqsave(&res_lock, &f1)) 163 + return 0; 164 + if (bpf_res_spin_lock_irqsave(&elem->lock, &f2)) { 165 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 166 + /* We won't have a unreleased IRQ flag error here. */ 167 + return 0; 168 + } 169 + bpf_res_spin_unlock_irqrestore(&elem->lock, &f2); 170 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 171 + return 0; 172 + } 173 + 174 + struct bpf_res_spin_lock lock1 __hidden SEC(".data.OO1"); 175 + struct bpf_res_spin_lock lock2 __hidden SEC(".data.OO2"); 176 + 177 + SEC("?tc") 178 + __failure __msg("bpf_res_spin_unlock cannot be out of order") 179 + int res_spin_lock_ooo_unlock(struct __sk_buff *ctx) 180 + { 181 + if (bpf_res_spin_lock(&lock1)) 182 + return 0; 183 + if (bpf_res_spin_lock(&lock2)) { 184 + bpf_res_spin_unlock(&lock1); 185 + return 0; 186 + } 187 + bpf_res_spin_unlock(&lock1); 188 + bpf_res_spin_unlock(&lock2); 189 + return 0; 190 + } 191 + 192 + SEC("?tc") 193 + __failure __msg("off 1 doesn't point to 'struct bpf_res_spin_lock' that is at 0") 194 + int res_spin_lock_bad_off(struct __sk_buff *ctx) 195 + { 196 + struct arr_elem *elem; 197 + 198 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 199 + if (!elem) 200 + return 0; 201 + bpf_res_spin_lock((void *)&elem->lock + 1); 202 + return 0; 203 + } 204 + 205 + SEC("?tc") 206 + __failure __msg("R1 doesn't have constant offset. bpf_res_spin_lock has to be at the constant offset") 207 + int res_spin_lock_var_off(struct __sk_buff *ctx) 208 + { 209 + struct arr_elem *elem; 210 + u64 val = value; 211 + 212 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 213 + if (!elem) { 214 + // FIXME: Only inline assembly use in assert macro doesn't emit 215 + // BTF definition. 216 + bpf_throw(0); 217 + return 0; 218 + } 219 + bpf_assert_range(val, 0, 40); 220 + bpf_res_spin_lock((void *)&value + val); 221 + return 0; 222 + } 223 + 224 + SEC("?tc") 225 + __failure __msg("map 'res_spin.bss' has no valid bpf_res_spin_lock") 226 + int res_spin_lock_no_lock_map(struct __sk_buff *ctx) 227 + { 228 + bpf_res_spin_lock((void *)&value + 1); 229 + return 0; 230 + } 231 + 232 + SEC("?tc") 233 + __failure __msg("local 'kptr' has no valid bpf_res_spin_lock") 234 + int res_spin_lock_no_lock_kptr(struct __sk_buff *ctx) 235 + { 236 + struct { int i; } *p = bpf_obj_new(typeof(*p)); 237 + 238 + if (!p) 239 + return 0; 240 + bpf_res_spin_lock((void *)p); 241 + return 0; 242 + } 243 + 244 + char _license[] SEC("license") = "GPL";