Merge tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf resilient spinlock support from Alexei Starovoitov:
"This patch set introduces Resilient Queued Spin Lock (or rqspinlock
with res_spin_lock() and res_spin_unlock() APIs).

This is a qspinlock variant which recovers the kernel from a stalled
state when the lock acquisition path cannot make forward progress.
This can occur when a lock acquisition attempt enters a deadlock
situation (e.g. AA, or ABBA), or more generally, when the owner of the
lock (which we’re trying to acquire) isn’t making forward progress.
Deadlock detection is the main mechanism used to provide instant
recovery, with the timeout mechanism acting as a final line of
defense. Detection is triggered immediately when beginning the waiting
loop of a lock slow path.

Additionally, BPF programs attached to different parts of the kernel
can introduce new control flow into the kernel, which increases the
likelihood of deadlocks in code not written to handle reentrancy.
There have been multiple syzbot reports surfacing deadlocks in
internal kernel code due to the diverse ways in which BPF programs can
be attached to different parts of the kernel. By switching the BPF
subsystem’s lock usage to rqspinlock, all of these issues are
mitigated at runtime.

This spin lock implementation allows BPF maps to become safer and
lets us remove mechanisms that have fallen short of assuring safety
when nesting programs in arbitrary ways in the same context or across
different contexts.

We run benchmarks that stress locking scalability and compare the
results against the baseline (qspinlock). For the rqspinlock case, we
replace the kernel's default qspinlock with it, so that all spin
locks in the kernel use the rqspinlock slow path. As such, benchmarks
that stress kernel spin locks end up exercising rqspinlock.

More details in the cover letter in commit 6ffb9017e932 ("Merge branch
'resilient-queued-spin-lock'")"
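
For readers unfamiliar with the new API, the kernel-side entry points added
by this series (see the include/asm-generic/rqspinlock.h hunk below) mirror
the raw spinlock interface but return an error instead of spinning
indefinitely. The following is a minimal usage sketch, not code from the
series itself; the structure and function names are hypothetical, while the
lock macros and error codes come from the header shown further down:

    #include <asm/rqspinlock.h>

    /* Hypothetical structure protected by an rqspinlock. */
    struct my_bucket {
            rqspinlock_t lock;
            u64 count;
    };

    static void my_bucket_init(struct my_bucket *b)
    {
            raw_res_spin_lock_init(&b->lock);
            b->count = 0;
    }

    static int my_bucket_inc(struct my_bucket *b)
    {
            unsigned long flags;
            int ret;

            /* Returns 0, -EDEADLK (AA/ABBA deadlock) or -ETIMEDOUT. */
            ret = raw_res_spin_lock_irqsave(&b->lock, flags);
            if (ret)
                    return ret;
            b->count++;
            raw_res_spin_unlock_irqrestore(&b->lock, flags);
            return 0;
    }

The converted map code in hashtab.c and lpm_trie.c below follows the same
pattern, propagating the error to the caller rather than blocking.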

* tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (24 commits)
selftests/bpf: Add tests for rqspinlock
bpf: Maintain FIFO property for rqspinlock unlock
bpf: Implement verifier support for rqspinlock
bpf: Introduce rqspinlock kfuncs
bpf: Convert lpm_trie.c to rqspinlock
bpf: Convert percpu_freelist.c to rqspinlock
bpf: Convert hashtab.c to rqspinlock
rqspinlock: Add locktorture support
rqspinlock: Add entry to Makefile, MAINTAINERS
rqspinlock: Add macros for rqspinlock usage
rqspinlock: Add basic support for CONFIG_PARAVIRT
rqspinlock: Add a test-and-set fallback
rqspinlock: Add deadlock detection and recovery
rqspinlock: Protect waiters in trylock fallback from stalls
rqspinlock: Protect waiters in queue from stalls
rqspinlock: Protect pending bit owners from stalls
rqspinlock: Hardcode cond_acquire loops for arm64
rqspinlock: Add support for timeouts
rqspinlock: Drop PV and virtualization support
rqspinlock: Add rqspinlock.h header
...

+2315 -420
MAINTAINERS  +2
··· 4361 F: kernel/bpf/ 4362 F: kernel/trace/bpf_trace.c 4363 F: lib/buildid.c 4364 F: lib/test_bpf.c 4365 F: net/bpf/ 4366 F: net/core/filter.c
··· 4361 F: kernel/bpf/ 4362 F: kernel/trace/bpf_trace.c 4363 F: lib/buildid.c 4364 + F: arch/*/include/asm/rqspinlock.h 4365 + F: include/asm-generic/rqspinlock.h 4366 F: lib/test_bpf.c 4367 F: net/bpf/ 4368 F: net/core/filter.c
arch/arm64/include/asm/rqspinlock.h  +93
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_RQSPINLOCK_H 3 + #define _ASM_RQSPINLOCK_H 4 + 5 + #include <asm/barrier.h> 6 + 7 + /* 8 + * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom 9 + * version based on [0]. In rqspinlock code, our conditional expression involves 10 + * checking the value _and_ additionally a timeout. However, on arm64, the 11 + * WFE-based implementation may never spin again if no stores occur to the 12 + * locked byte in the lock word. As such, we may be stuck forever if 13 + * event-stream based unblocking is not available on the platform for WFE spin 14 + * loops (arch_timer_evtstrm_available). 15 + * 16 + * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this 17 + * copy-paste. 18 + * 19 + * While we rely on the implementation to amortize the cost of sampling 20 + * cond_expr for us, it will not happen when event stream support is 21 + * unavailable, time_expr check is amortized. This is not the common case, and 22 + * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns 23 + * comparison, hence just let it be. In case of event-stream, the loop is woken 24 + * up at microsecond granularity. 25 + * 26 + * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com 27 + */ 28 + 29 + #ifndef smp_cond_load_acquire_timewait 30 + 31 + #define smp_cond_time_check_count 200 32 + 33 + #define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns, \ 34 + time_limit_ns) ({ \ 35 + typeof(ptr) __PTR = (ptr); \ 36 + __unqual_scalar_typeof(*ptr) VAL; \ 37 + unsigned int __count = 0; \ 38 + for (;;) { \ 39 + VAL = READ_ONCE(*__PTR); \ 40 + if (cond_expr) \ 41 + break; \ 42 + cpu_relax(); \ 43 + if (__count++ < smp_cond_time_check_count) \ 44 + continue; \ 45 + if ((time_expr_ns) >= (time_limit_ns)) \ 46 + break; \ 47 + __count = 0; \ 48 + } \ 49 + (typeof(*ptr))VAL; \ 50 + }) 51 + 52 + #define __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 53 + time_expr_ns, time_limit_ns) \ 54 + ({ \ 55 + typeof(ptr) __PTR = (ptr); \ 56 + __unqual_scalar_typeof(*ptr) VAL; \ 57 + for (;;) { \ 58 + VAL = smp_load_acquire(__PTR); \ 59 + if (cond_expr) \ 60 + break; \ 61 + __cmpwait_relaxed(__PTR, VAL); \ 62 + if ((time_expr_ns) >= (time_limit_ns)) \ 63 + break; \ 64 + } \ 65 + (typeof(*ptr))VAL; \ 66 + }) 67 + 68 + #define smp_cond_load_acquire_timewait(ptr, cond_expr, \ 69 + time_expr_ns, time_limit_ns) \ 70 + ({ \ 71 + __unqual_scalar_typeof(*ptr) _val; \ 72 + int __wfe = arch_timer_evtstrm_available(); \ 73 + \ 74 + if (likely(__wfe)) { \ 75 + _val = __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 76 + time_expr_ns, \ 77 + time_limit_ns); \ 78 + } else { \ 79 + _val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr, \ 80 + time_expr_ns, \ 81 + time_limit_ns); \ 82 + smp_acquire__after_ctrl_dep(); \ 83 + } \ 84 + (typeof(*ptr))_val; \ 85 + }) 86 + 87 + #endif 88 + 89 + #define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1) 90 + 91 + #include <asm-generic/rqspinlock.h> 92 + 93 + #endif /* _ASM_RQSPINLOCK_H */
arch/x86/include/asm/rqspinlock.h  +33
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_X86_RQSPINLOCK_H 3 + #define _ASM_X86_RQSPINLOCK_H 4 + 5 + #include <asm/paravirt.h> 6 + 7 + #ifdef CONFIG_PARAVIRT 8 + DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); 9 + 10 + #define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled 11 + static __always_inline bool resilient_virt_spin_lock_enabled(void) 12 + { 13 + return static_branch_likely(&virt_spin_lock_key); 14 + } 15 + 16 + #ifdef CONFIG_QUEUED_SPINLOCKS 17 + typedef struct qspinlock rqspinlock_t; 18 + #else 19 + typedef struct rqspinlock rqspinlock_t; 20 + #endif 21 + extern int resilient_tas_spin_lock(rqspinlock_t *lock); 22 + 23 + #define resilient_virt_spin_lock resilient_virt_spin_lock 24 + static inline int resilient_virt_spin_lock(rqspinlock_t *lock) 25 + { 26 + return resilient_tas_spin_lock(lock); 27 + } 28 + 29 + #endif /* CONFIG_PARAVIRT */ 30 + 31 + #include <asm-generic/rqspinlock.h> 32 + 33 + #endif /* _ASM_X86_RQSPINLOCK_H */
include/asm-generic/Kbuild  +1
··· 45 mandatory-y += percpu.h 46 mandatory-y += pgalloc.h 47 mandatory-y += preempt.h 48 mandatory-y += runtime-const.h 49 mandatory-y += rwonce.h 50 mandatory-y += sections.h
··· 45 mandatory-y += percpu.h 46 mandatory-y += pgalloc.h 47 mandatory-y += preempt.h 48 + mandatory-y += rqspinlock.h 49 mandatory-y += runtime-const.h 50 mandatory-y += rwonce.h 51 mandatory-y += sections.h
include/asm-generic/mcs_spinlock.h  +6
··· 1 #ifndef __ASM_MCS_SPINLOCK_H 2 #define __ASM_MCS_SPINLOCK_H 3 4 /* 5 * Architectures can define their own: 6 *
··· 1 #ifndef __ASM_MCS_SPINLOCK_H 2 #define __ASM_MCS_SPINLOCK_H 3 4 + struct mcs_spinlock { 5 + struct mcs_spinlock *next; 6 + int locked; /* 1 if lock acquired */ 7 + int count; /* nesting count, see qspinlock.c */ 8 + }; 9 + 10 /* 11 * Architectures can define their own: 12 *
include/asm-generic/rqspinlock.h  +250
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Resilient Queued Spin Lock 4 + * 5 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 6 + * 7 + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> 8 + */ 9 + #ifndef __ASM_GENERIC_RQSPINLOCK_H 10 + #define __ASM_GENERIC_RQSPINLOCK_H 11 + 12 + #include <linux/types.h> 13 + #include <vdso/time64.h> 14 + #include <linux/percpu.h> 15 + #ifdef CONFIG_QUEUED_SPINLOCKS 16 + #include <asm/qspinlock.h> 17 + #endif 18 + 19 + struct rqspinlock { 20 + union { 21 + atomic_t val; 22 + u32 locked; 23 + }; 24 + }; 25 + 26 + /* Even though this is same as struct rqspinlock, we need to emit a distinct 27 + * type in BTF for BPF programs. 28 + */ 29 + struct bpf_res_spin_lock { 30 + u32 val; 31 + }; 32 + 33 + struct qspinlock; 34 + #ifdef CONFIG_QUEUED_SPINLOCKS 35 + typedef struct qspinlock rqspinlock_t; 36 + #else 37 + typedef struct rqspinlock rqspinlock_t; 38 + #endif 39 + 40 + extern int resilient_tas_spin_lock(rqspinlock_t *lock); 41 + #ifdef CONFIG_QUEUED_SPINLOCKS 42 + extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); 43 + #endif 44 + 45 + #ifndef resilient_virt_spin_lock_enabled 46 + static __always_inline bool resilient_virt_spin_lock_enabled(void) 47 + { 48 + return false; 49 + } 50 + #endif 51 + 52 + #ifndef resilient_virt_spin_lock 53 + static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) 54 + { 55 + return 0; 56 + } 57 + #endif 58 + 59 + /* 60 + * Default timeout for waiting loops is 0.25 seconds 61 + */ 62 + #define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) 63 + 64 + /* 65 + * Choose 31 as it makes rqspinlock_held cacheline-aligned. 66 + */ 67 + #define RES_NR_HELD 31 68 + 69 + struct rqspinlock_held { 70 + int cnt; 71 + void *locks[RES_NR_HELD]; 72 + }; 73 + 74 + DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); 75 + 76 + static __always_inline void grab_held_lock_entry(void *lock) 77 + { 78 + int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); 79 + 80 + if (unlikely(cnt > RES_NR_HELD)) { 81 + /* Still keep the inc so we decrement later. */ 82 + return; 83 + } 84 + 85 + /* 86 + * Implied compiler barrier in per-CPU operations; otherwise we can have 87 + * the compiler reorder inc with write to table, allowing interrupts to 88 + * overwrite and erase our write to the table (as on interrupt exit it 89 + * will be reset to NULL). 90 + * 91 + * It is fine for cnt inc to be reordered wrt remote readers though, 92 + * they won't observe our entry until the cnt update is visible, that's 93 + * all. 94 + */ 95 + this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); 96 + } 97 + 98 + /* 99 + * We simply don't support out-of-order unlocks, and keep the logic simple here. 100 + * The verifier prevents BPF programs from unlocking out-of-order, and the same 101 + * holds for in-kernel users. 102 + * 103 + * It is possible to run into misdetection scenarios of AA deadlocks on the same 104 + * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries 105 + * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct 106 + * logic to preserve right entries in the table would be to walk the array of 107 + * held locks and swap and clear out-of-order entries, but that's too 108 + * complicated and we don't have a compelling use case for out of order unlocking. 
109 + */ 110 + static __always_inline void release_held_lock_entry(void) 111 + { 112 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 113 + 114 + if (unlikely(rqh->cnt > RES_NR_HELD)) 115 + goto dec; 116 + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); 117 + dec: 118 + /* 119 + * Reordering of clearing above with inc and its write in 120 + * grab_held_lock_entry that came before us (in same acquisition 121 + * attempt) is ok, we either see a valid entry or NULL when it's 122 + * visible. 123 + * 124 + * But this helper is invoked when we unwind upon failing to acquire the 125 + * lock. Unlike the unlock path which constitutes a release store after 126 + * we clear the entry, we need to emit a write barrier here. Otherwise, 127 + * we may have a situation as follows: 128 + * 129 + * <error> for lock B 130 + * release_held_lock_entry 131 + * 132 + * try_cmpxchg_acquire for lock A 133 + * grab_held_lock_entry 134 + * 135 + * Lack of any ordering means reordering may occur such that dec, inc 136 + * are done before entry is overwritten. This permits a remote lock 137 + * holder of lock B (which this CPU failed to acquire) to now observe it 138 + * as being attempted on this CPU, and may lead to misdetection (if this 139 + * CPU holds a lock it is attempting to acquire, leading to false ABBA 140 + * diagnosis). 141 + * 142 + * In case of unlock, we will always do a release on the lock word after 143 + * releasing the entry, ensuring that other CPUs cannot hold the lock 144 + * (and make conclusions about deadlocks) until the entry has been 145 + * cleared on the local CPU, preventing any anomalies. Reordering is 146 + * still possible there, but a remote CPU cannot observe a lock in our 147 + * table which it is already holding, since visibility entails our 148 + * release store for the said lock has not retired. 149 + * 150 + * In theory we don't have a problem if the dec and WRITE_ONCE above get 151 + * reordered with each other, we either notice an empty NULL entry on 152 + * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which 153 + * cannot be observed (if dec precedes WRITE_ONCE). 154 + * 155 + * Emit the write barrier _before_ the dec, this permits dec-inc 156 + * reordering but that is harmless as we'd have new entry set to NULL 157 + * already, i.e. they cannot precede the NULL store above. 158 + */ 159 + smp_wmb(); 160 + this_cpu_dec(rqspinlock_held_locks.cnt); 161 + } 162 + 163 + #ifdef CONFIG_QUEUED_SPINLOCKS 164 + 165 + /** 166 + * res_spin_lock - acquire a queued spinlock 167 + * @lock: Pointer to queued spinlock structure 168 + * 169 + * Return: 170 + * * 0 - Lock was acquired successfully. 171 + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. 172 + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 
173 + */ 174 + static __always_inline int res_spin_lock(rqspinlock_t *lock) 175 + { 176 + int val = 0; 177 + 178 + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { 179 + grab_held_lock_entry(lock); 180 + return 0; 181 + } 182 + return resilient_queued_spin_lock_slowpath(lock, val); 183 + } 184 + 185 + #else 186 + 187 + #define res_spin_lock(lock) resilient_tas_spin_lock(lock) 188 + 189 + #endif /* CONFIG_QUEUED_SPINLOCKS */ 190 + 191 + static __always_inline void res_spin_unlock(rqspinlock_t *lock) 192 + { 193 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 194 + 195 + if (unlikely(rqh->cnt > RES_NR_HELD)) 196 + goto unlock; 197 + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); 198 + unlock: 199 + /* 200 + * Release barrier, ensures correct ordering. See release_held_lock_entry 201 + * for details. Perform release store instead of queued_spin_unlock, 202 + * since we use this function for test-and-set fallback as well. When we 203 + * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. 204 + * 205 + * Like release_held_lock_entry, we can do the release before the dec. 206 + * We simply care about not seeing the 'lock' in our table from a remote 207 + * CPU once the lock has been released, which doesn't rely on the dec. 208 + * 209 + * Unlike smp_wmb(), release is not a two way fence, hence it is 210 + * possible for a inc to move up and reorder with our clearing of the 211 + * entry. This isn't a problem however, as for a misdiagnosis of ABBA, 212 + * the remote CPU needs to hold this lock, which won't be released until 213 + * the store below is done, which would ensure the entry is overwritten 214 + * to NULL, etc. 215 + */ 216 + smp_store_release(&lock->locked, 0); 217 + this_cpu_dec(rqspinlock_held_locks.cnt); 218 + } 219 + 220 + #ifdef CONFIG_QUEUED_SPINLOCKS 221 + #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) 222 + #else 223 + #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) 224 + #endif 225 + 226 + #define raw_res_spin_lock(lock) \ 227 + ({ \ 228 + int __ret; \ 229 + preempt_disable(); \ 230 + __ret = res_spin_lock(lock); \ 231 + if (__ret) \ 232 + preempt_enable(); \ 233 + __ret; \ 234 + }) 235 + 236 + #define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) 237 + 238 + #define raw_res_spin_lock_irqsave(lock, flags) \ 239 + ({ \ 240 + int __ret; \ 241 + local_irq_save(flags); \ 242 + __ret = raw_res_spin_lock(lock); \ 243 + if (__ret) \ 244 + local_irq_restore(flags); \ 245 + __ret; \ 246 + }) 247 + 248 + #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) 249 + 250 + #endif /* __ASM_GENERIC_RQSPINLOCK_H */
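
One point worth calling out from the header above: the per-CPU
rqspinlock_held table is maintained in LIFO order and, as the comment in it
notes, out-of-order unlocks are not supported (the verifier enforces this
for BPF programs, and in-kernel users must follow the same rule). A short
sketch of the expected nesting, with hypothetical locks lock_a and lock_b
(initialised elsewhere with raw_res_spin_lock_init()):

    #include <asm/rqspinlock.h>

    /* Hypothetical locks; in real code these live inside the data
     * structures they protect.
     */
    static rqspinlock_t lock_a, lock_b;

    static int update_both(void)
    {
            int ret;

            ret = raw_res_spin_lock(&lock_a);
            if (ret)
                    return ret;
            ret = raw_res_spin_lock(&lock_b);
            if (ret) {
                    raw_res_spin_unlock(&lock_a);
                    return ret;
            }
            /* ... critical section using both locks ... */
            raw_res_spin_unlock(&lock_b);   /* release B first ... */
            raw_res_spin_unlock(&lock_a);   /* ... then A (LIFO order) */
            return 0;
    }

A lock A, lock B, unlock A, unlock B sequence would leave a stale entry on
top of the held-locks table and can lead to the misdetection scenarios
described in the header comment.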
include/linux/bpf.h  +10
··· 30 #include <linux/static_call.h> 31 #include <linux/memcontrol.h> 32 #include <linux/cfi.h> 33 34 struct bpf_verifier_env; 35 struct bpf_verifier_log; ··· 205 BPF_REFCOUNT = (1 << 9), 206 BPF_WORKQUEUE = (1 << 10), 207 BPF_UPTR = (1 << 11), 208 }; 209 210 typedef void (*btf_dtor_kfunc_t)(void *); ··· 241 u32 cnt; 242 u32 field_mask; 243 int spin_lock_off; 244 int timer_off; 245 int wq_off; 246 int refcount_off; ··· 317 switch (type) { 318 case BPF_SPIN_LOCK: 319 return "bpf_spin_lock"; 320 case BPF_TIMER: 321 return "bpf_timer"; 322 case BPF_WORKQUEUE: ··· 351 switch (type) { 352 case BPF_SPIN_LOCK: 353 return sizeof(struct bpf_spin_lock); 354 case BPF_TIMER: 355 return sizeof(struct bpf_timer); 356 case BPF_WORKQUEUE: ··· 383 switch (type) { 384 case BPF_SPIN_LOCK: 385 return __alignof__(struct bpf_spin_lock); 386 case BPF_TIMER: 387 return __alignof__(struct bpf_timer); 388 case BPF_WORKQUEUE: ··· 428 case BPF_RB_ROOT: 429 /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ 430 case BPF_SPIN_LOCK: 431 case BPF_TIMER: 432 case BPF_WORKQUEUE: 433 case BPF_KPTR_UNREF:
··· 30 #include <linux/static_call.h> 31 #include <linux/memcontrol.h> 32 #include <linux/cfi.h> 33 + #include <asm/rqspinlock.h> 34 35 struct bpf_verifier_env; 36 struct bpf_verifier_log; ··· 204 BPF_REFCOUNT = (1 << 9), 205 BPF_WORKQUEUE = (1 << 10), 206 BPF_UPTR = (1 << 11), 207 + BPF_RES_SPIN_LOCK = (1 << 12), 208 }; 209 210 typedef void (*btf_dtor_kfunc_t)(void *); ··· 239 u32 cnt; 240 u32 field_mask; 241 int spin_lock_off; 242 + int res_spin_lock_off; 243 int timer_off; 244 int wq_off; 245 int refcount_off; ··· 314 switch (type) { 315 case BPF_SPIN_LOCK: 316 return "bpf_spin_lock"; 317 + case BPF_RES_SPIN_LOCK: 318 + return "bpf_res_spin_lock"; 319 case BPF_TIMER: 320 return "bpf_timer"; 321 case BPF_WORKQUEUE: ··· 346 switch (type) { 347 case BPF_SPIN_LOCK: 348 return sizeof(struct bpf_spin_lock); 349 + case BPF_RES_SPIN_LOCK: 350 + return sizeof(struct bpf_res_spin_lock); 351 case BPF_TIMER: 352 return sizeof(struct bpf_timer); 353 case BPF_WORKQUEUE: ··· 376 switch (type) { 377 case BPF_SPIN_LOCK: 378 return __alignof__(struct bpf_spin_lock); 379 + case BPF_RES_SPIN_LOCK: 380 + return __alignof__(struct bpf_res_spin_lock); 381 case BPF_TIMER: 382 return __alignof__(struct bpf_timer); 383 case BPF_WORKQUEUE: ··· 419 case BPF_RB_ROOT: 420 /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ 421 case BPF_SPIN_LOCK: 422 + case BPF_RES_SPIN_LOCK: 423 case BPF_TIMER: 424 case BPF_WORKQUEUE: 425 case BPF_KPTR_UNREF:
include/linux/bpf_verifier.h  +16 -3
··· 115 int depth:30; 116 } iter; 117 118 /* Max size from any of the above. */ 119 struct { 120 unsigned long raw1; ··· 263 * default to pointer reference on zero initialization of a state. 264 */ 265 enum ref_state_type { 266 - REF_TYPE_PTR = 1, 267 - REF_TYPE_IRQ = 2, 268 - REF_TYPE_LOCK = 3, 269 } type; 270 /* Track each reference created with a unique id, even if the same 271 * instruction creates the reference multiple times (eg, via CALL). ··· 435 u32 active_locks; 436 u32 active_preempt_locks; 437 u32 active_irq_id; 438 bool active_rcu_lock; 439 440 bool speculative;
··· 115 int depth:30; 116 } iter; 117 118 + /* For irq stack slots */ 119 + struct { 120 + enum { 121 + IRQ_NATIVE_KFUNC, 122 + IRQ_LOCK_KFUNC, 123 + } kfunc_class; 124 + } irq; 125 + 126 /* Max size from any of the above. */ 127 struct { 128 unsigned long raw1; ··· 255 * default to pointer reference on zero initialization of a state. 256 */ 257 enum ref_state_type { 258 + REF_TYPE_PTR = (1 << 1), 259 + REF_TYPE_IRQ = (1 << 2), 260 + REF_TYPE_LOCK = (1 << 3), 261 + REF_TYPE_RES_LOCK = (1 << 4), 262 + REF_TYPE_RES_LOCK_IRQ = (1 << 5), 263 + REF_TYPE_LOCK_MASK = REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, 264 } type; 265 /* Track each reference created with a unique id, even if the same 266 * instruction creates the reference multiple times (eg, via CALL). ··· 424 u32 active_locks; 425 u32 active_preempt_locks; 426 u32 active_irq_id; 427 + u32 active_lock_id; 428 + void *active_lock_ptr; 429 bool active_rcu_lock; 430 431 bool speculative;
kernel/bpf/Makefile  +1 -1
··· 14 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o 15 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o 16 obj-$(CONFIG_BPF_JIT) += trampoline.o 17 - obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o 18 ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) 19 obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o 20 endif
··· 14 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o 15 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o 16 obj-$(CONFIG_BPF_JIT) += trampoline.o 17 + obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o 18 ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) 19 obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o 20 endif
kernel/bpf/btf.c  +24 -2
··· 3481 goto end; 3482 } 3483 } 3484 if (field_mask & BPF_TIMER) { 3485 if (!strcmp(name, "bpf_timer")) { 3486 if (*seen_mask & BPF_TIMER) ··· 3668 3669 switch (field_type) { 3670 case BPF_SPIN_LOCK: 3671 case BPF_TIMER: 3672 case BPF_WORKQUEUE: 3673 case BPF_LIST_NODE: ··· 3962 return ERR_PTR(-ENOMEM); 3963 3964 rec->spin_lock_off = -EINVAL; 3965 rec->timer_off = -EINVAL; 3966 rec->wq_off = -EINVAL; 3967 rec->refcount_off = -EINVAL; ··· 3989 WARN_ON_ONCE(rec->spin_lock_off >= 0); 3990 /* Cache offset for faster lookup at runtime */ 3991 rec->spin_lock_off = rec->fields[i].offset; 3992 break; 3993 case BPF_TIMER: 3994 WARN_ON_ONCE(rec->timer_off >= 0); ··· 4038 rec->cnt++; 4039 } 4040 4041 /* bpf_{list_head, rb_node} require bpf_spin_lock */ 4042 if ((btf_record_has_field(rec, BPF_LIST_HEAD) || 4043 - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { 4044 ret = -EINVAL; 4045 goto end; 4046 } ··· 5659 5660 type = &tab->types[tab->cnt]; 5661 type->btf_id = i; 5662 - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | 5663 BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | 5664 BPF_KPTR, t->size); 5665 /* The record cannot be unset, treat it as an error if so */
··· 3481 goto end; 3482 } 3483 } 3484 + if (field_mask & BPF_RES_SPIN_LOCK) { 3485 + if (!strcmp(name, "bpf_res_spin_lock")) { 3486 + if (*seen_mask & BPF_RES_SPIN_LOCK) 3487 + return -E2BIG; 3488 + *seen_mask |= BPF_RES_SPIN_LOCK; 3489 + type = BPF_RES_SPIN_LOCK; 3490 + goto end; 3491 + } 3492 + } 3493 if (field_mask & BPF_TIMER) { 3494 if (!strcmp(name, "bpf_timer")) { 3495 if (*seen_mask & BPF_TIMER) ··· 3659 3660 switch (field_type) { 3661 case BPF_SPIN_LOCK: 3662 + case BPF_RES_SPIN_LOCK: 3663 case BPF_TIMER: 3664 case BPF_WORKQUEUE: 3665 case BPF_LIST_NODE: ··· 3952 return ERR_PTR(-ENOMEM); 3953 3954 rec->spin_lock_off = -EINVAL; 3955 + rec->res_spin_lock_off = -EINVAL; 3956 rec->timer_off = -EINVAL; 3957 rec->wq_off = -EINVAL; 3958 rec->refcount_off = -EINVAL; ··· 3978 WARN_ON_ONCE(rec->spin_lock_off >= 0); 3979 /* Cache offset for faster lookup at runtime */ 3980 rec->spin_lock_off = rec->fields[i].offset; 3981 + break; 3982 + case BPF_RES_SPIN_LOCK: 3983 + WARN_ON_ONCE(rec->spin_lock_off >= 0); 3984 + /* Cache offset for faster lookup at runtime */ 3985 + rec->res_spin_lock_off = rec->fields[i].offset; 3986 break; 3987 case BPF_TIMER: 3988 WARN_ON_ONCE(rec->timer_off >= 0); ··· 4022 rec->cnt++; 4023 } 4024 4025 + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { 4026 + ret = -EINVAL; 4027 + goto end; 4028 + } 4029 + 4030 /* bpf_{list_head, rb_node} require bpf_spin_lock */ 4031 if ((btf_record_has_field(rec, BPF_LIST_HEAD) || 4032 + btf_record_has_field(rec, BPF_RB_ROOT)) && 4033 + (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { 4034 ret = -EINVAL; 4035 goto end; 4036 } ··· 5637 5638 type = &tab->types[tab->cnt]; 5639 type->btf_id = i; 5640 + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | 5641 BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | 5642 BPF_KPTR, t->size); 5643 /* The record cannot be unset, treat it as an error if so */
kernel/bpf/hashtab.c  +32 -70
··· 16 #include "bpf_lru_list.h" 17 #include "map_in_map.h" 18 #include <linux/bpf_mem_alloc.h> 19 20 #define HTAB_CREATE_FLAG_MASK \ 21 (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ ··· 79 */ 80 struct bucket { 81 struct hlist_nulls_head head; 82 - raw_spinlock_t raw_lock; 83 }; 84 85 #define HASHTAB_MAP_LOCK_COUNT 8 ··· 105 u32 n_buckets; /* number of hash buckets */ 106 u32 elem_size; /* size of each element in bytes */ 107 u32 hashrnd; 108 - struct lock_class_key lockdep_key; 109 - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; 110 }; 111 112 /* each htab element is struct htab_elem + key + value */ ··· 139 140 for (i = 0; i < htab->n_buckets; i++) { 141 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); 142 - raw_spin_lock_init(&htab->buckets[i].raw_lock); 143 - lockdep_set_class(&htab->buckets[i].raw_lock, 144 - &htab->lockdep_key); 145 cond_resched(); 146 } 147 } 148 149 - static inline int htab_lock_bucket(const struct bpf_htab *htab, 150 - struct bucket *b, u32 hash, 151 - unsigned long *pflags) 152 { 153 unsigned long flags; 154 155 - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); 156 - 157 - preempt_disable(); 158 - local_irq_save(flags); 159 - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { 160 - __this_cpu_dec(*(htab->map_locked[hash])); 161 - local_irq_restore(flags); 162 - preempt_enable(); 163 - return -EBUSY; 164 - } 165 - 166 - raw_spin_lock(&b->raw_lock); 167 *pflags = flags; 168 - 169 return 0; 170 } 171 172 - static inline void htab_unlock_bucket(const struct bpf_htab *htab, 173 - struct bucket *b, u32 hash, 174 - unsigned long flags) 175 { 176 - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); 177 - raw_spin_unlock(&b->raw_lock); 178 - __this_cpu_dec(*(htab->map_locked[hash])); 179 - local_irq_restore(flags); 180 - preempt_enable(); 181 } 182 183 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); ··· 463 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); 464 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); 465 struct bpf_htab *htab; 466 - int err, i; 467 468 htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); 469 if (!htab) 470 return ERR_PTR(-ENOMEM); 471 - 472 - lockdep_register_key(&htab->lockdep_key); 473 474 bpf_map_init_from_attr(&htab->map, attr); 475 ··· 513 htab->map.numa_node); 514 if (!htab->buckets) 515 goto free_elem_count; 516 - 517 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { 518 - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, 519 - sizeof(int), 520 - sizeof(int), 521 - GFP_USER); 522 - if (!htab->map_locked[i]) 523 - goto free_map_locked; 524 - } 525 526 if (htab->map.map_flags & BPF_F_ZERO_SEED) 527 htab->hashrnd = 0; ··· 576 free_map_locked: 577 if (htab->use_percpu_counter) 578 percpu_counter_destroy(&htab->pcount); 579 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 580 - free_percpu(htab->map_locked[i]); 581 bpf_map_area_free(htab->buckets); 582 bpf_mem_alloc_destroy(&htab->pcpu_ma); 583 bpf_mem_alloc_destroy(&htab->ma); 584 free_elem_count: 585 bpf_map_free_elem_count(&htab->map); 586 free_htab: 587 - lockdep_unregister_key(&htab->lockdep_key); 588 bpf_map_area_free(htab); 589 return ERR_PTR(err); 590 } ··· 786 b = __select_bucket(htab, tgt_l->hash); 787 head = &b->head; 788 789 - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); 790 if (ret) 791 return false; 792 ··· 797 break; 798 } 799 800 - htab_unlock_bucket(htab, b, tgt_l->hash, flags); 801 802 if (l == tgt_l) 803 check_and_free_fields(htab, l); 
··· 1116 */ 1117 } 1118 1119 - ret = htab_lock_bucket(htab, b, hash, &flags); 1120 if (ret) 1121 return ret; 1122 ··· 1167 check_and_free_fields(htab, l_old); 1168 } 1169 } 1170 - htab_unlock_bucket(htab, b, hash, flags); 1171 if (l_old) { 1172 if (old_map_ptr) 1173 map->ops->map_fd_put_ptr(map, old_map_ptr, true); ··· 1176 } 1177 return 0; 1178 err: 1179 - htab_unlock_bucket(htab, b, hash, flags); 1180 return ret; 1181 } 1182 ··· 1223 copy_map_value(&htab->map, 1224 l_new->key + round_up(map->key_size, 8), value); 1225 1226 - ret = htab_lock_bucket(htab, b, hash, &flags); 1227 if (ret) 1228 goto err_lock_bucket; 1229 ··· 1244 ret = 0; 1245 1246 err: 1247 - htab_unlock_bucket(htab, b, hash, flags); 1248 1249 err_lock_bucket: 1250 if (ret) ··· 1281 b = __select_bucket(htab, hash); 1282 head = &b->head; 1283 1284 - ret = htab_lock_bucket(htab, b, hash, &flags); 1285 if (ret) 1286 return ret; 1287 ··· 1306 } 1307 ret = 0; 1308 err: 1309 - htab_unlock_bucket(htab, b, hash, flags); 1310 return ret; 1311 } 1312 ··· 1347 return -ENOMEM; 1348 } 1349 1350 - ret = htab_lock_bucket(htab, b, hash, &flags); 1351 if (ret) 1352 goto err_lock_bucket; 1353 ··· 1371 } 1372 ret = 0; 1373 err: 1374 - htab_unlock_bucket(htab, b, hash, flags); 1375 err_lock_bucket: 1376 if (l_new) { 1377 bpf_map_dec_elem_count(&htab->map); ··· 1413 b = __select_bucket(htab, hash); 1414 head = &b->head; 1415 1416 - ret = htab_lock_bucket(htab, b, hash, &flags); 1417 if (ret) 1418 return ret; 1419 ··· 1423 else 1424 ret = -ENOENT; 1425 1426 - htab_unlock_bucket(htab, b, hash, flags); 1427 1428 if (l) 1429 free_htab_elem(htab, l); ··· 1449 b = __select_bucket(htab, hash); 1450 head = &b->head; 1451 1452 - ret = htab_lock_bucket(htab, b, hash, &flags); 1453 if (ret) 1454 return ret; 1455 ··· 1460 else 1461 ret = -ENOENT; 1462 1463 - htab_unlock_bucket(htab, b, hash, flags); 1464 if (l) 1465 htab_lru_push_free(htab, l); 1466 return ret; ··· 1527 static void htab_map_free(struct bpf_map *map) 1528 { 1529 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1530 - int i; 1531 1532 /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. 1533 * bpf_free_used_maps() is called after bpf prog is no longer executing. ··· 1551 bpf_mem_alloc_destroy(&htab->ma); 1552 if (htab->use_percpu_counter) 1553 percpu_counter_destroy(&htab->pcount); 1554 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 1555 - free_percpu(htab->map_locked[i]); 1556 - lockdep_unregister_key(&htab->lockdep_key); 1557 bpf_map_area_free(htab); 1558 } 1559 ··· 1593 b = __select_bucket(htab, hash); 1594 head = &b->head; 1595 1596 - ret = htab_lock_bucket(htab, b, hash, &bflags); 1597 if (ret) 1598 return ret; 1599 ··· 1630 hlist_nulls_del_rcu(&l->hash_node); 1631 1632 out_unlock: 1633 - htab_unlock_bucket(htab, b, hash, bflags); 1634 1635 if (l) { 1636 if (is_lru_map) ··· 1752 head = &b->head; 1753 /* do not grab the lock unless need it (bucket_cnt > 0). */ 1754 if (locked) { 1755 - ret = htab_lock_bucket(htab, b, batch, &flags); 1756 if (ret) { 1757 rcu_read_unlock(); 1758 bpf_enable_instrumentation(); ··· 1775 /* Note that since bucket_cnt > 0 here, it is implicit 1776 * that the locked was grabbed, so release it. 1777 */ 1778 - htab_unlock_bucket(htab, b, batch, flags); 1779 rcu_read_unlock(); 1780 bpf_enable_instrumentation(); 1781 goto after_loop; ··· 1786 /* Note that since bucket_cnt > 0 here, it is implicit 1787 * that the locked was grabbed, so release it. 
1788 */ 1789 - htab_unlock_bucket(htab, b, batch, flags); 1790 rcu_read_unlock(); 1791 bpf_enable_instrumentation(); 1792 kvfree(keys); ··· 1849 dst_val += value_size; 1850 } 1851 1852 - htab_unlock_bucket(htab, b, batch, flags); 1853 locked = false; 1854 1855 while (node_to_free) {
··· 16 #include "bpf_lru_list.h" 17 #include "map_in_map.h" 18 #include <linux/bpf_mem_alloc.h> 19 + #include <asm/rqspinlock.h> 20 21 #define HTAB_CREATE_FLAG_MASK \ 22 (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ ··· 78 */ 79 struct bucket { 80 struct hlist_nulls_head head; 81 + rqspinlock_t raw_lock; 82 }; 83 84 #define HASHTAB_MAP_LOCK_COUNT 8 ··· 104 u32 n_buckets; /* number of hash buckets */ 105 u32 elem_size; /* size of each element in bytes */ 106 u32 hashrnd; 107 }; 108 109 /* each htab element is struct htab_elem + key + value */ ··· 140 141 for (i = 0; i < htab->n_buckets; i++) { 142 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); 143 + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); 144 cond_resched(); 145 } 146 } 147 148 + static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) 149 { 150 unsigned long flags; 151 + int ret; 152 153 + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); 154 + if (ret) 155 + return ret; 156 *pflags = flags; 157 return 0; 158 } 159 160 + static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) 161 { 162 + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); 163 } 164 165 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); ··· 483 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); 484 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); 485 struct bpf_htab *htab; 486 + int err; 487 488 htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); 489 if (!htab) 490 return ERR_PTR(-ENOMEM); 491 492 bpf_map_init_from_attr(&htab->map, attr); 493 ··· 535 htab->map.numa_node); 536 if (!htab->buckets) 537 goto free_elem_count; 538 539 if (htab->map.map_flags & BPF_F_ZERO_SEED) 540 htab->hashrnd = 0; ··· 607 free_map_locked: 608 if (htab->use_percpu_counter) 609 percpu_counter_destroy(&htab->pcount); 610 bpf_map_area_free(htab->buckets); 611 bpf_mem_alloc_destroy(&htab->pcpu_ma); 612 bpf_mem_alloc_destroy(&htab->ma); 613 free_elem_count: 614 bpf_map_free_elem_count(&htab->map); 615 free_htab: 616 bpf_map_area_free(htab); 617 return ERR_PTR(err); 618 } ··· 820 b = __select_bucket(htab, tgt_l->hash); 821 head = &b->head; 822 823 + ret = htab_lock_bucket(b, &flags); 824 if (ret) 825 return false; 826 ··· 831 break; 832 } 833 834 + htab_unlock_bucket(b, flags); 835 836 if (l == tgt_l) 837 check_and_free_fields(htab, l); ··· 1150 */ 1151 } 1152 1153 + ret = htab_lock_bucket(b, &flags); 1154 if (ret) 1155 return ret; 1156 ··· 1201 check_and_free_fields(htab, l_old); 1202 } 1203 } 1204 + htab_unlock_bucket(b, flags); 1205 if (l_old) { 1206 if (old_map_ptr) 1207 map->ops->map_fd_put_ptr(map, old_map_ptr, true); ··· 1210 } 1211 return 0; 1212 err: 1213 + htab_unlock_bucket(b, flags); 1214 return ret; 1215 } 1216 ··· 1257 copy_map_value(&htab->map, 1258 l_new->key + round_up(map->key_size, 8), value); 1259 1260 + ret = htab_lock_bucket(b, &flags); 1261 if (ret) 1262 goto err_lock_bucket; 1263 ··· 1278 ret = 0; 1279 1280 err: 1281 + htab_unlock_bucket(b, flags); 1282 1283 err_lock_bucket: 1284 if (ret) ··· 1315 b = __select_bucket(htab, hash); 1316 head = &b->head; 1317 1318 + ret = htab_lock_bucket(b, &flags); 1319 if (ret) 1320 return ret; 1321 ··· 1340 } 1341 ret = 0; 1342 err: 1343 + htab_unlock_bucket(b, flags); 1344 return ret; 1345 } 1346 ··· 1381 return -ENOMEM; 1382 } 1383 1384 + ret = htab_lock_bucket(b, &flags); 1385 if (ret) 1386 goto err_lock_bucket; 1387 ··· 1405 } 1406 ret = 0; 1407 err: 1408 + htab_unlock_bucket(b, flags); 1409 err_lock_bucket: 1410 if 
(l_new) { 1411 bpf_map_dec_elem_count(&htab->map); ··· 1447 b = __select_bucket(htab, hash); 1448 head = &b->head; 1449 1450 + ret = htab_lock_bucket(b, &flags); 1451 if (ret) 1452 return ret; 1453 ··· 1457 else 1458 ret = -ENOENT; 1459 1460 + htab_unlock_bucket(b, flags); 1461 1462 if (l) 1463 free_htab_elem(htab, l); ··· 1483 b = __select_bucket(htab, hash); 1484 head = &b->head; 1485 1486 + ret = htab_lock_bucket(b, &flags); 1487 if (ret) 1488 return ret; 1489 ··· 1494 else 1495 ret = -ENOENT; 1496 1497 + htab_unlock_bucket(b, flags); 1498 if (l) 1499 htab_lru_push_free(htab, l); 1500 return ret; ··· 1561 static void htab_map_free(struct bpf_map *map) 1562 { 1563 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1564 1565 /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. 1566 * bpf_free_used_maps() is called after bpf prog is no longer executing. ··· 1586 bpf_mem_alloc_destroy(&htab->ma); 1587 if (htab->use_percpu_counter) 1588 percpu_counter_destroy(&htab->pcount); 1589 bpf_map_area_free(htab); 1590 } 1591 ··· 1631 b = __select_bucket(htab, hash); 1632 head = &b->head; 1633 1634 + ret = htab_lock_bucket(b, &bflags); 1635 if (ret) 1636 return ret; 1637 ··· 1668 hlist_nulls_del_rcu(&l->hash_node); 1669 1670 out_unlock: 1671 + htab_unlock_bucket(b, bflags); 1672 1673 if (l) { 1674 if (is_lru_map) ··· 1790 head = &b->head; 1791 /* do not grab the lock unless need it (bucket_cnt > 0). */ 1792 if (locked) { 1793 + ret = htab_lock_bucket(b, &flags); 1794 if (ret) { 1795 rcu_read_unlock(); 1796 bpf_enable_instrumentation(); ··· 1813 /* Note that since bucket_cnt > 0 here, it is implicit 1814 * that the locked was grabbed, so release it. 1815 */ 1816 + htab_unlock_bucket(b, flags); 1817 rcu_read_unlock(); 1818 bpf_enable_instrumentation(); 1819 goto after_loop; ··· 1824 /* Note that since bucket_cnt > 0 here, it is implicit 1825 * that the locked was grabbed, so release it. 1826 */ 1827 + htab_unlock_bucket(b, flags); 1828 rcu_read_unlock(); 1829 bpf_enable_instrumentation(); 1830 kvfree(keys); ··· 1887 dst_val += value_size; 1888 } 1889 1890 + htab_unlock_bucket(b, flags); 1891 locked = false; 1892 1893 while (node_to_free) {
kernel/bpf/lpm_trie.c  +14 -11
··· 15 #include <net/ipv6.h> 16 #include <uapi/linux/btf.h> 17 #include <linux/btf_ids.h> 18 #include <linux/bpf_mem_alloc.h> 19 20 /* Intermediate node */ ··· 37 size_t n_entries; 38 size_t max_prefixlen; 39 size_t data_size; 40 - raw_spinlock_t lock; 41 }; 42 43 /* This trie implements a longest prefix match algorithm that can be used to ··· 343 if (!new_node) 344 return -ENOMEM; 345 346 - raw_spin_lock_irqsave(&trie->lock, irq_flags); 347 348 new_node->prefixlen = key->prefixlen; 349 RCU_INIT_POINTER(new_node->child[0], NULL); ··· 359 */ 360 slot = &trie->root; 361 362 - while ((node = rcu_dereference_protected(*slot, 363 - lockdep_is_held(&trie->lock)))) { 364 matchlen = longest_prefix_match(trie, node, key); 365 366 if (node->prefixlen != matchlen || ··· 444 rcu_assign_pointer(*slot, im_node); 445 446 out: 447 - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 448 - 449 if (ret) 450 bpf_mem_cache_free(&trie->ma, new_node); 451 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 469 if (key->prefixlen > trie->max_prefixlen) 470 return -EINVAL; 471 472 - raw_spin_lock_irqsave(&trie->lock, irq_flags); 473 474 /* Walk the tree looking for an exact key/length match and keeping 475 * track of the path we traverse. We will need to know the node ··· 482 trim = &trie->root; 483 trim2 = trim; 484 parent = NULL; 485 - while ((node = rcu_dereference_protected( 486 - *trim, lockdep_is_held(&trie->lock)))) { 487 matchlen = longest_prefix_match(trie, node, key); 488 489 if (node->prefixlen != matchlen || ··· 546 free_node = node; 547 548 out: 549 - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 550 551 bpf_mem_cache_free_rcu(&trie->ma, free_parent); 552 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 595 offsetof(struct bpf_lpm_trie_key_u8, data); 596 trie->max_prefixlen = trie->data_size * 8; 597 598 - raw_spin_lock_init(&trie->lock); 599 600 /* Allocate intermediate and leaf nodes from the same allocator */ 601 leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
··· 15 #include <net/ipv6.h> 16 #include <uapi/linux/btf.h> 17 #include <linux/btf_ids.h> 18 + #include <asm/rqspinlock.h> 19 #include <linux/bpf_mem_alloc.h> 20 21 /* Intermediate node */ ··· 36 size_t n_entries; 37 size_t max_prefixlen; 38 size_t data_size; 39 + rqspinlock_t lock; 40 }; 41 42 /* This trie implements a longest prefix match algorithm that can be used to ··· 342 if (!new_node) 343 return -ENOMEM; 344 345 + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); 346 + if (ret) 347 + goto out_free; 348 349 new_node->prefixlen = key->prefixlen; 350 RCU_INIT_POINTER(new_node->child[0], NULL); ··· 356 */ 357 slot = &trie->root; 358 359 + while ((node = rcu_dereference(*slot))) { 360 matchlen = longest_prefix_match(trie, node, key); 361 362 if (node->prefixlen != matchlen || ··· 442 rcu_assign_pointer(*slot, im_node); 443 444 out: 445 + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); 446 + out_free: 447 if (ret) 448 bpf_mem_cache_free(&trie->ma, new_node); 449 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 467 if (key->prefixlen > trie->max_prefixlen) 468 return -EINVAL; 469 470 + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); 471 + if (ret) 472 + return ret; 473 474 /* Walk the tree looking for an exact key/length match and keeping 475 * track of the path we traverse. We will need to know the node ··· 478 trim = &trie->root; 479 trim2 = trim; 480 parent = NULL; 481 + while ((node = rcu_dereference(*trim))) { 482 matchlen = longest_prefix_match(trie, node, key); 483 484 if (node->prefixlen != matchlen || ··· 543 free_node = node; 544 545 out: 546 + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); 547 548 bpf_mem_cache_free_rcu(&trie->ma, free_parent); 549 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 592 offsetof(struct bpf_lpm_trie_key_u8, data); 593 trie->max_prefixlen = trie->data_size * 8; 594 595 + raw_res_spin_lock_init(&trie->lock); 596 597 /* Allocate intermediate and leaf nodes from the same allocator */ 598 leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
kernel/bpf/percpu_freelist.c  +28 -91
··· 14 for_each_possible_cpu(cpu) { 15 struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); 16 17 - raw_spin_lock_init(&head->lock); 18 head->first = NULL; 19 } 20 - raw_spin_lock_init(&s->extralist.lock); 21 - s->extralist.first = NULL; 22 return 0; 23 } 24 ··· 32 WRITE_ONCE(head->first, node); 33 } 34 35 - static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, 36 struct pcpu_freelist_node *node) 37 { 38 - raw_spin_lock(&head->lock); 39 - pcpu_freelist_push_node(head, node); 40 - raw_spin_unlock(&head->lock); 41 - } 42 - 43 - static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, 44 - struct pcpu_freelist_node *node) 45 - { 46 - if (!raw_spin_trylock(&s->extralist.lock)) 47 return false; 48 - 49 - pcpu_freelist_push_node(&s->extralist, node); 50 - raw_spin_unlock(&s->extralist.lock); 51 return true; 52 - } 53 - 54 - static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, 55 - struct pcpu_freelist_node *node) 56 - { 57 - int cpu, orig_cpu; 58 - 59 - orig_cpu = raw_smp_processor_id(); 60 - while (1) { 61 - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { 62 - struct pcpu_freelist_head *head; 63 - 64 - head = per_cpu_ptr(s->freelist, cpu); 65 - if (raw_spin_trylock(&head->lock)) { 66 - pcpu_freelist_push_node(head, node); 67 - raw_spin_unlock(&head->lock); 68 - return; 69 - } 70 - } 71 - 72 - /* cannot lock any per cpu lock, try extralist */ 73 - if (pcpu_freelist_try_push_extra(s, node)) 74 - return; 75 - } 76 } 77 78 void __pcpu_freelist_push(struct pcpu_freelist *s, 79 struct pcpu_freelist_node *node) 80 { 81 - if (in_nmi()) 82 - ___pcpu_freelist_push_nmi(s, node); 83 - else 84 - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); 85 } 86 87 void pcpu_freelist_push(struct pcpu_freelist *s, ··· 99 100 static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) 101 { 102 struct pcpu_freelist_head *head; 103 - struct pcpu_freelist_node *node; 104 int cpu; 105 106 for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 107 head = per_cpu_ptr(s->freelist, cpu); 108 if (!READ_ONCE(head->first)) 109 continue; 110 - raw_spin_lock(&head->lock); 111 node = head->first; 112 if (node) { 113 WRITE_ONCE(head->first, node->next); 114 - raw_spin_unlock(&head->lock); 115 return node; 116 } 117 - raw_spin_unlock(&head->lock); 118 } 119 - 120 - /* per cpu lists are all empty, try extralist */ 121 - if (!READ_ONCE(s->extralist.first)) 122 - return NULL; 123 - raw_spin_lock(&s->extralist.lock); 124 - node = s->extralist.first; 125 - if (node) 126 - WRITE_ONCE(s->extralist.first, node->next); 127 - raw_spin_unlock(&s->extralist.lock); 128 - return node; 129 - } 130 - 131 - static struct pcpu_freelist_node * 132 - ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) 133 - { 134 - struct pcpu_freelist_head *head; 135 - struct pcpu_freelist_node *node; 136 - int cpu; 137 - 138 - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 139 - head = per_cpu_ptr(s->freelist, cpu); 140 - if (!READ_ONCE(head->first)) 141 - continue; 142 - if (raw_spin_trylock(&head->lock)) { 143 - node = head->first; 144 - if (node) { 145 - WRITE_ONCE(head->first, node->next); 146 - raw_spin_unlock(&head->lock); 147 - return node; 148 - } 149 - raw_spin_unlock(&head->lock); 150 - } 151 - } 152 - 153 - /* cannot pop from per cpu lists, try extralist */ 154 - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) 155 - return NULL; 156 - node = s->extralist.first; 157 - if (node) 158 - 
WRITE_ONCE(s->extralist.first, node->next); 159 - raw_spin_unlock(&s->extralist.lock); 160 return node; 161 } 162 163 struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) 164 { 165 - if (in_nmi()) 166 - return ___pcpu_freelist_pop_nmi(s); 167 return ___pcpu_freelist_pop(s); 168 } 169
··· 14 for_each_possible_cpu(cpu) { 15 struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); 16 17 + raw_res_spin_lock_init(&head->lock); 18 head->first = NULL; 19 } 20 return 0; 21 } 22 ··· 34 WRITE_ONCE(head->first, node); 35 } 36 37 + static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, 38 struct pcpu_freelist_node *node) 39 { 40 + if (raw_res_spin_lock(&head->lock)) 41 return false; 42 + pcpu_freelist_push_node(head, node); 43 + raw_res_spin_unlock(&head->lock); 44 return true; 45 } 46 47 void __pcpu_freelist_push(struct pcpu_freelist *s, 48 struct pcpu_freelist_node *node) 49 { 50 + struct pcpu_freelist_head *head; 51 + int cpu; 52 + 53 + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) 54 + return; 55 + 56 + while (true) { 57 + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 58 + if (cpu == raw_smp_processor_id()) 59 + continue; 60 + head = per_cpu_ptr(s->freelist, cpu); 61 + if (raw_res_spin_lock(&head->lock)) 62 + continue; 63 + pcpu_freelist_push_node(head, node); 64 + raw_res_spin_unlock(&head->lock); 65 + return; 66 + } 67 + } 68 } 69 70 void pcpu_freelist_push(struct pcpu_freelist *s, ··· 120 121 static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) 122 { 123 + struct pcpu_freelist_node *node = NULL; 124 struct pcpu_freelist_head *head; 125 int cpu; 126 127 for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 128 head = per_cpu_ptr(s->freelist, cpu); 129 if (!READ_ONCE(head->first)) 130 continue; 131 + if (raw_res_spin_lock(&head->lock)) 132 + continue; 133 node = head->first; 134 if (node) { 135 WRITE_ONCE(head->first, node->next); 136 + raw_res_spin_unlock(&head->lock); 137 return node; 138 } 139 + raw_res_spin_unlock(&head->lock); 140 } 141 return node; 142 } 143 144 struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) 145 { 146 return ___pcpu_freelist_pop(s); 147 } 148
kernel/bpf/percpu_freelist.h  +2 -2
··· 5 #define __PERCPU_FREELIST_H__ 6 #include <linux/spinlock.h> 7 #include <linux/percpu.h> 8 9 struct pcpu_freelist_head { 10 struct pcpu_freelist_node *first; 11 - raw_spinlock_t lock; 12 }; 13 14 struct pcpu_freelist { 15 struct pcpu_freelist_head __percpu *freelist; 16 - struct pcpu_freelist_head extralist; 17 }; 18 19 struct pcpu_freelist_node {
··· 5 #define __PERCPU_FREELIST_H__ 6 #include <linux/spinlock.h> 7 #include <linux/percpu.h> 8 + #include <asm/rqspinlock.h> 9 10 struct pcpu_freelist_head { 11 struct pcpu_freelist_node *first; 12 + rqspinlock_t lock; 13 }; 14 15 struct pcpu_freelist { 16 struct pcpu_freelist_head __percpu *freelist; 17 }; 18 19 struct pcpu_freelist_node {
kernel/bpf/rqspinlock.c  +737
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Resilient Queued Spin Lock 4 + * 5 + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 6 + * (C) Copyright 2013-2014,2018 Red Hat, Inc. 7 + * (C) Copyright 2015 Intel Corp. 8 + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP 9 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 10 + * 11 + * Authors: Waiman Long <longman@redhat.com> 12 + * Peter Zijlstra <peterz@infradead.org> 13 + * Kumar Kartikeya Dwivedi <memxor@gmail.com> 14 + */ 15 + 16 + #include <linux/smp.h> 17 + #include <linux/bug.h> 18 + #include <linux/bpf.h> 19 + #include <linux/err.h> 20 + #include <linux/cpumask.h> 21 + #include <linux/percpu.h> 22 + #include <linux/hardirq.h> 23 + #include <linux/mutex.h> 24 + #include <linux/prefetch.h> 25 + #include <asm/byteorder.h> 26 + #ifdef CONFIG_QUEUED_SPINLOCKS 27 + #include <asm/qspinlock.h> 28 + #endif 29 + #include <trace/events/lock.h> 30 + #include <asm/rqspinlock.h> 31 + #include <linux/timekeeping.h> 32 + 33 + /* 34 + * Include queued spinlock definitions and statistics code 35 + */ 36 + #ifdef CONFIG_QUEUED_SPINLOCKS 37 + #include "../locking/qspinlock.h" 38 + #include "../locking/lock_events.h" 39 + #include "rqspinlock.h" 40 + #include "../locking/mcs_spinlock.h" 41 + #endif 42 + 43 + /* 44 + * The basic principle of a queue-based spinlock can best be understood 45 + * by studying a classic queue-based spinlock implementation called the 46 + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable 47 + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and 48 + * Scott") is available at 49 + * 50 + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 51 + * 52 + * This queued spinlock implementation is based on the MCS lock, however to 53 + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its 54 + * existing API, we must modify it somehow. 55 + * 56 + * In particular; where the traditional MCS lock consists of a tail pointer 57 + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to 58 + * unlock the next pending (next->locked), we compress both these: {tail, 59 + * next->locked} into a single u32 value. 60 + * 61 + * Since a spinlock disables recursion of its own context and there is a limit 62 + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there 63 + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now 64 + * we can encode the tail by combining the 2-bit nesting level with the cpu 65 + * number. With one byte for the lock value and 3 bytes for the tail, only a 66 + * 32-bit word is now needed. Even though we only need 1 bit for the lock, 67 + * we extend it to a full byte to achieve better performance for architectures 68 + * that support atomic byte write. 69 + * 70 + * We also change the first spinner to spin on the lock bit instead of its 71 + * node; whereby avoiding the need to carry a node from lock to unlock, and 72 + * preserving existing lock API. This also makes the unlock code simpler and 73 + * faster. 74 + * 75 + * N.B. The current implementation only supports architectures that allow 76 + * atomic operations on smaller 8-bit and 16-bit data types. 
77 + * 78 + */ 79 + 80 + struct rqspinlock_timeout { 81 + u64 timeout_end; 82 + u64 duration; 83 + u64 cur; 84 + u16 spin; 85 + }; 86 + 87 + #define RES_TIMEOUT_VAL 2 88 + 89 + DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); 90 + EXPORT_SYMBOL_GPL(rqspinlock_held_locks); 91 + 92 + static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) 93 + { 94 + if (!(atomic_read_acquire(&lock->val) & (mask))) 95 + return true; 96 + return false; 97 + } 98 + 99 + static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, 100 + struct rqspinlock_timeout *ts) 101 + { 102 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 103 + int cnt = min(RES_NR_HELD, rqh->cnt); 104 + 105 + /* 106 + * Return an error if we hold the lock we are attempting to acquire. 107 + * We'll iterate over max 32 locks; no need to do is_lock_released. 108 + */ 109 + for (int i = 0; i < cnt - 1; i++) { 110 + if (rqh->locks[i] == lock) 111 + return -EDEADLK; 112 + } 113 + return 0; 114 + } 115 + 116 + /* 117 + * This focuses on the most common case of ABBA deadlocks (or ABBA involving 118 + * more locks, which reduce to ABBA). This is not exhaustive, and we rely on 119 + * timeouts as the final line of defense. 120 + */ 121 + static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, 122 + struct rqspinlock_timeout *ts) 123 + { 124 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 125 + int rqh_cnt = min(RES_NR_HELD, rqh->cnt); 126 + void *remote_lock; 127 + int cpu; 128 + 129 + /* 130 + * Find the CPU holding the lock that we want to acquire. If there is a 131 + * deadlock scenario, we will read a stable set on the remote CPU and 132 + * find the target. This would be a constant time operation instead of 133 + * O(NR_CPUS) if we could determine the owning CPU from a lock value, but 134 + * that requires increasing the size of the lock word. 135 + */ 136 + for_each_possible_cpu(cpu) { 137 + struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); 138 + int real_cnt = READ_ONCE(rqh_cpu->cnt); 139 + int cnt = min(RES_NR_HELD, real_cnt); 140 + 141 + /* 142 + * Let's ensure to break out of this loop if the lock is available for 143 + * us to potentially acquire. 144 + */ 145 + if (is_lock_released(lock, mask, ts)) 146 + return 0; 147 + 148 + /* 149 + * Skip ourselves, and CPUs whose count is less than 2, as they need at 150 + * least one held lock and one acquisition attempt (reflected as top 151 + * most entry) to participate in an ABBA deadlock. 152 + * 153 + * If cnt is more than RES_NR_HELD, it means the current lock being 154 + * acquired won't appear in the table, and other locks in the table are 155 + * already held, so we can't determine ABBA. 156 + */ 157 + if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) 158 + continue; 159 + 160 + /* 161 + * Obtain the entry at the top, this corresponds to the lock the 162 + * remote CPU is attempting to acquire in a deadlock situation, 163 + * and would be one of the locks we hold on the current CPU. 164 + */ 165 + remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); 166 + /* 167 + * If it is NULL, we've raced and cannot determine a deadlock 168 + * conclusively, skip this CPU. 169 + */ 170 + if (!remote_lock) 171 + continue; 172 + /* 173 + * Find if the lock we're attempting to acquire is held by this CPU. 174 + * Don't consider the topmost entry, as that must be the latest lock 175 + * being held or acquired. 
For a deadlock, the target CPU must also 176 + * attempt to acquire a lock we hold, so for this search only 'cnt - 1' 177 + * entries are important. 178 + */ 179 + for (int i = 0; i < cnt - 1; i++) { 180 + if (READ_ONCE(rqh_cpu->locks[i]) != lock) 181 + continue; 182 + /* 183 + * We found our lock as held on the remote CPU. Is the 184 + * acquisition attempt on the remote CPU for a lock held 185 + * by us? If so, we have a deadlock situation, and need 186 + * to recover. 187 + */ 188 + for (int i = 0; i < rqh_cnt - 1; i++) { 189 + if (rqh->locks[i] == remote_lock) 190 + return -EDEADLK; 191 + } 192 + /* 193 + * Inconclusive; retry again later. 194 + */ 195 + return 0; 196 + } 197 + } 198 + return 0; 199 + } 200 + 201 + static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, 202 + struct rqspinlock_timeout *ts) 203 + { 204 + int ret; 205 + 206 + ret = check_deadlock_AA(lock, mask, ts); 207 + if (ret) 208 + return ret; 209 + ret = check_deadlock_ABBA(lock, mask, ts); 210 + if (ret) 211 + return ret; 212 + 213 + return 0; 214 + } 215 + 216 + static noinline int check_timeout(rqspinlock_t *lock, u32 mask, 217 + struct rqspinlock_timeout *ts) 218 + { 219 + u64 time = ktime_get_mono_fast_ns(); 220 + u64 prev = ts->cur; 221 + 222 + if (!ts->timeout_end) { 223 + ts->cur = time; 224 + ts->timeout_end = time + ts->duration; 225 + return 0; 226 + } 227 + 228 + if (time > ts->timeout_end) 229 + return -ETIMEDOUT; 230 + 231 + /* 232 + * A millisecond interval passed from last time? Trigger deadlock 233 + * checks. 234 + */ 235 + if (prev + NSEC_PER_MSEC < time) { 236 + ts->cur = time; 237 + return check_deadlock(lock, mask, ts); 238 + } 239 + 240 + return 0; 241 + } 242 + 243 + /* 244 + * Do not amortize with spins when res_smp_cond_load_acquire is defined, 245 + * as the macro does internal amortization for us. 246 + */ 247 + #ifndef res_smp_cond_load_acquire 248 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 249 + ({ \ 250 + if (!(ts).spin++) \ 251 + (ret) = check_timeout((lock), (mask), &(ts)); \ 252 + (ret); \ 253 + }) 254 + #else 255 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 256 + ({ (ret) = check_timeout(&(ts)); }) 257 + #endif 258 + 259 + /* 260 + * Initialize the 'spin' member. 261 + * Set spin member to 0 to trigger AA/ABBA checks immediately. 262 + */ 263 + #define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) 264 + 265 + /* 266 + * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. 267 + * Duration is defined for each spin attempt, so set it here. 268 + */ 269 + #define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) 270 + 271 + /* 272 + * Provide a test-and-set fallback for cases when queued spin lock support is 273 + * absent from the architecture. 274 + */ 275 + int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) 276 + { 277 + struct rqspinlock_timeout ts; 278 + int val, ret = 0; 279 + 280 + RES_INIT_TIMEOUT(ts); 281 + grab_held_lock_entry(lock); 282 + 283 + /* 284 + * Since the waiting loop's time is dependent on the amount of 285 + * contention, a short timeout unlike rqspinlock waiting loops 286 + * isn't enough. Choose a second as the timeout value. 
287 + */ 288 + RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); 289 + retry: 290 + val = atomic_read(&lock->val); 291 + 292 + if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { 293 + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) 294 + goto out; 295 + cpu_relax(); 296 + goto retry; 297 + } 298 + 299 + return 0; 300 + out: 301 + release_held_lock_entry(); 302 + return ret; 303 + } 304 + EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); 305 + 306 + #ifdef CONFIG_QUEUED_SPINLOCKS 307 + 308 + /* 309 + * Per-CPU queue node structures; we can never have more than 4 nested 310 + * contexts: task, softirq, hardirq, nmi. 311 + * 312 + * Exactly fits one 64-byte cacheline on a 64-bit architecture. 313 + */ 314 + static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); 315 + 316 + #ifndef res_smp_cond_load_acquire 317 + #define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) 318 + #endif 319 + 320 + #define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) 321 + 322 + /** 323 + * resilient_queued_spin_lock_slowpath - acquire the queued spinlock 324 + * @lock: Pointer to queued spinlock structure 325 + * @val: Current value of the queued spinlock 32-bit word 326 + * 327 + * Return: 328 + * * 0 - Lock was acquired successfully. 329 + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. 330 + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 331 + * 332 + * (queue tail, pending bit, lock value) 333 + * 334 + * fast : slow : unlock 335 + * : : 336 + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) 337 + * : | ^--------.------. / : 338 + * : v \ \ | : 339 + * pending : (0,1,1) +--> (0,1,0) \ | : 340 + * : | ^--' | | : 341 + * : v | | : 342 + * uncontended : (n,x,y) +--> (n,0,0) --' | : 343 + * queue : | ^--' | : 344 + * : v | : 345 + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : 346 + * queue : ^--' : 347 + */ 348 + int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) 349 + { 350 + struct mcs_spinlock *prev, *next, *node; 351 + struct rqspinlock_timeout ts; 352 + int idx, ret = 0; 353 + u32 old, tail; 354 + 355 + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); 356 + 357 + if (resilient_virt_spin_lock_enabled()) 358 + return resilient_virt_spin_lock(lock); 359 + 360 + RES_INIT_TIMEOUT(ts); 361 + 362 + /* 363 + * Wait for in-progress pending->locked hand-overs with a bounded 364 + * number of spins so that we guarantee forward progress. 365 + * 366 + * 0,1,0 -> 0,0,1 367 + */ 368 + if (val == _Q_PENDING_VAL) { 369 + int cnt = _Q_PENDING_LOOPS; 370 + val = atomic_cond_read_relaxed(&lock->val, 371 + (VAL != _Q_PENDING_VAL) || !cnt--); 372 + } 373 + 374 + /* 375 + * If we observe any contention; queue. 376 + */ 377 + if (val & ~_Q_LOCKED_MASK) 378 + goto queue; 379 + 380 + /* 381 + * trylock || pending 382 + * 383 + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock 384 + */ 385 + val = queued_fetch_set_pending_acquire(lock); 386 + 387 + /* 388 + * If we observe contention, there is a concurrent locker. 389 + * 390 + * Undo and queue; our setting of PENDING might have made the 391 + * n,0,0 -> 0,0,0 transition fail and it will now be waiting 392 + * on @next to become !NULL. 393 + */ 394 + if (unlikely(val & ~_Q_LOCKED_MASK)) { 395 + 396 + /* Undo PENDING if we set it. */ 397 + if (!(val & _Q_PENDING_MASK)) 398 + clear_pending(lock); 399 + 400 + goto queue; 401 + } 402 + 403 + /* 404 + * Grab an entry in the held locks array, to enable deadlock detection. 
405 + */ 406 + grab_held_lock_entry(lock); 407 + 408 + /* 409 + * We're pending, wait for the owner to go away. 410 + * 411 + * 0,1,1 -> *,1,0 412 + * 413 + * this wait loop must be a load-acquire such that we match the 414 + * store-release that clears the locked bit and create lock 415 + * sequentiality; this is because not all 416 + * clear_pending_set_locked() implementations imply full 417 + * barriers. 418 + */ 419 + if (val & _Q_LOCKED_MASK) { 420 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); 421 + res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); 422 + } 423 + 424 + if (ret) { 425 + /* 426 + * We waited for the locked bit to go back to 0, as the pending 427 + * waiter, but timed out. We need to clear the pending bit since 428 + * we own it. Once a stuck owner has been recovered, the lock 429 + * must be restored to a valid state, hence removing the pending 430 + * bit is necessary. 431 + * 432 + * *,1,* -> *,0,* 433 + */ 434 + clear_pending(lock); 435 + lockevent_inc(rqspinlock_lock_timeout); 436 + goto err_release_entry; 437 + } 438 + 439 + /* 440 + * take ownership and clear the pending bit. 441 + * 442 + * 0,1,0 -> 0,0,1 443 + */ 444 + clear_pending_set_locked(lock); 445 + lockevent_inc(lock_pending); 446 + return 0; 447 + 448 + /* 449 + * End of pending bit optimistic spinning and beginning of MCS 450 + * queuing. 451 + */ 452 + queue: 453 + lockevent_inc(lock_slowpath); 454 + /* 455 + * Grab deadlock detection entry for the queue path. 456 + */ 457 + grab_held_lock_entry(lock); 458 + 459 + node = this_cpu_ptr(&rqnodes[0].mcs); 460 + idx = node->count++; 461 + tail = encode_tail(smp_processor_id(), idx); 462 + 463 + trace_contention_begin(lock, LCB_F_SPIN); 464 + 465 + /* 466 + * 4 nodes are allocated based on the assumption that there will 467 + * not be nested NMIs taking spinlocks. That may not be true in 468 + * some architectures even though the chance of needing more than 469 + * 4 nodes will still be extremely unlikely. When that happens, 470 + * we fall back to spinning on the lock directly without using 471 + * any MCS node. This is not the most elegant solution, but is 472 + * simple enough. 473 + */ 474 + if (unlikely(idx >= _Q_MAX_NODES)) { 475 + lockevent_inc(lock_no_node); 476 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); 477 + while (!queued_spin_trylock(lock)) { 478 + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { 479 + lockevent_inc(rqspinlock_lock_timeout); 480 + goto err_release_node; 481 + } 482 + cpu_relax(); 483 + } 484 + goto release; 485 + } 486 + 487 + node = grab_mcs_node(node, idx); 488 + 489 + /* 490 + * Keep counts of non-zero index values: 491 + */ 492 + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); 493 + 494 + /* 495 + * Ensure that we increment the head node->count before initialising 496 + * the actual node. If the compiler is kind enough to reorder these 497 + * stores, then an IRQ could overwrite our assignments. 498 + */ 499 + barrier(); 500 + 501 + node->locked = 0; 502 + node->next = NULL; 503 + 504 + /* 505 + * We touched a (possibly) cold cacheline in the per-cpu queue node; 506 + * attempt the trylock once more in the hope someone let go while we 507 + * weren't watching. 508 + */ 509 + if (queued_spin_trylock(lock)) 510 + goto release; 511 + 512 + /* 513 + * Ensure that the initialisation of @node is complete before we 514 + * publish the updated tail via xchg_tail() and potentially link 515 + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. 
516 + */ 517 + smp_wmb(); 518 + 519 + /* 520 + * Publish the updated tail. 521 + * We have already touched the queueing cacheline; don't bother with 522 + * pending stuff. 523 + * 524 + * p,*,* -> n,*,* 525 + */ 526 + old = xchg_tail(lock, tail); 527 + next = NULL; 528 + 529 + /* 530 + * if there was a previous node; link it and wait until reaching the 531 + * head of the waitqueue. 532 + */ 533 + if (old & _Q_TAIL_MASK) { 534 + int val; 535 + 536 + prev = decode_tail(old, rqnodes); 537 + 538 + /* Link @node into the waitqueue. */ 539 + WRITE_ONCE(prev->next, node); 540 + 541 + val = arch_mcs_spin_lock_contended(&node->locked); 542 + if (val == RES_TIMEOUT_VAL) { 543 + ret = -EDEADLK; 544 + goto waitq_timeout; 545 + } 546 + 547 + /* 548 + * While waiting for the MCS lock, the next pointer may have 549 + * been set by another lock waiter. We optimistically load 550 + * the next pointer & prefetch the cacheline for writing 551 + * to reduce latency in the upcoming MCS unlock operation. 552 + */ 553 + next = READ_ONCE(node->next); 554 + if (next) 555 + prefetchw(next); 556 + } 557 + 558 + /* 559 + * we're at the head of the waitqueue, wait for the owner & pending to 560 + * go away. 561 + * 562 + * *,x,y -> *,0,0 563 + * 564 + * this wait loop must use a load-acquire such that we match the 565 + * store-release that clears the locked bit and create lock 566 + * sequentiality; this is because the set_locked() function below 567 + * does not imply a full barrier. 568 + * 569 + * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is 570 + * meant to span maximum allowed time per critical section, and we may 571 + * have both the owner of the lock and the pending bit waiter ahead of 572 + * us. 573 + */ 574 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); 575 + val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || 576 + RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); 577 + 578 + waitq_timeout: 579 + if (ret) { 580 + /* 581 + * If the tail is still pointing to us, then we are the final waiter, 582 + * and are responsible for resetting the tail back to 0. Otherwise, if 583 + * the cmpxchg operation fails, we signal the next waiter to take exit 584 + * and try the same. For a waiter with tail node 'n': 585 + * 586 + * n,*,* -> 0,*,* 587 + * 588 + * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is 589 + * possible locked/pending bits keep changing and we see failures even 590 + * when we remain the head of wait queue. However, eventually, 591 + * pending bit owner will unset the pending bit, and new waiters 592 + * will queue behind us. This will leave the lock owner in 593 + * charge, and it will eventually either set locked bit to 0, or 594 + * leave it as 1, allowing us to make progress. 595 + * 596 + * We terminate the whole wait queue for two reasons. Firstly, 597 + * we eschew per-waiter timeouts with one applied at the head of 598 + * the wait queue. This allows everyone to break out faster 599 + * once we've seen the owner / pending waiter not responding for 600 + * the timeout duration from the head. Secondly, it avoids 601 + * complicated synchronization, because when not leaving in FIFO 602 + * order, prev's next pointer needs to be fixed up etc. 
603 + */ 604 + if (!try_cmpxchg_tail(lock, tail, 0)) { 605 + next = smp_cond_load_relaxed(&node->next, VAL); 606 + WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); 607 + } 608 + lockevent_inc(rqspinlock_lock_timeout); 609 + goto err_release_node; 610 + } 611 + 612 + /* 613 + * claim the lock: 614 + * 615 + * n,0,0 -> 0,0,1 : lock, uncontended 616 + * *,*,0 -> *,*,1 : lock, contended 617 + * 618 + * If the queue head is the only one in the queue (lock value == tail) 619 + * and nobody is pending, clear the tail code and grab the lock. 620 + * Otherwise, we only need to grab the lock. 621 + */ 622 + 623 + /* 624 + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the 625 + * above wait condition, therefore any concurrent setting of 626 + * PENDING will make the uncontended transition fail. 627 + */ 628 + if ((val & _Q_TAIL_MASK) == tail) { 629 + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) 630 + goto release; /* No contention */ 631 + } 632 + 633 + /* 634 + * Either somebody is queued behind us or _Q_PENDING_VAL got set 635 + * which will then detect the remaining tail and queue behind us 636 + * ensuring we'll see a @next. 637 + */ 638 + set_locked(lock); 639 + 640 + /* 641 + * contended path; wait for next if not observed yet, release. 642 + */ 643 + if (!next) 644 + next = smp_cond_load_relaxed(&node->next, (VAL)); 645 + 646 + arch_mcs_spin_unlock_contended(&next->locked); 647 + 648 + release: 649 + trace_contention_end(lock, 0); 650 + 651 + /* 652 + * release the node 653 + */ 654 + __this_cpu_dec(rqnodes[0].mcs.count); 655 + return ret; 656 + err_release_node: 657 + trace_contention_end(lock, ret); 658 + __this_cpu_dec(rqnodes[0].mcs.count); 659 + err_release_entry: 660 + release_held_lock_entry(); 661 + return ret; 662 + } 663 + EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); 664 + 665 + #endif /* CONFIG_QUEUED_SPINLOCKS */ 666 + 667 + __bpf_kfunc_start_defs(); 668 + 669 + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) 670 + { 671 + int ret; 672 + 673 + BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); 674 + BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); 675 + 676 + preempt_disable(); 677 + ret = res_spin_lock((rqspinlock_t *)lock); 678 + if (unlikely(ret)) { 679 + preempt_enable(); 680 + return ret; 681 + } 682 + return 0; 683 + } 684 + 685 + __bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) 686 + { 687 + res_spin_unlock((rqspinlock_t *)lock); 688 + preempt_enable(); 689 + } 690 + 691 + __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 692 + { 693 + u64 *ptr = (u64 *)flags__irq_flag; 694 + unsigned long flags; 695 + int ret; 696 + 697 + preempt_disable(); 698 + local_irq_save(flags); 699 + ret = res_spin_lock((rqspinlock_t *)lock); 700 + if (unlikely(ret)) { 701 + local_irq_restore(flags); 702 + preempt_enable(); 703 + return ret; 704 + } 705 + *ptr = flags; 706 + return 0; 707 + } 708 + 709 + __bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 710 + { 711 + u64 *ptr = (u64 *)flags__irq_flag; 712 + unsigned long flags = *ptr; 713 + 714 + res_spin_unlock((rqspinlock_t *)lock); 715 + local_irq_restore(flags); 716 + preempt_enable(); 717 + } 718 + 719 + __bpf_kfunc_end_defs(); 720 + 721 + BTF_KFUNCS_START(rqspinlock_kfunc_ids) 722 + BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) 723 + BTF_ID_FLAGS(func, bpf_res_spin_unlock) 724 + 
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) 725 + BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore) 726 + BTF_KFUNCS_END(rqspinlock_kfunc_ids) 727 + 728 + static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { 729 + .owner = THIS_MODULE, 730 + .set = &rqspinlock_kfunc_ids, 731 + }; 732 + 733 + static __init int rqspinlock_register_kfuncs(void) 734 + { 735 + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); 736 + } 737 + late_initcall(rqspinlock_register_kfuncs);
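For the in-kernel users converted later in this series (hashtab, percpu_freelist, lpm_trie), the practical difference from raw_spinlock_t is that acquisition can now fail, so callers must check and propagate the return value rather than assume the lock was taken. A minimal sketch of that pattern, assuming the raw_res_spin_lock_irqsave()/raw_res_spin_unlock_irqrestore() macros exercised by the locktorture ops further down in this diff; the bucket structure and update_elem() caller are hypothetical, not code from the converted maps:

#include <asm/rqspinlock.h>

struct bucket {
	struct hlist_head head;
	rqspinlock_t lock;	/* was raw_spinlock_t before the conversion */
};

static int update_elem(struct bucket *b)	/* hypothetical caller */
{
	unsigned long flags;
	int ret;

	/* May fail with -EDEADLK or -ETIMEDOUT instead of hanging forever. */
	ret = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (ret)
		return ret;	/* propagate the failure to the caller */
	/* ... critical section ... */
	raw_res_spin_unlock_irqrestore(&b->lock, flags);
	return 0;
}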
+48
kernel/bpf/rqspinlock.h
···
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Resilient Queued Spin Lock defines 4 + * 5 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 6 + * 7 + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> 8 + */ 9 + #ifndef __LINUX_RQSPINLOCK_H 10 + #define __LINUX_RQSPINLOCK_H 11 + 12 + #include "../locking/qspinlock.h" 13 + 14 + /* 15 + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value 16 + * @lock: Pointer to queued spinlock structure 17 + * @tail: The tail to compare against 18 + * @new_tail: The new queue tail code word 19 + * Return: Bool to indicate whether the cmpxchg operation succeeded 20 + * 21 + * This is used by the head of the wait queue to clean up the queue. 22 + * Provides relaxed ordering, since observers only rely on initialized 23 + * state of the node which was made visible through the xchg_tail operation, 24 + * i.e. through the smp_wmb preceding xchg_tail. 25 + * 26 + * We avoid using 16-bit cmpxchg, which is not available on all architectures. 27 + */ 28 + static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) 29 + { 30 + u32 old, new; 31 + 32 + old = atomic_read(&lock->val); 33 + do { 34 + /* 35 + * Is the tail part we compare to already stale? Fail. 36 + */ 37 + if ((old & _Q_TAIL_MASK) != tail) 38 + return false; 39 + /* 40 + * Encode latest locked/pending state for new tail. 41 + */ 42 + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; 43 + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 44 + 45 + return true; 46 + } 47 + 48 + #endif /* __LINUX_RQSPINLOCK_H */
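Note that only the tail bits take part in the comparison above; the current locked/pending bits are re-read and folded into the replacement word on every retry, so a concurrent change to those bits can never make the operation fail, only a change of the tail itself can. A userspace model of that behaviour using C11 atomics (the masks are an illustrative split of a 32-bit word, not the kernel's _Q_* layout):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define TAIL_MASK	0xffff0000u	/* illustrative: "tail" portion of the word */
#define LP_MASK		0x0000ffffu	/* illustrative: "locked + pending" portion */

static bool model_try_cmpxchg_tail(_Atomic uint32_t *val, uint32_t tail, uint32_t new_tail)
{
	uint32_t old = atomic_load_explicit(val, memory_order_relaxed), new;

	do {
		if ((old & TAIL_MASK) != tail)		/* tail already moved on: fail */
			return false;
		new = (old & LP_MASK) | new_tail;	/* carry the latest locked/pending bits */
	} while (!atomic_compare_exchange_weak_explicit(val, &old, new,
							memory_order_relaxed,
							memory_order_relaxed));
	return true;
}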
+5 -1
kernel/bpf/syscall.c
··· 648 case BPF_RB_ROOT: 649 case BPF_RB_NODE: 650 case BPF_SPIN_LOCK: 651 case BPF_TIMER: 652 case BPF_REFCOUNT: 653 case BPF_WORKQUEUE: ··· 701 case BPF_RB_ROOT: 702 case BPF_RB_NODE: 703 case BPF_SPIN_LOCK: 704 case BPF_TIMER: 705 case BPF_REFCOUNT: 706 case BPF_WORKQUEUE: ··· 779 780 switch (fields[i].type) { 781 case BPF_SPIN_LOCK: 782 break; 783 case BPF_TIMER: 784 bpf_timer_cancel_and_free(field_ptr); ··· 1215 return -EINVAL; 1216 1217 map->record = btf_parse_fields(btf, value_type, 1218 - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1219 BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, 1220 map->value_size); 1221 if (!IS_ERR_OR_NULL(map->record)) { ··· 1234 case 0: 1235 continue; 1236 case BPF_SPIN_LOCK: 1237 if (map->map_type != BPF_MAP_TYPE_HASH && 1238 map->map_type != BPF_MAP_TYPE_ARRAY && 1239 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
··· 648 case BPF_RB_ROOT: 649 case BPF_RB_NODE: 650 case BPF_SPIN_LOCK: 651 + case BPF_RES_SPIN_LOCK: 652 case BPF_TIMER: 653 case BPF_REFCOUNT: 654 case BPF_WORKQUEUE: ··· 700 case BPF_RB_ROOT: 701 case BPF_RB_NODE: 702 case BPF_SPIN_LOCK: 703 + case BPF_RES_SPIN_LOCK: 704 case BPF_TIMER: 705 case BPF_REFCOUNT: 706 case BPF_WORKQUEUE: ··· 777 778 switch (fields[i].type) { 779 case BPF_SPIN_LOCK: 780 + case BPF_RES_SPIN_LOCK: 781 break; 782 case BPF_TIMER: 783 bpf_timer_cancel_and_free(field_ptr); ··· 1212 return -EINVAL; 1213 1214 map->record = btf_parse_fields(btf, value_type, 1215 + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1216 BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, 1217 map->value_size); 1218 if (!IS_ERR_OR_NULL(map->record)) { ··· 1231 case 0: 1232 continue; 1233 case BPF_SPIN_LOCK: 1234 + case BPF_RES_SPIN_LOCK: 1235 if (map->map_type != BPF_MAP_TYPE_HASH && 1236 map->map_type != BPF_MAP_TYPE_ARRAY && 1237 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+206 -42
kernel/bpf/verifier.c
··· 456 457 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) 458 { 459 - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); 460 } 461 462 static bool type_is_rdonly_mem(u32 type) ··· 1155 1156 static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, 1157 struct bpf_kfunc_call_arg_meta *meta, 1158 - struct bpf_reg_state *reg, int insn_idx) 1159 { 1160 struct bpf_func_state *state = func(env, reg); 1161 struct bpf_stack_state *slot; ··· 1178 st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ 1179 st->live |= REG_LIVE_WRITTEN; 1180 st->ref_obj_id = id; 1181 1182 for (i = 0; i < BPF_REG_SIZE; i++) 1183 slot->slot_type[i] = STACK_IRQ_FLAG; ··· 1187 return 0; 1188 } 1189 1190 - static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) 1191 { 1192 struct bpf_func_state *state = func(env, reg); 1193 struct bpf_stack_state *slot; ··· 1201 1202 slot = &state->stack[spi]; 1203 st = &slot->spilled_ptr; 1204 1205 err = release_irq_state(env->cur_state, st->ref_obj_id); 1206 WARN_ON_ONCE(err && err != -EACCES); ··· 1428 dst->active_preempt_locks = src->active_preempt_locks; 1429 dst->active_rcu_lock = src->active_rcu_lock; 1430 dst->active_irq_id = src->active_irq_id; 1431 return 0; 1432 } 1433 ··· 1529 s->ptr = ptr; 1530 1531 state->active_locks++; 1532 return 0; 1533 } 1534 ··· 1581 1582 static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) 1583 { 1584 int i; 1585 1586 for (i = 0; i < state->acquired_refs; i++) { 1587 - if (state->refs[i].type != type) 1588 - continue; 1589 - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { 1590 release_reference_state(state, i); 1591 state->active_locks--; 1592 return 0; 1593 } 1594 } 1595 return -EINVAL; ··· 1633 for (i = 0; i < state->acquired_refs; i++) { 1634 struct bpf_reference_state *s = &state->refs[i]; 1635 1636 - if (s->type != type) 1637 continue; 1638 1639 if (s->id == id && s->ptr == ptr) ··· 8240 return err; 8241 } 8242 8243 /* Implementation details: 8244 * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. 8245 * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. ··· 8268 * env->cur_state->active_locks remembers which map value element or allocated 8269 * object got locked and clears it after bpf_spin_unlock. 8270 */ 8271 - static int process_spin_lock(struct bpf_verifier_env *env, int regno, 8272 - bool is_lock) 8273 { 8274 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8275 struct bpf_verifier_state *cur = env->cur_state; 8276 bool is_const = tnum_is_const(reg->var_off); 8277 u64 val = reg->var_off.value; 8278 struct bpf_map *map = NULL; 8279 struct btf *btf = NULL; 8280 struct btf_record *rec; 8281 int err; 8282 8283 if (!is_const) { 8284 verbose(env, 8285 - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", 8286 - regno); 8287 return -EINVAL; 8288 } 8289 if (reg->type == PTR_TO_MAP_VALUE) { 8290 map = reg->map_ptr; 8291 if (!map->btf) { 8292 verbose(env, 8293 - "map '%s' has to have BTF in order to use bpf_spin_lock\n", 8294 - map->name); 8295 return -EINVAL; 8296 } 8297 } else { ··· 8302 } 8303 8304 rec = reg_btf_record(reg); 8305 - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { 8306 - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", 8307 - map ? 
map->name : "kptr"); 8308 return -EINVAL; 8309 } 8310 - if (rec->spin_lock_off != val + reg->off) { 8311 - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", 8312 - val + reg->off, rec->spin_lock_off); 8313 return -EINVAL; 8314 } 8315 if (is_lock) { 8316 void *ptr; 8317 8318 if (map) 8319 ptr = map; 8320 else 8321 ptr = btf; 8322 8323 - if (cur->active_locks) { 8324 - verbose(env, 8325 - "Locking two bpf_spin_locks are not allowed\n"); 8326 - return -EINVAL; 8327 } 8328 - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); 8329 if (err < 0) { 8330 verbose(env, "Failed to acquire lock state\n"); 8331 return err; 8332 } 8333 } else { 8334 void *ptr; 8335 8336 if (map) 8337 ptr = map; ··· 8356 ptr = btf; 8357 8358 if (!cur->active_locks) { 8359 - verbose(env, "bpf_spin_unlock without taking a lock\n"); 8360 return -EINVAL; 8361 } 8362 8363 - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { 8364 - verbose(env, "bpf_spin_unlock of different lock\n"); 8365 return -EINVAL; 8366 } 8367 ··· 9701 return -EACCES; 9702 } 9703 if (meta->func_id == BPF_FUNC_spin_lock) { 9704 - err = process_spin_lock(env, regno, true); 9705 if (err) 9706 return err; 9707 } else if (meta->func_id == BPF_FUNC_spin_unlock) { 9708 - err = process_spin_lock(env, regno, false); 9709 if (err) 9710 return err; 9711 } else { ··· 11587 regs[BPF_REG_0].map_uid = meta.map_uid; 11588 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; 11589 if (!type_may_be_null(ret_flag) && 11590 - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { 11591 regs[BPF_REG_0].id = ++env->id_gen; 11592 } 11593 break; ··· 11759 /* mark_btf_func_reg_size() is used when the reg size is determined by 11760 * the BTF func_proto's return value size and argument. 
11761 */ 11762 - static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 11763 - size_t reg_size) 11764 { 11765 - struct bpf_reg_state *reg = &cur_regs(env)[regno]; 11766 11767 if (regno == BPF_REG_0) { 11768 /* Function return value */ ··· 11778 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); 11779 } 11780 } 11781 } 11782 11783 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) ··· 11923 KF_ARG_RB_ROOT_ID, 11924 KF_ARG_RB_NODE_ID, 11925 KF_ARG_WORKQUEUE_ID, 11926 }; 11927 11928 BTF_ID_LIST(kf_arg_btf_ids) ··· 11933 BTF_ID(struct, bpf_rb_root) 11934 BTF_ID(struct, bpf_rb_node) 11935 BTF_ID(struct, bpf_wq) 11936 11937 static bool __is_kfunc_ptr_arg_type(const struct btf *btf, 11938 const struct btf_param *arg, int type) ··· 11980 static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) 11981 { 11982 return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); 11983 } 11984 11985 static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, ··· 12058 KF_ARG_PTR_TO_MAP, 12059 KF_ARG_PTR_TO_WORKQUEUE, 12060 KF_ARG_PTR_TO_IRQ_FLAG, 12061 }; 12062 12063 enum special_kfunc_type { ··· 12097 KF_bpf_iter_num_destroy, 12098 KF_bpf_set_dentry_xattr, 12099 KF_bpf_remove_dentry_xattr, 12100 }; 12101 12102 BTF_SET_START(special_kfunc_set) ··· 12190 BTF_ID_UNUSED 12191 BTF_ID_UNUSED 12192 #endif 12193 12194 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) 12195 { ··· 12286 12287 if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) 12288 return KF_ARG_PTR_TO_IRQ_FLAG; 12289 12290 if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { 12291 if (!btf_type_is_struct(ref_t)) { ··· 12397 struct bpf_kfunc_call_arg_meta *meta) 12398 { 12399 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 12400 bool irq_save; 12401 - int err; 12402 12403 - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { 12404 irq_save = true; 12405 - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { 12406 irq_save = false; 12407 } else { 12408 verbose(env, "verifier internal error: unknown irq flags kfunc\n"); 12409 return -EFAULT; ··· 12425 if (err) 12426 return err; 12427 12428 - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); 12429 if (err) 12430 return err; 12431 } else { ··· 12439 if (err) 12440 return err; 12441 12442 - err = unmark_stack_slot_irq_flag(env, reg); 12443 if (err) 12444 return err; 12445 } ··· 12566 12567 if (!env->cur_state->active_locks) 12568 return -EINVAL; 12569 - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); 12570 if (!s) { 12571 verbose(env, "held lock and object are not in the same allocation\n"); 12572 return -EINVAL; ··· 12602 btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; 12603 } 12604 12605 static bool kfunc_spin_allowed(u32 btf_id) 12606 { 12607 - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); 12608 } 12609 12610 static bool is_sync_callback_calling_kfunc(u32 btf_id) ··· 13045 case KF_ARG_PTR_TO_CONST_STR: 13046 case KF_ARG_PTR_TO_WORKQUEUE: 13047 case KF_ARG_PTR_TO_IRQ_FLAG: 13048 break; 13049 default: 13050 WARN_ON_ONCE(1); ··· 13344 if (ret < 0) 13345 return ret; 13346 break; 13347 } 13348 } 13349 ··· 13450 insn_aux = &env->insn_aux_data[insn_idx]; 13451 13452 insn_aux->is_iter_next = is_iter_next_kfunc(&meta); 13453 13454 if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { 13455 verbose(env, "destructive kfunc calls require 
CAP_SYS_BOOT capability\n"); ··· 13648 13649 if (btf_type_is_scalar(t)) { 13650 mark_reg_unknown(env, regs, BPF_REG_0); 13651 mark_btf_func_reg_size(env, BPF_REG_0, t->size); 13652 } else if (btf_type_is_ptr(t)) { 13653 ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); ··· 18586 case STACK_IRQ_FLAG: 18587 old_reg = &old->stack[spi].spilled_ptr; 18588 cur_reg = &cur->stack[spi].spilled_ptr; 18589 - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) 18590 return false; 18591 break; 18592 case STACK_MISC: ··· 18622 if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) 18623 return false; 18624 18625 for (i = 0; i < old->acquired_refs; i++) { 18626 if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || 18627 old->refs[i].type != cur->refs[i].type) ··· 18635 case REF_TYPE_IRQ: 18636 break; 18637 case REF_TYPE_LOCK: 18638 if (old->refs[i].ptr != cur->refs[i].ptr) 18639 return false; 18640 break; ··· 19922 } 19923 } 19924 19925 - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 19926 if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { 19927 verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); 19928 return -EINVAL;
··· 456 457 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) 458 { 459 + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); 460 } 461 462 static bool type_is_rdonly_mem(u32 type) ··· 1155 1156 static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, 1157 struct bpf_kfunc_call_arg_meta *meta, 1158 + struct bpf_reg_state *reg, int insn_idx, 1159 + int kfunc_class) 1160 { 1161 struct bpf_func_state *state = func(env, reg); 1162 struct bpf_stack_state *slot; ··· 1177 st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ 1178 st->live |= REG_LIVE_WRITTEN; 1179 st->ref_obj_id = id; 1180 + st->irq.kfunc_class = kfunc_class; 1181 1182 for (i = 0; i < BPF_REG_SIZE; i++) 1183 slot->slot_type[i] = STACK_IRQ_FLAG; ··· 1185 return 0; 1186 } 1187 1188 + static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, 1189 + int kfunc_class) 1190 { 1191 struct bpf_func_state *state = func(env, reg); 1192 struct bpf_stack_state *slot; ··· 1198 1199 slot = &state->stack[spi]; 1200 st = &slot->spilled_ptr; 1201 + 1202 + if (st->irq.kfunc_class != kfunc_class) { 1203 + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; 1204 + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; 1205 + 1206 + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", 1207 + flag_kfunc, used_kfunc); 1208 + return -EINVAL; 1209 + } 1210 1211 err = release_irq_state(env->cur_state, st->ref_obj_id); 1212 WARN_ON_ONCE(err && err != -EACCES); ··· 1416 dst->active_preempt_locks = src->active_preempt_locks; 1417 dst->active_rcu_lock = src->active_rcu_lock; 1418 dst->active_irq_id = src->active_irq_id; 1419 + dst->active_lock_id = src->active_lock_id; 1420 + dst->active_lock_ptr = src->active_lock_ptr; 1421 return 0; 1422 } 1423 ··· 1515 s->ptr = ptr; 1516 1517 state->active_locks++; 1518 + state->active_lock_id = id; 1519 + state->active_lock_ptr = ptr; 1520 return 0; 1521 } 1522 ··· 1565 1566 static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) 1567 { 1568 + void *prev_ptr = NULL; 1569 + u32 prev_id = 0; 1570 int i; 1571 1572 for (i = 0; i < state->acquired_refs; i++) { 1573 + if (state->refs[i].type == type && state->refs[i].id == id && 1574 + state->refs[i].ptr == ptr) { 1575 release_reference_state(state, i); 1576 state->active_locks--; 1577 + /* Reassign active lock (id, ptr). */ 1578 + state->active_lock_id = prev_id; 1579 + state->active_lock_ptr = prev_ptr; 1580 return 0; 1581 + } 1582 + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { 1583 + prev_id = state->refs[i].id; 1584 + prev_ptr = state->refs[i].ptr; 1585 } 1586 } 1587 return -EINVAL; ··· 1609 for (i = 0; i < state->acquired_refs; i++) { 1610 struct bpf_reference_state *s = &state->refs[i]; 1611 1612 + if (!(s->type & type)) 1613 continue; 1614 1615 if (s->id == id && s->ptr == ptr) ··· 8216 return err; 8217 } 8218 8219 + enum { 8220 + PROCESS_SPIN_LOCK = (1 << 0), 8221 + PROCESS_RES_LOCK = (1 << 1), 8222 + PROCESS_LOCK_IRQ = (1 << 2), 8223 + }; 8224 + 8225 /* Implementation details: 8226 * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. 8227 * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. ··· 8238 * env->cur_state->active_locks remembers which map value element or allocated 8239 * object got locked and clears it after bpf_spin_unlock. 
8240 */ 8241 + static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) 8242 { 8243 + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; 8244 + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; 8245 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8246 struct bpf_verifier_state *cur = env->cur_state; 8247 bool is_const = tnum_is_const(reg->var_off); 8248 + bool is_irq = flags & PROCESS_LOCK_IRQ; 8249 u64 val = reg->var_off.value; 8250 struct bpf_map *map = NULL; 8251 struct btf *btf = NULL; 8252 struct btf_record *rec; 8253 + u32 spin_lock_off; 8254 int err; 8255 8256 if (!is_const) { 8257 verbose(env, 8258 + "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", 8259 + regno, lock_str); 8260 return -EINVAL; 8261 } 8262 if (reg->type == PTR_TO_MAP_VALUE) { 8263 map = reg->map_ptr; 8264 if (!map->btf) { 8265 verbose(env, 8266 + "map '%s' has to have BTF in order to use %s_lock\n", 8267 + map->name, lock_str); 8268 return -EINVAL; 8269 } 8270 } else { ··· 8269 } 8270 8271 rec = reg_btf_record(reg); 8272 + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { 8273 + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", 8274 + map ? map->name : "kptr", lock_str); 8275 return -EINVAL; 8276 } 8277 + spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; 8278 + if (spin_lock_off != val + reg->off) { 8279 + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", 8280 + val + reg->off, lock_str, spin_lock_off); 8281 return -EINVAL; 8282 } 8283 if (is_lock) { 8284 void *ptr; 8285 + int type; 8286 8287 if (map) 8288 ptr = map; 8289 else 8290 ptr = btf; 8291 8292 + if (!is_res_lock && cur->active_locks) { 8293 + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { 8294 + verbose(env, 8295 + "Locking two bpf_spin_locks are not allowed\n"); 8296 + return -EINVAL; 8297 + } 8298 + } else if (is_res_lock && cur->active_locks) { 8299 + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { 8300 + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); 8301 + return -EINVAL; 8302 + } 8303 } 8304 + 8305 + if (is_res_lock && is_irq) 8306 + type = REF_TYPE_RES_LOCK_IRQ; 8307 + else if (is_res_lock) 8308 + type = REF_TYPE_RES_LOCK; 8309 + else 8310 + type = REF_TYPE_LOCK; 8311 + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); 8312 if (err < 0) { 8313 verbose(env, "Failed to acquire lock state\n"); 8314 return err; 8315 } 8316 } else { 8317 void *ptr; 8318 + int type; 8319 8320 if (map) 8321 ptr = map; ··· 8306 ptr = btf; 8307 8308 if (!cur->active_locks) { 8309 + verbose(env, "%s_unlock without taking a lock\n", lock_str); 8310 return -EINVAL; 8311 } 8312 8313 + if (is_res_lock && is_irq) 8314 + type = REF_TYPE_RES_LOCK_IRQ; 8315 + else if (is_res_lock) 8316 + type = REF_TYPE_RES_LOCK; 8317 + else 8318 + type = REF_TYPE_LOCK; 8319 + if (!find_lock_state(cur, type, reg->id, ptr)) { 8320 + verbose(env, "%s_unlock of different lock\n", lock_str); 8321 + return -EINVAL; 8322 + } 8323 + if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) { 8324 + verbose(env, "%s_unlock cannot be out of order\n", lock_str); 8325 + return -EINVAL; 8326 + } 8327 + if (release_lock_state(cur, type, reg->id, ptr)) { 8328 + verbose(env, "%s_unlock of different lock\n", lock_str); 8329 return -EINVAL; 8330 } 8331 ··· 9637 return -EACCES; 9638 } 9639 
if (meta->func_id == BPF_FUNC_spin_lock) { 9640 + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); 9641 if (err) 9642 return err; 9643 } else if (meta->func_id == BPF_FUNC_spin_unlock) { 9644 + err = process_spin_lock(env, regno, 0); 9645 if (err) 9646 return err; 9647 } else { ··· 11523 regs[BPF_REG_0].map_uid = meta.map_uid; 11524 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; 11525 if (!type_may_be_null(ret_flag) && 11526 + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { 11527 regs[BPF_REG_0].id = ++env->id_gen; 11528 } 11529 break; ··· 11695 /* mark_btf_func_reg_size() is used when the reg size is determined by 11696 * the BTF func_proto's return value size and argument. 11697 */ 11698 + static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, 11699 + u32 regno, size_t reg_size) 11700 { 11701 + struct bpf_reg_state *reg = &regs[regno]; 11702 11703 if (regno == BPF_REG_0) { 11704 /* Function return value */ ··· 11714 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); 11715 } 11716 } 11717 + } 11718 + 11719 + static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 11720 + size_t reg_size) 11721 + { 11722 + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); 11723 } 11724 11725 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) ··· 11853 KF_ARG_RB_ROOT_ID, 11854 KF_ARG_RB_NODE_ID, 11855 KF_ARG_WORKQUEUE_ID, 11856 + KF_ARG_RES_SPIN_LOCK_ID, 11857 }; 11858 11859 BTF_ID_LIST(kf_arg_btf_ids) ··· 11862 BTF_ID(struct, bpf_rb_root) 11863 BTF_ID(struct, bpf_rb_node) 11864 BTF_ID(struct, bpf_wq) 11865 + BTF_ID(struct, bpf_res_spin_lock) 11866 11867 static bool __is_kfunc_ptr_arg_type(const struct btf *btf, 11868 const struct btf_param *arg, int type) ··· 11908 static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) 11909 { 11910 return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); 11911 + } 11912 + 11913 + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) 11914 + { 11915 + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); 11916 } 11917 11918 static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, ··· 11981 KF_ARG_PTR_TO_MAP, 11982 KF_ARG_PTR_TO_WORKQUEUE, 11983 KF_ARG_PTR_TO_IRQ_FLAG, 11984 + KF_ARG_PTR_TO_RES_SPIN_LOCK, 11985 }; 11986 11987 enum special_kfunc_type { ··· 12019 KF_bpf_iter_num_destroy, 12020 KF_bpf_set_dentry_xattr, 12021 KF_bpf_remove_dentry_xattr, 12022 + KF_bpf_res_spin_lock, 12023 + KF_bpf_res_spin_unlock, 12024 + KF_bpf_res_spin_lock_irqsave, 12025 + KF_bpf_res_spin_unlock_irqrestore, 12026 }; 12027 12028 BTF_SET_START(special_kfunc_set) ··· 12108 BTF_ID_UNUSED 12109 BTF_ID_UNUSED 12110 #endif 12111 + BTF_ID(func, bpf_res_spin_lock) 12112 + BTF_ID(func, bpf_res_spin_unlock) 12113 + BTF_ID(func, bpf_res_spin_lock_irqsave) 12114 + BTF_ID(func, bpf_res_spin_unlock_irqrestore) 12115 12116 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) 12117 { ··· 12200 12201 if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) 12202 return KF_ARG_PTR_TO_IRQ_FLAG; 12203 + 12204 + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) 12205 + return KF_ARG_PTR_TO_RES_SPIN_LOCK; 12206 12207 if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { 12208 if (!btf_type_is_struct(ref_t)) { ··· 12308 struct bpf_kfunc_call_arg_meta *meta) 12309 { 12310 struct bpf_reg_state *regs = cur_regs(env), 
*reg = &regs[regno]; 12311 + int err, kfunc_class = IRQ_NATIVE_KFUNC; 12312 bool irq_save; 12313 12314 + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || 12315 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { 12316 irq_save = true; 12317 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) 12318 + kfunc_class = IRQ_LOCK_KFUNC; 12319 + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || 12320 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { 12321 irq_save = false; 12322 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) 12323 + kfunc_class = IRQ_LOCK_KFUNC; 12324 } else { 12325 verbose(env, "verifier internal error: unknown irq flags kfunc\n"); 12326 return -EFAULT; ··· 12330 if (err) 12331 return err; 12332 12333 + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); 12334 if (err) 12335 return err; 12336 } else { ··· 12344 if (err) 12345 return err; 12346 12347 + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); 12348 if (err) 12349 return err; 12350 } ··· 12471 12472 if (!env->cur_state->active_locks) 12473 return -EINVAL; 12474 + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); 12475 if (!s) { 12476 verbose(env, "held lock and object are not in the same allocation\n"); 12477 return -EINVAL; ··· 12507 btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; 12508 } 12509 12510 + static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) 12511 + { 12512 + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || 12513 + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || 12514 + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || 12515 + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; 12516 + } 12517 + 12518 static bool kfunc_spin_allowed(u32 btf_id) 12519 { 12520 + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || 12521 + is_bpf_res_spin_lock_kfunc(btf_id); 12522 } 12523 12524 static bool is_sync_callback_calling_kfunc(u32 btf_id) ··· 12941 case KF_ARG_PTR_TO_CONST_STR: 12942 case KF_ARG_PTR_TO_WORKQUEUE: 12943 case KF_ARG_PTR_TO_IRQ_FLAG: 12944 + case KF_ARG_PTR_TO_RES_SPIN_LOCK: 12945 break; 12946 default: 12947 WARN_ON_ONCE(1); ··· 13239 if (ret < 0) 13240 return ret; 13241 break; 13242 + case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13243 + { 13244 + int flags = PROCESS_RES_LOCK; 13245 + 13246 + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { 13247 + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); 13248 + return -EINVAL; 13249 + } 13250 + 13251 + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) 13252 + return -EFAULT; 13253 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || 13254 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) 13255 + flags |= PROCESS_SPIN_LOCK; 13256 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || 13257 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) 13258 + flags |= PROCESS_LOCK_IRQ; 13259 + ret = process_spin_lock(env, regno, flags); 13260 + if (ret < 0) 13261 + return ret; 13262 + break; 13263 + } 13264 } 13265 } 13266 ··· 13323 insn_aux = &env->insn_aux_data[insn_idx]; 13324 13325 insn_aux->is_iter_next = is_iter_next_kfunc(&meta); 13326 + 13327 + if (!insn->off && 13328 + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || 13329 + insn->imm == 
special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { 13330 + struct bpf_verifier_state *branch; 13331 + struct bpf_reg_state *regs; 13332 + 13333 + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); 13334 + if (!branch) { 13335 + verbose(env, "failed to push state for failed lock acquisition\n"); 13336 + return -ENOMEM; 13337 + } 13338 + 13339 + regs = branch->frame[branch->curframe]->regs; 13340 + 13341 + /* Clear r0-r5 registers in forked state */ 13342 + for (i = 0; i < CALLER_SAVED_REGS; i++) 13343 + mark_reg_not_init(env, regs, caller_saved[i]); 13344 + 13345 + mark_reg_unknown(env, regs, BPF_REG_0); 13346 + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); 13347 + if (err) { 13348 + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); 13349 + return err; 13350 + } 13351 + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); 13352 + } 13353 13354 if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { 13355 verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); ··· 13494 13495 if (btf_type_is_scalar(t)) { 13496 mark_reg_unknown(env, regs, BPF_REG_0); 13497 + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || 13498 + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) 13499 + __mark_reg_const_zero(env, &regs[BPF_REG_0]); 13500 mark_btf_func_reg_size(env, BPF_REG_0, t->size); 13501 } else if (btf_type_is_ptr(t)) { 13502 ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); ··· 18429 case STACK_IRQ_FLAG: 18430 old_reg = &old->stack[spi].spilled_ptr; 18431 cur_reg = &cur->stack[spi].spilled_ptr; 18432 + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || 18433 + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) 18434 return false; 18435 break; 18436 case STACK_MISC: ··· 18464 if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) 18465 return false; 18466 18467 + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || 18468 + old->active_lock_ptr != cur->active_lock_ptr) 18469 + return false; 18470 + 18471 for (i = 0; i < old->acquired_refs; i++) { 18472 if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || 18473 old->refs[i].type != cur->refs[i].type) ··· 18473 case REF_TYPE_IRQ: 18474 break; 18475 case REF_TYPE_LOCK: 18476 + case REF_TYPE_RES_LOCK: 18477 + case REF_TYPE_RES_LOCK_IRQ: 18478 if (old->refs[i].ptr != cur->refs[i].ptr) 18479 return false; 18480 break; ··· 19758 } 19759 } 19760 19761 + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { 19762 if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { 19763 verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); 19764 return -EINVAL;
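Taken together, the verifier changes impose three rules on the new kfuncs: the call site of bpf_res_spin_lock()/bpf_res_spin_lock_irqsave() forks a failure path with R0 in [-MAX_ERRNO, -1] in which no lock is held, so the result must be checked before touching protected data or unlocking; taking a lock already held by the program is rejected as an AA deadlock; and locks must be released in reverse acquisition order. A sketch of a program shape that satisfies these rules; the kfunc declarations, map, and section name are assumptions for illustration, not code from this series' selftests:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* Assumed declarations matching the kernel-side kfunc definitions above. */
extern int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym;
extern void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym;

struct val_t {
	struct bpf_res_spin_lock lock;	/* allowed in map values by the syscall.c changes */
	long counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct val_t);
} counters SEC(".maps");		/* hypothetical map */

SEC("tc")
int bump_counter(struct __sk_buff *ctx)
{
	int key = 0;
	struct val_t *v = bpf_map_lookup_elem(&counters, &key);

	if (!v)
		return 0;
	if (bpf_res_spin_lock(&v->lock))	/* verifier also explores the failure branch */
		return 0;
	v->counter++;
	bpf_res_spin_unlock(&v->lock);		/* most recently acquired lock first */
	return 0;
}

char _license[] SEC("license") = "GPL";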
+5
kernel/locking/lock_events_list.h
··· 50 #endif /* CONFIG_QUEUED_SPINLOCKS */ 51 52 /* 53 * Locking events for rwsem 54 */ 55 LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
··· 50 #endif /* CONFIG_QUEUED_SPINLOCKS */ 51 52 /* 53 + * Locking events for Resilient Queued Spin Lock 54 + */ 55 + LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ 56 + 57 + /* 58 * Locking events for rwsem 59 */ 60 LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+57
kernel/locking/locktorture.c
··· 362 .name = "raw_spin_lock_irq" 363 }; 364 365 static DEFINE_RWLOCK(torture_rwlock); 366 367 static int torture_rwlock_write_lock(int tid __maybe_unused) ··· 1222 &lock_busted_ops, 1223 &spin_lock_ops, &spin_lock_irq_ops, 1224 &raw_spin_lock_ops, &raw_spin_lock_irq_ops, 1225 &rw_lock_ops, &rw_lock_irq_ops, 1226 &mutex_lock_ops, 1227 &ww_mutex_lock_ops,
··· 362 .name = "raw_spin_lock_irq" 363 }; 364 365 + #ifdef CONFIG_BPF_SYSCALL 366 + 367 + #include <asm/rqspinlock.h> 368 + static rqspinlock_t rqspinlock; 369 + 370 + static int torture_raw_res_spin_write_lock(int tid __maybe_unused) 371 + { 372 + raw_res_spin_lock(&rqspinlock); 373 + return 0; 374 + } 375 + 376 + static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) 377 + { 378 + raw_res_spin_unlock(&rqspinlock); 379 + } 380 + 381 + static struct lock_torture_ops raw_res_spin_lock_ops = { 382 + .writelock = torture_raw_res_spin_write_lock, 383 + .write_delay = torture_spin_lock_write_delay, 384 + .task_boost = torture_rt_boost, 385 + .writeunlock = torture_raw_res_spin_write_unlock, 386 + .readlock = NULL, 387 + .read_delay = NULL, 388 + .readunlock = NULL, 389 + .name = "raw_res_spin_lock" 390 + }; 391 + 392 + static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) 393 + { 394 + unsigned long flags; 395 + 396 + raw_res_spin_lock_irqsave(&rqspinlock, flags); 397 + cxt.cur_ops->flags = flags; 398 + return 0; 399 + } 400 + 401 + static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) 402 + { 403 + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); 404 + } 405 + 406 + static struct lock_torture_ops raw_res_spin_lock_irq_ops = { 407 + .writelock = torture_raw_res_spin_write_lock_irq, 408 + .write_delay = torture_spin_lock_write_delay, 409 + .task_boost = torture_rt_boost, 410 + .writeunlock = torture_raw_res_spin_write_unlock_irq, 411 + .readlock = NULL, 412 + .read_delay = NULL, 413 + .readunlock = NULL, 414 + .name = "raw_res_spin_lock_irq" 415 + }; 416 + 417 + #endif 418 + 419 static DEFINE_RWLOCK(torture_rwlock); 420 421 static int torture_rwlock_write_lock(int tid __maybe_unused) ··· 1168 &lock_busted_ops, 1169 &spin_lock_ops, &spin_lock_irq_ops, 1170 &raw_spin_lock_ops, &raw_spin_lock_irq_ops, 1171 + #ifdef CONFIG_BPF_SYSCALL 1172 + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, 1173 + #endif 1174 &rw_lock_ops, &rw_lock_irq_ops, 1175 &mutex_lock_ops, 1176 &ww_mutex_lock_ops,
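With these ops registered, the new slow paths can be exercised through locktorture's existing torture_type module parameter, e.g. torture_type=raw_res_spin_lock or torture_type=raw_res_spin_lock_irq (the parameter itself predates this diff; only the two op names are new).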
+1 -9
kernel/locking/mcs_spinlock.h
··· 15 16 #include <asm/mcs_spinlock.h> 17 18 - struct mcs_spinlock { 19 - struct mcs_spinlock *next; 20 - int locked; /* 1 if lock acquired */ 21 - int count; /* nesting count, see qspinlock.c */ 22 - }; 23 - 24 #ifndef arch_mcs_spin_lock_contended 25 /* 26 * Using smp_cond_load_acquire() provides the acquire semantics ··· 24 * spinning, and smp_cond_load_acquire() provides that behavior. 25 */ 26 #define arch_mcs_spin_lock_contended(l) \ 27 - do { \ 28 - smp_cond_load_acquire(l, VAL); \ 29 - } while (0) 30 #endif 31 32 #ifndef arch_mcs_spin_unlock_contended
··· 15 16 #include <asm/mcs_spinlock.h> 17 18 #ifndef arch_mcs_spin_lock_contended 19 /* 20 * Using smp_cond_load_acquire() provides the acquire semantics ··· 30 * spinning, and smp_cond_load_acquire() provides that behavior. 31 */ 32 #define arch_mcs_spin_lock_contended(l) \ 33 + smp_cond_load_acquire(l, VAL) 34 #endif 35 36 #ifndef arch_mcs_spin_unlock_contended
+5 -188
kernel/locking/qspinlock.c
··· 25 #include <trace/events/lock.h> 26 27 /* 28 - * Include queued spinlock statistics code 29 */ 30 #include "qspinlock_stat.h" 31 32 /* ··· 68 */ 69 70 #include "mcs_spinlock.h" 71 - #define MAX_NODES 4 72 - 73 - /* 74 - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in 75 - * size and four of them will fit nicely in one 64-byte cacheline. For 76 - * pvqspinlock, however, we need more space for extra data. To accommodate 77 - * that, we insert two more long words to pad it up to 32 bytes. IOW, only 78 - * two of them can fit in a cacheline in this case. That is OK as it is rare 79 - * to have more than 2 levels of slowpath nesting in actual use. We don't 80 - * want to penalize pvqspinlocks to optimize for a rare case in native 81 - * qspinlocks. 82 - */ 83 - struct qnode { 84 - struct mcs_spinlock mcs; 85 - #ifdef CONFIG_PARAVIRT_SPINLOCKS 86 - long reserved[2]; 87 - #endif 88 - }; 89 - 90 - /* 91 - * The pending bit spinning loop count. 92 - * This heuristic is used to limit the number of lockword accesses 93 - * made by atomic_cond_read_relaxed when waiting for the lock to 94 - * transition out of the "== _Q_PENDING_VAL" state. We don't spin 95 - * indefinitely because there's no guarantee that we'll make forward 96 - * progress. 97 - */ 98 - #ifndef _Q_PENDING_LOOPS 99 - #define _Q_PENDING_LOOPS 1 100 - #endif 101 102 /* 103 * Per-CPU queue node structures; we can never have more than 4 nested ··· 77 * 78 * PV doubles the storage and uses the second cacheline for PV state. 79 */ 80 - static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); 81 - 82 - /* 83 - * We must be able to distinguish between no-tail and the tail at 0:0, 84 - * therefore increment the cpu number by one. 85 - */ 86 - 87 - static inline __pure u32 encode_tail(int cpu, int idx) 88 - { 89 - u32 tail; 90 - 91 - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 92 - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 93 - 94 - return tail; 95 - } 96 - 97 - static inline __pure struct mcs_spinlock *decode_tail(u32 tail) 98 - { 99 - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 100 - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 101 - 102 - return per_cpu_ptr(&qnodes[idx].mcs, cpu); 103 - } 104 - 105 - static inline __pure 106 - struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) 107 - { 108 - return &((struct qnode *)base + idx)->mcs; 109 - } 110 - 111 - #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 112 - 113 - #if _Q_PENDING_BITS == 8 114 - /** 115 - * clear_pending - clear the pending bit. 116 - * @lock: Pointer to queued spinlock structure 117 - * 118 - * *,1,* -> *,0,* 119 - */ 120 - static __always_inline void clear_pending(struct qspinlock *lock) 121 - { 122 - WRITE_ONCE(lock->pending, 0); 123 - } 124 - 125 - /** 126 - * clear_pending_set_locked - take ownership and clear the pending bit. 127 - * @lock: Pointer to queued spinlock structure 128 - * 129 - * *,1,0 -> *,0,1 130 - * 131 - * Lock stealing is not allowed if this function is used. 
132 - */ 133 - static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 134 - { 135 - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 136 - } 137 - 138 - /* 139 - * xchg_tail - Put in the new queue tail code word & retrieve previous one 140 - * @lock : Pointer to queued spinlock structure 141 - * @tail : The new queue tail code word 142 - * Return: The previous queue tail code word 143 - * 144 - * xchg(lock, tail), which heads an address dependency 145 - * 146 - * p,*,* -> n,*,* ; prev = xchg(lock, node) 147 - */ 148 - static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 149 - { 150 - /* 151 - * We can use relaxed semantics since the caller ensures that the 152 - * MCS node is properly initialized before updating the tail. 153 - */ 154 - return (u32)xchg_relaxed(&lock->tail, 155 - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 156 - } 157 - 158 - #else /* _Q_PENDING_BITS == 8 */ 159 - 160 - /** 161 - * clear_pending - clear the pending bit. 162 - * @lock: Pointer to queued spinlock structure 163 - * 164 - * *,1,* -> *,0,* 165 - */ 166 - static __always_inline void clear_pending(struct qspinlock *lock) 167 - { 168 - atomic_andnot(_Q_PENDING_VAL, &lock->val); 169 - } 170 - 171 - /** 172 - * clear_pending_set_locked - take ownership and clear the pending bit. 173 - * @lock: Pointer to queued spinlock structure 174 - * 175 - * *,1,0 -> *,0,1 176 - */ 177 - static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 178 - { 179 - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); 180 - } 181 - 182 - /** 183 - * xchg_tail - Put in the new queue tail code word & retrieve previous one 184 - * @lock : Pointer to queued spinlock structure 185 - * @tail : The new queue tail code word 186 - * Return: The previous queue tail code word 187 - * 188 - * xchg(lock, tail) 189 - * 190 - * p,*,* -> n,*,* ; prev = xchg(lock, node) 191 - */ 192 - static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 193 - { 194 - u32 old, new; 195 - 196 - old = atomic_read(&lock->val); 197 - do { 198 - new = (old & _Q_LOCKED_PENDING_MASK) | tail; 199 - /* 200 - * We can use relaxed semantics since the caller ensures that 201 - * the MCS node is properly initialized before updating the 202 - * tail. 203 - */ 204 - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 205 - 206 - return old; 207 - } 208 - #endif /* _Q_PENDING_BITS == 8 */ 209 - 210 - /** 211 - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending 212 - * @lock : Pointer to queued spinlock structure 213 - * Return: The previous lock value 214 - * 215 - * *,*,* -> *,1,* 216 - */ 217 - #ifndef queued_fetch_set_pending_acquire 218 - static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) 219 - { 220 - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); 221 - } 222 - #endif 223 - 224 - /** 225 - * set_locked - Set the lock bit and own the lock 226 - * @lock: Pointer to queued spinlock structure 227 - * 228 - * *,*,0 -> *,0,1 229 - */ 230 - static __always_inline void set_locked(struct qspinlock *lock) 231 - { 232 - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 233 - } 234 - 235 236 /* 237 * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for ··· 227 * any MCS node. This is not the most elegant solution, but is 228 * simple enough. 229 */ 230 - if (unlikely(idx >= MAX_NODES)) { 231 lockevent_inc(lock_no_node); 232 while (!queued_spin_trylock(lock)) 233 cpu_relax(); ··· 282 * head of the waitqueue. 
283 */ 284 if (old & _Q_TAIL_MASK) { 285 - prev = decode_tail(old); 286 287 /* Link @node into the waitqueue. */ 288 WRITE_ONCE(prev->next, node);
··· 25 #include <trace/events/lock.h> 26 27 /* 28 + * Include queued spinlock definitions and statistics code 29 */ 30 + #include "qspinlock.h" 31 #include "qspinlock_stat.h" 32 33 /* ··· 67 */ 68 69 #include "mcs_spinlock.h" 70 71 /* 72 * Per-CPU queue node structures; we can never have more than 4 nested ··· 106 * 107 * PV doubles the storage and uses the second cacheline for PV state. 108 */ 109 + static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); 110 111 /* 112 * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for ··· 410 * any MCS node. This is not the most elegant solution, but is 411 * simple enough. 412 */ 413 + if (unlikely(idx >= _Q_MAX_NODES)) { 414 lockevent_inc(lock_no_node); 415 while (!queued_spin_trylock(lock)) 416 cpu_relax(); ··· 465 * head of the waitqueue. 466 */ 467 if (old & _Q_TAIL_MASK) { 468 + prev = decode_tail(old, qnodes); 469 470 /* Link @node into the waitqueue. */ 471 WRITE_ONCE(prev->next, node);
+201
kernel/locking/qspinlock.h
···
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Queued spinlock defines 4 + * 5 + * This file contains macro definitions and functions shared between different 6 + * qspinlock slow path implementations. 7 + */ 8 + #ifndef __LINUX_QSPINLOCK_H 9 + #define __LINUX_QSPINLOCK_H 10 + 11 + #include <asm-generic/percpu.h> 12 + #include <linux/percpu-defs.h> 13 + #include <asm-generic/qspinlock.h> 14 + #include <asm-generic/mcs_spinlock.h> 15 + 16 + #define _Q_MAX_NODES 4 17 + 18 + /* 19 + * The pending bit spinning loop count. 20 + * This heuristic is used to limit the number of lockword accesses 21 + * made by atomic_cond_read_relaxed when waiting for the lock to 22 + * transition out of the "== _Q_PENDING_VAL" state. We don't spin 23 + * indefinitely because there's no guarantee that we'll make forward 24 + * progress. 25 + */ 26 + #ifndef _Q_PENDING_LOOPS 27 + #define _Q_PENDING_LOOPS 1 28 + #endif 29 + 30 + /* 31 + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in 32 + * size and four of them will fit nicely in one 64-byte cacheline. For 33 + * pvqspinlock, however, we need more space for extra data. To accommodate 34 + * that, we insert two more long words to pad it up to 32 bytes. IOW, only 35 + * two of them can fit in a cacheline in this case. That is OK as it is rare 36 + * to have more than 2 levels of slowpath nesting in actual use. We don't 37 + * want to penalize pvqspinlocks to optimize for a rare case in native 38 + * qspinlocks. 39 + */ 40 + struct qnode { 41 + struct mcs_spinlock mcs; 42 + #ifdef CONFIG_PARAVIRT_SPINLOCKS 43 + long reserved[2]; 44 + #endif 45 + }; 46 + 47 + /* 48 + * We must be able to distinguish between no-tail and the tail at 0:0, 49 + * therefore increment the cpu number by one. 50 + */ 51 + 52 + static inline __pure u32 encode_tail(int cpu, int idx) 53 + { 54 + u32 tail; 55 + 56 + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 57 + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 58 + 59 + return tail; 60 + } 61 + 62 + static inline __pure struct mcs_spinlock *decode_tail(u32 tail, 63 + struct qnode __percpu *qnodes) 64 + { 65 + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 66 + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 67 + 68 + return per_cpu_ptr(&qnodes[idx].mcs, cpu); 69 + } 70 + 71 + static inline __pure 72 + struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) 73 + { 74 + return &((struct qnode *)base + idx)->mcs; 75 + } 76 + 77 + #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 78 + 79 + #if _Q_PENDING_BITS == 8 80 + /** 81 + * clear_pending - clear the pending bit. 82 + * @lock: Pointer to queued spinlock structure 83 + * 84 + * *,1,* -> *,0,* 85 + */ 86 + static __always_inline void clear_pending(struct qspinlock *lock) 87 + { 88 + WRITE_ONCE(lock->pending, 0); 89 + } 90 + 91 + /** 92 + * clear_pending_set_locked - take ownership and clear the pending bit. 93 + * @lock: Pointer to queued spinlock structure 94 + * 95 + * *,1,0 -> *,0,1 96 + * 97 + * Lock stealing is not allowed if this function is used. 
98 + */ 99 + static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 100 + { 101 + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 102 + } 103 + 104 + /* 105 + * xchg_tail - Put in the new queue tail code word & retrieve previous one 106 + * @lock : Pointer to queued spinlock structure 107 + * @tail : The new queue tail code word 108 + * Return: The previous queue tail code word 109 + * 110 + * xchg(lock, tail), which heads an address dependency 111 + * 112 + * p,*,* -> n,*,* ; prev = xchg(lock, node) 113 + */ 114 + static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 115 + { 116 + /* 117 + * We can use relaxed semantics since the caller ensures that the 118 + * MCS node is properly initialized before updating the tail. 119 + */ 120 + return (u32)xchg_relaxed(&lock->tail, 121 + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 122 + } 123 + 124 + #else /* _Q_PENDING_BITS == 8 */ 125 + 126 + /** 127 + * clear_pending - clear the pending bit. 128 + * @lock: Pointer to queued spinlock structure 129 + * 130 + * *,1,* -> *,0,* 131 + */ 132 + static __always_inline void clear_pending(struct qspinlock *lock) 133 + { 134 + atomic_andnot(_Q_PENDING_VAL, &lock->val); 135 + } 136 + 137 + /** 138 + * clear_pending_set_locked - take ownership and clear the pending bit. 139 + * @lock: Pointer to queued spinlock structure 140 + * 141 + * *,1,0 -> *,0,1 142 + */ 143 + static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 144 + { 145 + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); 146 + } 147 + 148 + /** 149 + * xchg_tail - Put in the new queue tail code word & retrieve previous one 150 + * @lock : Pointer to queued spinlock structure 151 + * @tail : The new queue tail code word 152 + * Return: The previous queue tail code word 153 + * 154 + * xchg(lock, tail) 155 + * 156 + * p,*,* -> n,*,* ; prev = xchg(lock, node) 157 + */ 158 + static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 159 + { 160 + u32 old, new; 161 + 162 + old = atomic_read(&lock->val); 163 + do { 164 + new = (old & _Q_LOCKED_PENDING_MASK) | tail; 165 + /* 166 + * We can use relaxed semantics since the caller ensures that 167 + * the MCS node is properly initialized before updating the 168 + * tail. 169 + */ 170 + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 171 + 172 + return old; 173 + } 174 + #endif /* _Q_PENDING_BITS == 8 */ 175 + 176 + /** 177 + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending 178 + * @lock : Pointer to queued spinlock structure 179 + * Return: The previous lock value 180 + * 181 + * *,*,* -> *,1,* 182 + */ 183 + #ifndef queued_fetch_set_pending_acquire 184 + static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) 185 + { 186 + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); 187 + } 188 + #endif 189 + 190 + /** 191 + * set_locked - Set the lock bit and own the lock 192 + * @lock: Pointer to queued spinlock structure 193 + * 194 + * *,*,0 -> *,0,1 195 + */ 196 + static __always_inline void set_locked(struct qspinlock *lock) 197 + { 198 + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 199 + } 200 + 201 + #endif /* __LINUX_QSPINLOCK_H */
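The +1 in encode_tail() is what lets a tail value of zero mean "no waiter": CPU 0 at node index 0 would otherwise encode to 0. A stand-alone sketch of the round trip, using offsets that mirror the usual qspinlock_types.h layout (tail index in bits 16-17, tail CPU from bit 18 on configurations with fewer than 16K CPUs); the constants here are assumptions for illustration only:

/* User-space illustration of the tail encoding used by encode_tail()/decode_tail(). */
#include <assert.h>
#include <stdio.h>

#define Q_TAIL_IDX_OFFSET 16                          /* assumed layout */
#define Q_TAIL_IDX_MASK   (0x3U << Q_TAIL_IDX_OFFSET)
#define Q_TAIL_CPU_OFFSET 18

static unsigned int encode(int cpu, int idx)
{
	/* cpu is stored off by one so that tail == 0 means "no tail". */
	return ((unsigned int)(cpu + 1) << Q_TAIL_CPU_OFFSET) |
	       ((unsigned int)idx << Q_TAIL_IDX_OFFSET);
}

static void decode(unsigned int tail, int *cpu, int *idx)
{
	*cpu = (int)(tail >> Q_TAIL_CPU_OFFSET) - 1;
	*idx = (int)((tail & Q_TAIL_IDX_MASK) >> Q_TAIL_IDX_OFFSET);
}

int main(void)
{
	int cpu, idx;

	decode(encode(0, 0), &cpu, &idx);
	assert(cpu == 0 && idx == 0);
	assert(encode(0, 0) != 0);	/* distinguishable from "no tail" */
	printf("cpu0/idx0 encodes to %#x\n", encode(0, 0));
	return 0;
}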
+98
tools/testing/selftests/bpf/prog_tests/res_spin_lock.c
···
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <network_helpers.h> 5 + #include <sys/sysinfo.h> 6 + 7 + #include "res_spin_lock.skel.h" 8 + #include "res_spin_lock_fail.skel.h" 9 + 10 + void test_res_spin_lock_failure(void) 11 + { 12 + RUN_TESTS(res_spin_lock_fail); 13 + } 14 + 15 + static volatile int skip; 16 + 17 + static void *spin_lock_thread(void *arg) 18 + { 19 + int err, prog_fd = *(u32 *) arg; 20 + LIBBPF_OPTS(bpf_test_run_opts, topts, 21 + .data_in = &pkt_v4, 22 + .data_size_in = sizeof(pkt_v4), 23 + .repeat = 10000, 24 + ); 25 + 26 + while (!READ_ONCE(skip)) { 27 + err = bpf_prog_test_run_opts(prog_fd, &topts); 28 + ASSERT_OK(err, "test_run"); 29 + ASSERT_OK(topts.retval, "test_run retval"); 30 + } 31 + pthread_exit(arg); 32 + } 33 + 34 + void test_res_spin_lock_success(void) 35 + { 36 + LIBBPF_OPTS(bpf_test_run_opts, topts, 37 + .data_in = &pkt_v4, 38 + .data_size_in = sizeof(pkt_v4), 39 + .repeat = 1, 40 + ); 41 + struct res_spin_lock *skel; 42 + pthread_t thread_id[16]; 43 + int prog_fd, i, err; 44 + void *ret; 45 + 46 + if (get_nprocs() < 2) { 47 + test__skip(); 48 + return; 49 + } 50 + 51 + skel = res_spin_lock__open_and_load(); 52 + if (!ASSERT_OK_PTR(skel, "res_spin_lock__open_and_load")) 53 + return; 54 + /* AA deadlock */ 55 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test); 56 + err = bpf_prog_test_run_opts(prog_fd, &topts); 57 + ASSERT_OK(err, "error"); 58 + ASSERT_OK(topts.retval, "retval"); 59 + 60 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test_held_lock_max); 61 + err = bpf_prog_test_run_opts(prog_fd, &topts); 62 + ASSERT_OK(err, "error"); 63 + ASSERT_OK(topts.retval, "retval"); 64 + 65 + /* Multi-threaded ABBA deadlock. */ 66 + 67 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test_AB); 68 + for (i = 0; i < 16; i++) { 69 + int err; 70 + 71 + err = pthread_create(&thread_id[i], NULL, &spin_lock_thread, &prog_fd); 72 + if (!ASSERT_OK(err, "pthread_create")) 73 + goto end; 74 + } 75 + 76 + topts.retval = 0; 77 + topts.repeat = 1000; 78 + int fd = bpf_program__fd(skel->progs.res_spin_lock_test_BA); 79 + while (!topts.retval && !err && !READ_ONCE(skel->bss->err)) { 80 + err = bpf_prog_test_run_opts(fd, &topts); 81 + } 82 + 83 + WRITE_ONCE(skip, true); 84 + 85 + for (i = 0; i < 16; i++) { 86 + if (!ASSERT_OK(pthread_join(thread_id[i], &ret), "pthread_join")) 87 + goto end; 88 + if (!ASSERT_EQ(ret, &prog_fd, "ret == prog_fd")) 89 + goto end; 90 + } 91 + 92 + ASSERT_EQ(READ_ONCE(skel->bss->err), -EDEADLK, "timeout err"); 93 + ASSERT_OK(err, "err"); 94 + ASSERT_EQ(topts.retval, -EDEADLK, "timeout"); 95 + end: 96 + res_spin_lock__destroy(skel); 97 + return; 98 + }
+53
tools/testing/selftests/bpf/progs/irq.c
··· 11 extern void bpf_local_irq_restore(unsigned long *) __weak __ksym; 12 extern int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign, u64 flags) __weak __ksym; 13 14 SEC("?tc") 15 __failure __msg("arg#0 doesn't point to an irq flag on stack") 16 int irq_save_bad_arg(struct __sk_buff *ctx) ··· 510 bpf_local_irq_save(&flags); 511 global_subprog_calling_sleepable_global(0); 512 bpf_local_irq_restore(&flags); 513 return 0; 514 } 515
··· 11 extern void bpf_local_irq_restore(unsigned long *) __weak __ksym; 12 extern int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign, u64 flags) __weak __ksym; 13 14 + struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); 15 + struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); 16 + 17 SEC("?tc") 18 __failure __msg("arg#0 doesn't point to an irq flag on stack") 19 int irq_save_bad_arg(struct __sk_buff *ctx) ··· 507 bpf_local_irq_save(&flags); 508 global_subprog_calling_sleepable_global(0); 509 bpf_local_irq_restore(&flags); 510 + return 0; 511 + } 512 + 513 + SEC("?tc") 514 + __failure __msg("cannot restore irq state out of order") 515 + int irq_ooo_lock_cond_inv(struct __sk_buff *ctx) 516 + { 517 + unsigned long flags1, flags2; 518 + 519 + if (bpf_res_spin_lock_irqsave(&lockA, &flags1)) 520 + return 0; 521 + if (bpf_res_spin_lock_irqsave(&lockB, &flags2)) { 522 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 523 + return 0; 524 + } 525 + 526 + bpf_res_spin_unlock_irqrestore(&lockB, &flags1); 527 + bpf_res_spin_unlock_irqrestore(&lockA, &flags2); 528 + return 0; 529 + } 530 + 531 + SEC("?tc") 532 + __failure __msg("function calls are not allowed") 533 + int irq_wrong_kfunc_class_1(struct __sk_buff *ctx) 534 + { 535 + unsigned long flags1; 536 + 537 + if (bpf_res_spin_lock_irqsave(&lockA, &flags1)) 538 + return 0; 539 + /* For now, bpf_local_irq_restore is not allowed in critical section, 540 + * but this test ensures error will be caught with kfunc_class when it's 541 + * opened up. Tested by temporarily permitting this kfunc in critical 542 + * section. 543 + */ 544 + bpf_local_irq_restore(&flags1); 545 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 546 + return 0; 547 + } 548 + 549 + SEC("?tc") 550 + __failure __msg("function calls are not allowed") 551 + int irq_wrong_kfunc_class_2(struct __sk_buff *ctx) 552 + { 553 + unsigned long flags1, flags2; 554 + 555 + bpf_local_irq_save(&flags1); 556 + if (bpf_res_spin_lock_irqsave(&lockA, &flags2)) 557 + return 0; 558 + bpf_local_irq_restore(&flags2); 559 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 560 return 0; 561 } 562
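irq_ooo_lock_cond_inv is rejected because each unlock restores the IRQ state saved by the other lock (lockB is released with flags1, lockA with flags2). A minimal sketch of the ordering the verifier accepts, mirroring the res_spin_lock_ooo_irq success test in res_spin_lock_fail.c below: release the inner lock with its own flags first, then the outer lock with its flags. It assumes the same includes and lockA/lockB declarations as the surrounding test file; the program name is illustrative:

SEC("?tc")
int irq_lock_in_order(struct __sk_buff *ctx)
{
	unsigned long flags1, flags2;

	if (bpf_res_spin_lock_irqsave(&lockA, &flags1))
		return 0;
	if (bpf_res_spin_lock_irqsave(&lockB, &flags2)) {
		bpf_res_spin_unlock_irqrestore(&lockA, &flags1);
		return 0;
	}

	/* Each unlock restores the flags saved by its own lock, innermost first. */
	bpf_res_spin_unlock_irqrestore(&lockB, &flags2);
	bpf_res_spin_unlock_irqrestore(&lockA, &flags1);
	return 0;
}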
+143
tools/testing/selftests/bpf/progs/res_spin_lock.c
···
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include "bpf_misc.h" 7 + 8 + #define EDEADLK 35 9 + #define ETIMEDOUT 110 10 + 11 + struct arr_elem { 12 + struct bpf_res_spin_lock lock; 13 + }; 14 + 15 + struct { 16 + __uint(type, BPF_MAP_TYPE_ARRAY); 17 + __uint(max_entries, 64); 18 + __type(key, int); 19 + __type(value, struct arr_elem); 20 + } arrmap SEC(".maps"); 21 + 22 + struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); 23 + struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); 24 + 25 + SEC("tc") 26 + int res_spin_lock_test(struct __sk_buff *ctx) 27 + { 28 + struct arr_elem *elem1, *elem2; 29 + int r; 30 + 31 + elem1 = bpf_map_lookup_elem(&arrmap, &(int){0}); 32 + if (!elem1) 33 + return -1; 34 + elem2 = bpf_map_lookup_elem(&arrmap, &(int){0}); 35 + if (!elem2) 36 + return -1; 37 + 38 + r = bpf_res_spin_lock(&elem1->lock); 39 + if (r) 40 + return r; 41 + if (!bpf_res_spin_lock(&elem2->lock)) { 42 + bpf_res_spin_unlock(&elem2->lock); 43 + bpf_res_spin_unlock(&elem1->lock); 44 + return -1; 45 + } 46 + bpf_res_spin_unlock(&elem1->lock); 47 + return 0; 48 + } 49 + 50 + SEC("tc") 51 + int res_spin_lock_test_AB(struct __sk_buff *ctx) 52 + { 53 + int r; 54 + 55 + r = bpf_res_spin_lock(&lockA); 56 + if (r) 57 + return !r; 58 + /* Only unlock if we took the lock. */ 59 + if (!bpf_res_spin_lock(&lockB)) 60 + bpf_res_spin_unlock(&lockB); 61 + bpf_res_spin_unlock(&lockA); 62 + return 0; 63 + } 64 + 65 + int err; 66 + 67 + SEC("tc") 68 + int res_spin_lock_test_BA(struct __sk_buff *ctx) 69 + { 70 + int r; 71 + 72 + r = bpf_res_spin_lock(&lockB); 73 + if (r) 74 + return !r; 75 + if (!bpf_res_spin_lock(&lockA)) 76 + bpf_res_spin_unlock(&lockA); 77 + else 78 + err = -EDEADLK; 79 + bpf_res_spin_unlock(&lockB); 80 + return err ?: 0; 81 + } 82 + 83 + SEC("tc") 84 + int res_spin_lock_test_held_lock_max(struct __sk_buff *ctx) 85 + { 86 + struct bpf_res_spin_lock *locks[48] = {}; 87 + struct arr_elem *e; 88 + u64 time_beg, time; 89 + int ret = 0, i; 90 + 91 + _Static_assert(ARRAY_SIZE(((struct rqspinlock_held){}).locks) == 31, 92 + "RES_NR_HELD assumed to be 31"); 93 + 94 + for (i = 0; i < 34; i++) { 95 + int key = i; 96 + 97 + /* We cannot pass in i as it will get spilled/filled by the compiler and 98 + * loses bounds in verifier state. 99 + */ 100 + e = bpf_map_lookup_elem(&arrmap, &key); 101 + if (!e) 102 + return 1; 103 + locks[i] = &e->lock; 104 + } 105 + 106 + for (; i < 48; i++) { 107 + int key = i - 2; 108 + 109 + /* We cannot pass in i as it will get spilled/filled by the compiler and 110 + * loses bounds in verifier state. 111 + */ 112 + e = bpf_map_lookup_elem(&arrmap, &key); 113 + if (!e) 114 + return 1; 115 + locks[i] = &e->lock; 116 + } 117 + 118 + time_beg = bpf_ktime_get_ns(); 119 + for (i = 0; i < 34; i++) { 120 + if (bpf_res_spin_lock(locks[i])) 121 + goto end; 122 + } 123 + 124 + /* Trigger AA, after exhausting entries in the held lock table. This 125 + * time, only the timeout can save us, as AA detection won't succeed. 126 + */ 127 + if (!bpf_res_spin_lock(locks[34])) { 128 + bpf_res_spin_unlock(locks[34]); 129 + ret = 1; 130 + goto end; 131 + } 132 + 133 + end: 134 + for (i = i - 1; i >= 0; i--) 135 + bpf_res_spin_unlock(locks[i]); 136 + time = bpf_ktime_get_ns() - time_beg; 137 + /* Time spent should be easily above our limit (1/4 s), since AA 138 + * detection won't be expedited due to lack of held lock entry. 
139 + */ 140 + return ret ?: (time > 1000000000 / 4 ? 0 : 1); 141 + } 142 + 143 + char _license[] SEC("license") = "GPL";
+244
tools/testing/selftests/bpf/progs/res_spin_lock_fail.c
···
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_core_read.h> 7 + #include "bpf_misc.h" 8 + #include "bpf_experimental.h" 9 + 10 + struct arr_elem { 11 + struct bpf_res_spin_lock lock; 12 + }; 13 + 14 + struct { 15 + __uint(type, BPF_MAP_TYPE_ARRAY); 16 + __uint(max_entries, 1); 17 + __type(key, int); 18 + __type(value, struct arr_elem); 19 + } arrmap SEC(".maps"); 20 + 21 + long value; 22 + 23 + struct bpf_spin_lock lock __hidden SEC(".data.A"); 24 + struct bpf_res_spin_lock res_lock __hidden SEC(".data.B"); 25 + 26 + SEC("?tc") 27 + __failure __msg("point to map value or allocated object") 28 + int res_spin_lock_arg(struct __sk_buff *ctx) 29 + { 30 + struct arr_elem *elem; 31 + 32 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 33 + if (!elem) 34 + return 0; 35 + bpf_res_spin_lock((struct bpf_res_spin_lock *)bpf_core_cast(&elem->lock, struct __sk_buff)); 36 + bpf_res_spin_lock(&elem->lock); 37 + return 0; 38 + } 39 + 40 + SEC("?tc") 41 + __failure __msg("AA deadlock detected") 42 + int res_spin_lock_AA(struct __sk_buff *ctx) 43 + { 44 + struct arr_elem *elem; 45 + 46 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 47 + if (!elem) 48 + return 0; 49 + bpf_res_spin_lock(&elem->lock); 50 + bpf_res_spin_lock(&elem->lock); 51 + return 0; 52 + } 53 + 54 + SEC("?tc") 55 + __failure __msg("AA deadlock detected") 56 + int res_spin_lock_cond_AA(struct __sk_buff *ctx) 57 + { 58 + struct arr_elem *elem; 59 + 60 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 61 + if (!elem) 62 + return 0; 63 + if (bpf_res_spin_lock(&elem->lock)) 64 + return 0; 65 + bpf_res_spin_lock(&elem->lock); 66 + return 0; 67 + } 68 + 69 + SEC("?tc") 70 + __failure __msg("unlock of different lock") 71 + int res_spin_lock_mismatch_1(struct __sk_buff *ctx) 72 + { 73 + struct arr_elem *elem; 74 + 75 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 76 + if (!elem) 77 + return 0; 78 + if (bpf_res_spin_lock(&elem->lock)) 79 + return 0; 80 + bpf_res_spin_unlock(&res_lock); 81 + return 0; 82 + } 83 + 84 + SEC("?tc") 85 + __failure __msg("unlock of different lock") 86 + int res_spin_lock_mismatch_2(struct __sk_buff *ctx) 87 + { 88 + struct arr_elem *elem; 89 + 90 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 91 + if (!elem) 92 + return 0; 93 + if (bpf_res_spin_lock(&res_lock)) 94 + return 0; 95 + bpf_res_spin_unlock(&elem->lock); 96 + return 0; 97 + } 98 + 99 + SEC("?tc") 100 + __failure __msg("unlock of different lock") 101 + int res_spin_lock_irq_mismatch_1(struct __sk_buff *ctx) 102 + { 103 + struct arr_elem *elem; 104 + unsigned long f1; 105 + 106 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 107 + if (!elem) 108 + return 0; 109 + bpf_local_irq_save(&f1); 110 + if (bpf_res_spin_lock(&res_lock)) 111 + return 0; 112 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 113 + return 0; 114 + } 115 + 116 + SEC("?tc") 117 + __failure __msg("unlock of different lock") 118 + int res_spin_lock_irq_mismatch_2(struct __sk_buff *ctx) 119 + { 120 + struct arr_elem *elem; 121 + unsigned long f1; 122 + 123 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 124 + if (!elem) 125 + return 0; 126 + if (bpf_res_spin_lock_irqsave(&res_lock, &f1)) 127 + return 0; 128 + bpf_res_spin_unlock(&res_lock); 129 + return 0; 130 + } 131 + 132 + SEC("?tc") 133 + __success 134 + int res_spin_lock_ooo(struct __sk_buff *ctx) 135 + { 136 + struct arr_elem *elem; 137 + 138 + elem = 
bpf_map_lookup_elem(&arrmap, &(int){0}); 139 + if (!elem) 140 + return 0; 141 + if (bpf_res_spin_lock(&res_lock)) 142 + return 0; 143 + if (bpf_res_spin_lock(&elem->lock)) { 144 + bpf_res_spin_unlock(&res_lock); 145 + return 0; 146 + } 147 + bpf_res_spin_unlock(&elem->lock); 148 + bpf_res_spin_unlock(&res_lock); 149 + return 0; 150 + } 151 + 152 + SEC("?tc") 153 + __success 154 + int res_spin_lock_ooo_irq(struct __sk_buff *ctx) 155 + { 156 + struct arr_elem *elem; 157 + unsigned long f1, f2; 158 + 159 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 160 + if (!elem) 161 + return 0; 162 + if (bpf_res_spin_lock_irqsave(&res_lock, &f1)) 163 + return 0; 164 + if (bpf_res_spin_lock_irqsave(&elem->lock, &f2)) { 165 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 166 + /* We won't have a unreleased IRQ flag error here. */ 167 + return 0; 168 + } 169 + bpf_res_spin_unlock_irqrestore(&elem->lock, &f2); 170 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 171 + return 0; 172 + } 173 + 174 + struct bpf_res_spin_lock lock1 __hidden SEC(".data.OO1"); 175 + struct bpf_res_spin_lock lock2 __hidden SEC(".data.OO2"); 176 + 177 + SEC("?tc") 178 + __failure __msg("bpf_res_spin_unlock cannot be out of order") 179 + int res_spin_lock_ooo_unlock(struct __sk_buff *ctx) 180 + { 181 + if (bpf_res_spin_lock(&lock1)) 182 + return 0; 183 + if (bpf_res_spin_lock(&lock2)) { 184 + bpf_res_spin_unlock(&lock1); 185 + return 0; 186 + } 187 + bpf_res_spin_unlock(&lock1); 188 + bpf_res_spin_unlock(&lock2); 189 + return 0; 190 + } 191 + 192 + SEC("?tc") 193 + __failure __msg("off 1 doesn't point to 'struct bpf_res_spin_lock' that is at 0") 194 + int res_spin_lock_bad_off(struct __sk_buff *ctx) 195 + { 196 + struct arr_elem *elem; 197 + 198 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 199 + if (!elem) 200 + return 0; 201 + bpf_res_spin_lock((void *)&elem->lock + 1); 202 + return 0; 203 + } 204 + 205 + SEC("?tc") 206 + __failure __msg("R1 doesn't have constant offset. bpf_res_spin_lock has to be at the constant offset") 207 + int res_spin_lock_var_off(struct __sk_buff *ctx) 208 + { 209 + struct arr_elem *elem; 210 + u64 val = value; 211 + 212 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 213 + if (!elem) { 214 + // FIXME: Only inline assembly use in assert macro doesn't emit 215 + // BTF definition. 216 + bpf_throw(0); 217 + return 0; 218 + } 219 + bpf_assert_range(val, 0, 40); 220 + bpf_res_spin_lock((void *)&value + val); 221 + return 0; 222 + } 223 + 224 + SEC("?tc") 225 + __failure __msg("map 'res_spin.bss' has no valid bpf_res_spin_lock") 226 + int res_spin_lock_no_lock_map(struct __sk_buff *ctx) 227 + { 228 + bpf_res_spin_lock((void *)&value + 1); 229 + return 0; 230 + } 231 + 232 + SEC("?tc") 233 + __failure __msg("local 'kptr' has no valid bpf_res_spin_lock") 234 + int res_spin_lock_no_lock_kptr(struct __sk_buff *ctx) 235 + { 236 + struct { int i; } *p = bpf_obj_new(typeof(*p)); 237 + 238 + if (!p) 239 + return 0; 240 + bpf_res_spin_lock((void *)p); 241 + return 0; 242 + } 243 + 244 + char _license[] SEC("license") = "GPL";
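res_spin_lock_ooo_unlock is rejected because it releases lock1 while lock2, acquired after it, is still held; as the __msg states, bpf_res_spin_unlock cannot be out of order, so nested resilient locks must be released in reverse acquisition order. A sketch of the release order the verifier accepts, assuming the same lock1/lock2 globals as above; the program name is illustrative:

SEC("?tc")
int res_spin_lock_in_order_unlock(struct __sk_buff *ctx)
{
	if (bpf_res_spin_lock(&lock1))
		return 0;
	if (bpf_res_spin_lock(&lock2)) {
		bpf_res_spin_unlock(&lock1);
		return 0;
	}
	/* Innermost lock first, matching reverse acquisition order. */
	bpf_res_spin_unlock(&lock2);
	bpf_res_spin_unlock(&lock1);
	return 0;
}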