Merge tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf resilient spinlock support from Alexei Starovoitov:
"This patch set introduces Resilient Queued Spin Lock (or rqspinlock
with res_spin_lock() and res_spin_unlock() APIs).

This is a qspinlock variant which recovers the kernel from a stalled
state when the lock acquisition path cannot make forward progress.
This can occur when a lock acquisition attempt enters a deadlock
situation (e.g. AA, or ABBA), or more generally, when the owner of the
lock (which we’re trying to acquire) isn’t making forward progress.
Deadlock detection is the main mechanism used to provide instant
recovery, with the timeout mechanism acting as a final line of
defense. Detection is triggered as soon as the slow path of a lock
acquisition enters its waiting loop.
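
Because acquisition can now fail, in-kernel callers check the return
value and bail out without touching the protected state; the map
conversions below (e.g. lpm_trie.c) follow exactly this pattern. A
minimal kernel-side sketch, with made-up structure and function names
(the lock itself would be initialised with raw_res_spin_lock_init()):

  #include <linux/types.h>
  #include <asm/rqspinlock.h>

  /* 'pcpu_stat' and 'pcpu_stat_add' are illustrative names only. */
  struct pcpu_stat {
          rqspinlock_t lock;      /* was a raw_spinlock_t before conversion */
          u64 counter;
  };

  static int pcpu_stat_add(struct pcpu_stat *s, u64 delta)
  {
          unsigned long flags;
          int ret;

          /* Fails with -EDEADLK or -ETIMEDOUT instead of hanging forever. */
          ret = raw_res_spin_lock_irqsave(&s->lock, flags);
          if (ret)
                  return ret;     /* the critical section was never entered */

          s->counter += delta;

          raw_res_spin_unlock_irqrestore(&s->lock, flags);
          return 0;
  }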

Additionally, BPF programs attached to different parts of the kernel
can introduce new control flow into the kernel, which increases the
likelihood of deadlocks in code not written to handle reentrancy.
There have been multiple syzbot reports surfacing deadlocks in
internal kernel code due to the diverse ways in which BPF programs can
be attached to different parts of the kernel. By switching the BPF
subsystem’s lock usage to rqspinlock, all of these issues are
mitigated at runtime.

This spin lock implementation allows BPF maps to become safer and
lets us remove mechanisms that have fallen short of assuring safety
when programs nest in arbitrary ways, whether in the same context or
across different contexts.
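
For BPF programs themselves, the lock is exposed as a struct
bpf_res_spin_lock field embedded in a map value and manipulated via
the bpf_res_spin_lock()/bpf_res_spin_unlock() kfuncs added in this
series. A rough sketch of what a program could look like (the map
layout, section name and __ksym declarations here are illustrative
assumptions, not lifted from the selftests):

  /* Assumes vmlinux.h provides struct bpf_res_spin_lock and that the
   * kfuncs are reachable from the chosen program type. */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym;
  extern void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym;

  struct elem {
          struct bpf_res_spin_lock lock;
          u64 hits;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } stats SEC(".maps");

  SEC("tc") /* program type chosen only for illustration */
  int count_hits(struct __sk_buff *skb)
  {
          struct elem *e;
          int key = 0;

          e = bpf_map_lookup_elem(&stats, &key);
          if (!e)
                  return 0;
          /* Lock acquisition may fail (-EDEADLK/-ETIMEDOUT); the verifier
           * is expected to require this check before e->hits is touched. */
          if (bpf_res_spin_lock(&e->lock))
                  return 0;
          e->hits++;
          bpf_res_spin_unlock(&e->lock);
          return 0;
  }

  char _license[] SEC("license") = "GPL";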

We run benchmarks that stress locking scalability and compare the
results against the baseline (qspinlock). For the rqspinlock case, we
replace the kernel's default qspinlock with it, so that all spin
locks in the kernel use the rqspinlock slow path. As such, benchmarks
that stress kernel spin locks end up exercising rqspinlock.

More details in the cover letter in commit 6ffb9017e932 ("Merge branch
'resilient-queued-spin-lock'")"

* tag 'bpf_res_spin_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (24 commits)
selftests/bpf: Add tests for rqspinlock
bpf: Maintain FIFO property for rqspinlock unlock
bpf: Implement verifier support for rqspinlock
bpf: Introduce rqspinlock kfuncs
bpf: Convert lpm_trie.c to rqspinlock
bpf: Convert percpu_freelist.c to rqspinlock
bpf: Convert hashtab.c to rqspinlock
rqspinlock: Add locktorture support
rqspinlock: Add entry to Makefile, MAINTAINERS
rqspinlock: Add macros for rqspinlock usage
rqspinlock: Add basic support for CONFIG_PARAVIRT
rqspinlock: Add a test-and-set fallback
rqspinlock: Add deadlock detection and recovery
rqspinlock: Protect waiters in trylock fallback from stalls
rqspinlock: Protect waiters in queue from stalls
rqspinlock: Protect pending bit owners from stalls
rqspinlock: Hardcode cond_acquire loops for arm64
rqspinlock: Add support for timeouts
rqspinlock: Drop PV and virtualization support
rqspinlock: Add rqspinlock.h header
...

+2315 -420
+2
MAINTAINERS
··· 4361 4361 F: kernel/bpf/ 4362 4362 F: kernel/trace/bpf_trace.c 4363 4363 F: lib/buildid.c 4364 + F: arch/*/include/asm/rqspinlock.h 4365 + F: include/asm-generic/rqspinlock.h 4364 4366 F: lib/test_bpf.c 4365 4367 F: net/bpf/ 4366 4368 F: net/core/filter.c
+93
arch/arm64/include/asm/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_RQSPINLOCK_H 3 + #define _ASM_RQSPINLOCK_H 4 + 5 + #include <asm/barrier.h> 6 + 7 + /* 8 + * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom 9 + * version based on [0]. In rqspinlock code, our conditional expression involves 10 + * checking the value _and_ additionally a timeout. However, on arm64, the 11 + * WFE-based implementation may never spin again if no stores occur to the 12 + * locked byte in the lock word. As such, we may be stuck forever if 13 + * event-stream based unblocking is not available on the platform for WFE spin 14 + * loops (arch_timer_evtstrm_available). 15 + * 16 + * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this 17 + * copy-paste. 18 + * 19 + * While we rely on the implementation to amortize the cost of sampling 20 + * cond_expr for us, it will not happen when event stream support is 21 + * unavailable, time_expr check is amortized. This is not the common case, and 22 + * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns 23 + * comparison, hence just let it be. In case of event-stream, the loop is woken 24 + * up at microsecond granularity. 25 + * 26 + * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com 27 + */ 28 + 29 + #ifndef smp_cond_load_acquire_timewait 30 + 31 + #define smp_cond_time_check_count 200 32 + 33 + #define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns, \ 34 + time_limit_ns) ({ \ 35 + typeof(ptr) __PTR = (ptr); \ 36 + __unqual_scalar_typeof(*ptr) VAL; \ 37 + unsigned int __count = 0; \ 38 + for (;;) { \ 39 + VAL = READ_ONCE(*__PTR); \ 40 + if (cond_expr) \ 41 + break; \ 42 + cpu_relax(); \ 43 + if (__count++ < smp_cond_time_check_count) \ 44 + continue; \ 45 + if ((time_expr_ns) >= (time_limit_ns)) \ 46 + break; \ 47 + __count = 0; \ 48 + } \ 49 + (typeof(*ptr))VAL; \ 50 + }) 51 + 52 + #define __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 53 + time_expr_ns, time_limit_ns) \ 54 + ({ \ 55 + typeof(ptr) __PTR = (ptr); \ 56 + __unqual_scalar_typeof(*ptr) VAL; \ 57 + for (;;) { \ 58 + VAL = smp_load_acquire(__PTR); \ 59 + if (cond_expr) \ 60 + break; \ 61 + __cmpwait_relaxed(__PTR, VAL); \ 62 + if ((time_expr_ns) >= (time_limit_ns)) \ 63 + break; \ 64 + } \ 65 + (typeof(*ptr))VAL; \ 66 + }) 67 + 68 + #define smp_cond_load_acquire_timewait(ptr, cond_expr, \ 69 + time_expr_ns, time_limit_ns) \ 70 + ({ \ 71 + __unqual_scalar_typeof(*ptr) _val; \ 72 + int __wfe = arch_timer_evtstrm_available(); \ 73 + \ 74 + if (likely(__wfe)) { \ 75 + _val = __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 76 + time_expr_ns, \ 77 + time_limit_ns); \ 78 + } else { \ 79 + _val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr, \ 80 + time_expr_ns, \ 81 + time_limit_ns); \ 82 + smp_acquire__after_ctrl_dep(); \ 83 + } \ 84 + (typeof(*ptr))_val; \ 85 + }) 86 + 87 + #endif 88 + 89 + #define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1) 90 + 91 + #include <asm-generic/rqspinlock.h> 92 + 93 + #endif /* _ASM_RQSPINLOCK_H */
+33
arch/x86/include/asm/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_X86_RQSPINLOCK_H 3 + #define _ASM_X86_RQSPINLOCK_H 4 + 5 + #include <asm/paravirt.h> 6 + 7 + #ifdef CONFIG_PARAVIRT 8 + DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); 9 + 10 + #define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled 11 + static __always_inline bool resilient_virt_spin_lock_enabled(void) 12 + { 13 + return static_branch_likely(&virt_spin_lock_key); 14 + } 15 + 16 + #ifdef CONFIG_QUEUED_SPINLOCKS 17 + typedef struct qspinlock rqspinlock_t; 18 + #else 19 + typedef struct rqspinlock rqspinlock_t; 20 + #endif 21 + extern int resilient_tas_spin_lock(rqspinlock_t *lock); 22 + 23 + #define resilient_virt_spin_lock resilient_virt_spin_lock 24 + static inline int resilient_virt_spin_lock(rqspinlock_t *lock) 25 + { 26 + return resilient_tas_spin_lock(lock); 27 + } 28 + 29 + #endif /* CONFIG_PARAVIRT */ 30 + 31 + #include <asm-generic/rqspinlock.h> 32 + 33 + #endif /* _ASM_X86_RQSPINLOCK_H */
+1
include/asm-generic/Kbuild
··· 45 45 mandatory-y += percpu.h 46 46 mandatory-y += pgalloc.h 47 47 mandatory-y += preempt.h 48 + mandatory-y += rqspinlock.h 48 49 mandatory-y += runtime-const.h 49 50 mandatory-y += rwonce.h 50 51 mandatory-y += sections.h
+6
include/asm-generic/mcs_spinlock.h
··· 1 1 #ifndef __ASM_MCS_SPINLOCK_H 2 2 #define __ASM_MCS_SPINLOCK_H 3 3 4 + struct mcs_spinlock { 5 + struct mcs_spinlock *next; 6 + int locked; /* 1 if lock acquired */ 7 + int count; /* nesting count, see qspinlock.c */ 8 + }; 9 + 4 10 /* 5 11 * Architectures can define their own: 6 12 *
+250
include/asm-generic/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Resilient Queued Spin Lock 4 + * 5 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 6 + * 7 + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> 8 + */ 9 + #ifndef __ASM_GENERIC_RQSPINLOCK_H 10 + #define __ASM_GENERIC_RQSPINLOCK_H 11 + 12 + #include <linux/types.h> 13 + #include <vdso/time64.h> 14 + #include <linux/percpu.h> 15 + #ifdef CONFIG_QUEUED_SPINLOCKS 16 + #include <asm/qspinlock.h> 17 + #endif 18 + 19 + struct rqspinlock { 20 + union { 21 + atomic_t val; 22 + u32 locked; 23 + }; 24 + }; 25 + 26 + /* Even though this is same as struct rqspinlock, we need to emit a distinct 27 + * type in BTF for BPF programs. 28 + */ 29 + struct bpf_res_spin_lock { 30 + u32 val; 31 + }; 32 + 33 + struct qspinlock; 34 + #ifdef CONFIG_QUEUED_SPINLOCKS 35 + typedef struct qspinlock rqspinlock_t; 36 + #else 37 + typedef struct rqspinlock rqspinlock_t; 38 + #endif 39 + 40 + extern int resilient_tas_spin_lock(rqspinlock_t *lock); 41 + #ifdef CONFIG_QUEUED_SPINLOCKS 42 + extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); 43 + #endif 44 + 45 + #ifndef resilient_virt_spin_lock_enabled 46 + static __always_inline bool resilient_virt_spin_lock_enabled(void) 47 + { 48 + return false; 49 + } 50 + #endif 51 + 52 + #ifndef resilient_virt_spin_lock 53 + static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) 54 + { 55 + return 0; 56 + } 57 + #endif 58 + 59 + /* 60 + * Default timeout for waiting loops is 0.25 seconds 61 + */ 62 + #define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) 63 + 64 + /* 65 + * Choose 31 as it makes rqspinlock_held cacheline-aligned. 66 + */ 67 + #define RES_NR_HELD 31 68 + 69 + struct rqspinlock_held { 70 + int cnt; 71 + void *locks[RES_NR_HELD]; 72 + }; 73 + 74 + DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); 75 + 76 + static __always_inline void grab_held_lock_entry(void *lock) 77 + { 78 + int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); 79 + 80 + if (unlikely(cnt > RES_NR_HELD)) { 81 + /* Still keep the inc so we decrement later. */ 82 + return; 83 + } 84 + 85 + /* 86 + * Implied compiler barrier in per-CPU operations; otherwise we can have 87 + * the compiler reorder inc with write to table, allowing interrupts to 88 + * overwrite and erase our write to the table (as on interrupt exit it 89 + * will be reset to NULL). 90 + * 91 + * It is fine for cnt inc to be reordered wrt remote readers though, 92 + * they won't observe our entry until the cnt update is visible, that's 93 + * all. 94 + */ 95 + this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); 96 + } 97 + 98 + /* 99 + * We simply don't support out-of-order unlocks, and keep the logic simple here. 100 + * The verifier prevents BPF programs from unlocking out-of-order, and the same 101 + * holds for in-kernel users. 102 + * 103 + * It is possible to run into misdetection scenarios of AA deadlocks on the same 104 + * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries 105 + * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct 106 + * logic to preserve right entries in the table would be to walk the array of 107 + * held locks and swap and clear out-of-order entries, but that's too 108 + * complicated and we don't have a compelling use case for out of order unlocking. 
109 + */ 110 + static __always_inline void release_held_lock_entry(void) 111 + { 112 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 113 + 114 + if (unlikely(rqh->cnt > RES_NR_HELD)) 115 + goto dec; 116 + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); 117 + dec: 118 + /* 119 + * Reordering of clearing above with inc and its write in 120 + * grab_held_lock_entry that came before us (in same acquisition 121 + * attempt) is ok, we either see a valid entry or NULL when it's 122 + * visible. 123 + * 124 + * But this helper is invoked when we unwind upon failing to acquire the 125 + * lock. Unlike the unlock path which constitutes a release store after 126 + * we clear the entry, we need to emit a write barrier here. Otherwise, 127 + * we may have a situation as follows: 128 + * 129 + * <error> for lock B 130 + * release_held_lock_entry 131 + * 132 + * try_cmpxchg_acquire for lock A 133 + * grab_held_lock_entry 134 + * 135 + * Lack of any ordering means reordering may occur such that dec, inc 136 + * are done before entry is overwritten. This permits a remote lock 137 + * holder of lock B (which this CPU failed to acquire) to now observe it 138 + * as being attempted on this CPU, and may lead to misdetection (if this 139 + * CPU holds a lock it is attempting to acquire, leading to false ABBA 140 + * diagnosis). 141 + * 142 + * In case of unlock, we will always do a release on the lock word after 143 + * releasing the entry, ensuring that other CPUs cannot hold the lock 144 + * (and make conclusions about deadlocks) until the entry has been 145 + * cleared on the local CPU, preventing any anomalies. Reordering is 146 + * still possible there, but a remote CPU cannot observe a lock in our 147 + * table which it is already holding, since visibility entails our 148 + * release store for the said lock has not retired. 149 + * 150 + * In theory we don't have a problem if the dec and WRITE_ONCE above get 151 + * reordered with each other, we either notice an empty NULL entry on 152 + * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which 153 + * cannot be observed (if dec precedes WRITE_ONCE). 154 + * 155 + * Emit the write barrier _before_ the dec, this permits dec-inc 156 + * reordering but that is harmless as we'd have new entry set to NULL 157 + * already, i.e. they cannot precede the NULL store above. 158 + */ 159 + smp_wmb(); 160 + this_cpu_dec(rqspinlock_held_locks.cnt); 161 + } 162 + 163 + #ifdef CONFIG_QUEUED_SPINLOCKS 164 + 165 + /** 166 + * res_spin_lock - acquire a queued spinlock 167 + * @lock: Pointer to queued spinlock structure 168 + * 169 + * Return: 170 + * * 0 - Lock was acquired successfully. 171 + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. 172 + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 
173 + */ 174 + static __always_inline int res_spin_lock(rqspinlock_t *lock) 175 + { 176 + int val = 0; 177 + 178 + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { 179 + grab_held_lock_entry(lock); 180 + return 0; 181 + } 182 + return resilient_queued_spin_lock_slowpath(lock, val); 183 + } 184 + 185 + #else 186 + 187 + #define res_spin_lock(lock) resilient_tas_spin_lock(lock) 188 + 189 + #endif /* CONFIG_QUEUED_SPINLOCKS */ 190 + 191 + static __always_inline void res_spin_unlock(rqspinlock_t *lock) 192 + { 193 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 194 + 195 + if (unlikely(rqh->cnt > RES_NR_HELD)) 196 + goto unlock; 197 + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); 198 + unlock: 199 + /* 200 + * Release barrier, ensures correct ordering. See release_held_lock_entry 201 + * for details. Perform release store instead of queued_spin_unlock, 202 + * since we use this function for test-and-set fallback as well. When we 203 + * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. 204 + * 205 + * Like release_held_lock_entry, we can do the release before the dec. 206 + * We simply care about not seeing the 'lock' in our table from a remote 207 + * CPU once the lock has been released, which doesn't rely on the dec. 208 + * 209 + * Unlike smp_wmb(), release is not a two way fence, hence it is 210 + * possible for a inc to move up and reorder with our clearing of the 211 + * entry. This isn't a problem however, as for a misdiagnosis of ABBA, 212 + * the remote CPU needs to hold this lock, which won't be released until 213 + * the store below is done, which would ensure the entry is overwritten 214 + * to NULL, etc. 215 + */ 216 + smp_store_release(&lock->locked, 0); 217 + this_cpu_dec(rqspinlock_held_locks.cnt); 218 + } 219 + 220 + #ifdef CONFIG_QUEUED_SPINLOCKS 221 + #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) 222 + #else 223 + #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) 224 + #endif 225 + 226 + #define raw_res_spin_lock(lock) \ 227 + ({ \ 228 + int __ret; \ 229 + preempt_disable(); \ 230 + __ret = res_spin_lock(lock); \ 231 + if (__ret) \ 232 + preempt_enable(); \ 233 + __ret; \ 234 + }) 235 + 236 + #define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) 237 + 238 + #define raw_res_spin_lock_irqsave(lock, flags) \ 239 + ({ \ 240 + int __ret; \ 241 + local_irq_save(flags); \ 242 + __ret = raw_res_spin_lock(lock); \ 243 + if (__ret) \ 244 + local_irq_restore(flags); \ 245 + __ret; \ 246 + }) 247 + 248 + #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) 249 + 250 + #endif /* __ASM_GENERIC_RQSPINLOCK_H */
+10
include/linux/bpf.h
··· 30 30 #include <linux/static_call.h> 31 31 #include <linux/memcontrol.h> 32 32 #include <linux/cfi.h> 33 + #include <asm/rqspinlock.h> 33 34 34 35 struct bpf_verifier_env; 35 36 struct bpf_verifier_log; ··· 205 204 BPF_REFCOUNT = (1 << 9), 206 205 BPF_WORKQUEUE = (1 << 10), 207 206 BPF_UPTR = (1 << 11), 207 + BPF_RES_SPIN_LOCK = (1 << 12), 208 208 }; 209 209 210 210 typedef void (*btf_dtor_kfunc_t)(void *); ··· 241 239 u32 cnt; 242 240 u32 field_mask; 243 241 int spin_lock_off; 242 + int res_spin_lock_off; 244 243 int timer_off; 245 244 int wq_off; 246 245 int refcount_off; ··· 317 314 switch (type) { 318 315 case BPF_SPIN_LOCK: 319 316 return "bpf_spin_lock"; 317 + case BPF_RES_SPIN_LOCK: 318 + return "bpf_res_spin_lock"; 320 319 case BPF_TIMER: 321 320 return "bpf_timer"; 322 321 case BPF_WORKQUEUE: ··· 351 346 switch (type) { 352 347 case BPF_SPIN_LOCK: 353 348 return sizeof(struct bpf_spin_lock); 349 + case BPF_RES_SPIN_LOCK: 350 + return sizeof(struct bpf_res_spin_lock); 354 351 case BPF_TIMER: 355 352 return sizeof(struct bpf_timer); 356 353 case BPF_WORKQUEUE: ··· 383 376 switch (type) { 384 377 case BPF_SPIN_LOCK: 385 378 return __alignof__(struct bpf_spin_lock); 379 + case BPF_RES_SPIN_LOCK: 380 + return __alignof__(struct bpf_res_spin_lock); 386 381 case BPF_TIMER: 387 382 return __alignof__(struct bpf_timer); 388 383 case BPF_WORKQUEUE: ··· 428 419 case BPF_RB_ROOT: 429 420 /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ 430 421 case BPF_SPIN_LOCK: 422 + case BPF_RES_SPIN_LOCK: 431 423 case BPF_TIMER: 432 424 case BPF_WORKQUEUE: 433 425 case BPF_KPTR_UNREF:
+16 -3
include/linux/bpf_verifier.h
··· 115 115 int depth:30; 116 116 } iter; 117 117 118 + /* For irq stack slots */ 119 + struct { 120 + enum { 121 + IRQ_NATIVE_KFUNC, 122 + IRQ_LOCK_KFUNC, 123 + } kfunc_class; 124 + } irq; 125 + 118 126 /* Max size from any of the above. */ 119 127 struct { 120 128 unsigned long raw1; ··· 263 255 * default to pointer reference on zero initialization of a state. 264 256 */ 265 257 enum ref_state_type { 266 - REF_TYPE_PTR = 1, 267 - REF_TYPE_IRQ = 2, 268 - REF_TYPE_LOCK = 3, 258 + REF_TYPE_PTR = (1 << 1), 259 + REF_TYPE_IRQ = (1 << 2), 260 + REF_TYPE_LOCK = (1 << 3), 261 + REF_TYPE_RES_LOCK = (1 << 4), 262 + REF_TYPE_RES_LOCK_IRQ = (1 << 5), 263 + REF_TYPE_LOCK_MASK = REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, 269 264 } type; 270 265 /* Track each reference created with a unique id, even if the same 271 266 * instruction creates the reference multiple times (eg, via CALL). ··· 435 424 u32 active_locks; 436 425 u32 active_preempt_locks; 437 426 u32 active_irq_id; 427 + u32 active_lock_id; 428 + void *active_lock_ptr; 438 429 bool active_rcu_lock; 439 430 440 431 bool speculative;
+1 -1
kernel/bpf/Makefile
··· 14 14 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o 15 15 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o 16 16 obj-$(CONFIG_BPF_JIT) += trampoline.o 17 - obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o 17 + obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o 18 18 ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) 19 19 obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o 20 20 endif
+24 -2
kernel/bpf/btf.c
··· 3481 3481 goto end; 3482 3482 } 3483 3483 } 3484 + if (field_mask & BPF_RES_SPIN_LOCK) { 3485 + if (!strcmp(name, "bpf_res_spin_lock")) { 3486 + if (*seen_mask & BPF_RES_SPIN_LOCK) 3487 + return -E2BIG; 3488 + *seen_mask |= BPF_RES_SPIN_LOCK; 3489 + type = BPF_RES_SPIN_LOCK; 3490 + goto end; 3491 + } 3492 + } 3484 3493 if (field_mask & BPF_TIMER) { 3485 3494 if (!strcmp(name, "bpf_timer")) { 3486 3495 if (*seen_mask & BPF_TIMER) ··· 3668 3659 3669 3660 switch (field_type) { 3670 3661 case BPF_SPIN_LOCK: 3662 + case BPF_RES_SPIN_LOCK: 3671 3663 case BPF_TIMER: 3672 3664 case BPF_WORKQUEUE: 3673 3665 case BPF_LIST_NODE: ··· 3962 3952 return ERR_PTR(-ENOMEM); 3963 3953 3964 3954 rec->spin_lock_off = -EINVAL; 3955 + rec->res_spin_lock_off = -EINVAL; 3965 3956 rec->timer_off = -EINVAL; 3966 3957 rec->wq_off = -EINVAL; 3967 3958 rec->refcount_off = -EINVAL; ··· 3989 3978 WARN_ON_ONCE(rec->spin_lock_off >= 0); 3990 3979 /* Cache offset for faster lookup at runtime */ 3991 3980 rec->spin_lock_off = rec->fields[i].offset; 3981 + break; 3982 + case BPF_RES_SPIN_LOCK: 3983 + WARN_ON_ONCE(rec->spin_lock_off >= 0); 3984 + /* Cache offset for faster lookup at runtime */ 3985 + rec->res_spin_lock_off = rec->fields[i].offset; 3992 3986 break; 3993 3987 case BPF_TIMER: 3994 3988 WARN_ON_ONCE(rec->timer_off >= 0); ··· 4038 4022 rec->cnt++; 4039 4023 } 4040 4024 4025 + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { 4026 + ret = -EINVAL; 4027 + goto end; 4028 + } 4029 + 4041 4030 /* bpf_{list_head, rb_node} require bpf_spin_lock */ 4042 4031 if ((btf_record_has_field(rec, BPF_LIST_HEAD) || 4043 - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { 4032 + btf_record_has_field(rec, BPF_RB_ROOT)) && 4033 + (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { 4044 4034 ret = -EINVAL; 4045 4035 goto end; 4046 4036 } ··· 5659 5637 5660 5638 type = &tab->types[tab->cnt]; 5661 5639 type->btf_id = i; 5662 - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | 5640 + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | 5663 5641 BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | 5664 5642 BPF_KPTR, t->size); 5665 5643 /* The record cannot be unset, treat it as an error if so */
+32 -70
kernel/bpf/hashtab.c
··· 16 16 #include "bpf_lru_list.h" 17 17 #include "map_in_map.h" 18 18 #include <linux/bpf_mem_alloc.h> 19 + #include <asm/rqspinlock.h> 19 20 20 21 #define HTAB_CREATE_FLAG_MASK \ 21 22 (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ ··· 79 78 */ 80 79 struct bucket { 81 80 struct hlist_nulls_head head; 82 - raw_spinlock_t raw_lock; 81 + rqspinlock_t raw_lock; 83 82 }; 84 83 85 84 #define HASHTAB_MAP_LOCK_COUNT 8 ··· 105 104 u32 n_buckets; /* number of hash buckets */ 106 105 u32 elem_size; /* size of each element in bytes */ 107 106 u32 hashrnd; 108 - struct lock_class_key lockdep_key; 109 - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; 110 107 }; 111 108 112 109 /* each htab element is struct htab_elem + key + value */ ··· 139 140 140 141 for (i = 0; i < htab->n_buckets; i++) { 141 142 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); 142 - raw_spin_lock_init(&htab->buckets[i].raw_lock); 143 - lockdep_set_class(&htab->buckets[i].raw_lock, 144 - &htab->lockdep_key); 143 + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); 145 144 cond_resched(); 146 145 } 147 146 } 148 147 149 - static inline int htab_lock_bucket(const struct bpf_htab *htab, 150 - struct bucket *b, u32 hash, 151 - unsigned long *pflags) 148 + static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) 152 149 { 153 150 unsigned long flags; 151 + int ret; 154 152 155 - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); 156 - 157 - preempt_disable(); 158 - local_irq_save(flags); 159 - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { 160 - __this_cpu_dec(*(htab->map_locked[hash])); 161 - local_irq_restore(flags); 162 - preempt_enable(); 163 - return -EBUSY; 164 - } 165 - 166 - raw_spin_lock(&b->raw_lock); 153 + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); 154 + if (ret) 155 + return ret; 167 156 *pflags = flags; 168 - 169 157 return 0; 170 158 } 171 159 172 - static inline void htab_unlock_bucket(const struct bpf_htab *htab, 173 - struct bucket *b, u32 hash, 174 - unsigned long flags) 160 + static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) 175 161 { 176 - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); 177 - raw_spin_unlock(&b->raw_lock); 178 - __this_cpu_dec(*(htab->map_locked[hash])); 179 - local_irq_restore(flags); 180 - preempt_enable(); 162 + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); 181 163 } 182 164 183 165 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); ··· 463 483 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); 464 484 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); 465 485 struct bpf_htab *htab; 466 - int err, i; 486 + int err; 467 487 468 488 htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); 469 489 if (!htab) 470 490 return ERR_PTR(-ENOMEM); 471 - 472 - lockdep_register_key(&htab->lockdep_key); 473 491 474 492 bpf_map_init_from_attr(&htab->map, attr); 475 493 ··· 513 535 htab->map.numa_node); 514 536 if (!htab->buckets) 515 537 goto free_elem_count; 516 - 517 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { 518 - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, 519 - sizeof(int), 520 - sizeof(int), 521 - GFP_USER); 522 - if (!htab->map_locked[i]) 523 - goto free_map_locked; 524 - } 525 538 526 539 if (htab->map.map_flags & BPF_F_ZERO_SEED) 527 540 htab->hashrnd = 0; ··· 576 607 free_map_locked: 577 608 if (htab->use_percpu_counter) 578 609 percpu_counter_destroy(&htab->pcount); 579 - for (i = 0; i < 
HASHTAB_MAP_LOCK_COUNT; i++) 580 - free_percpu(htab->map_locked[i]); 581 610 bpf_map_area_free(htab->buckets); 582 611 bpf_mem_alloc_destroy(&htab->pcpu_ma); 583 612 bpf_mem_alloc_destroy(&htab->ma); 584 613 free_elem_count: 585 614 bpf_map_free_elem_count(&htab->map); 586 615 free_htab: 587 - lockdep_unregister_key(&htab->lockdep_key); 588 616 bpf_map_area_free(htab); 589 617 return ERR_PTR(err); 590 618 } ··· 786 820 b = __select_bucket(htab, tgt_l->hash); 787 821 head = &b->head; 788 822 789 - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); 823 + ret = htab_lock_bucket(b, &flags); 790 824 if (ret) 791 825 return false; 792 826 ··· 797 831 break; 798 832 } 799 833 800 - htab_unlock_bucket(htab, b, tgt_l->hash, flags); 834 + htab_unlock_bucket(b, flags); 801 835 802 836 if (l == tgt_l) 803 837 check_and_free_fields(htab, l); ··· 1116 1150 */ 1117 1151 } 1118 1152 1119 - ret = htab_lock_bucket(htab, b, hash, &flags); 1153 + ret = htab_lock_bucket(b, &flags); 1120 1154 if (ret) 1121 1155 return ret; 1122 1156 ··· 1167 1201 check_and_free_fields(htab, l_old); 1168 1202 } 1169 1203 } 1170 - htab_unlock_bucket(htab, b, hash, flags); 1204 + htab_unlock_bucket(b, flags); 1171 1205 if (l_old) { 1172 1206 if (old_map_ptr) 1173 1207 map->ops->map_fd_put_ptr(map, old_map_ptr, true); ··· 1176 1210 } 1177 1211 return 0; 1178 1212 err: 1179 - htab_unlock_bucket(htab, b, hash, flags); 1213 + htab_unlock_bucket(b, flags); 1180 1214 return ret; 1181 1215 } 1182 1216 ··· 1223 1257 copy_map_value(&htab->map, 1224 1258 l_new->key + round_up(map->key_size, 8), value); 1225 1259 1226 - ret = htab_lock_bucket(htab, b, hash, &flags); 1260 + ret = htab_lock_bucket(b, &flags); 1227 1261 if (ret) 1228 1262 goto err_lock_bucket; 1229 1263 ··· 1244 1278 ret = 0; 1245 1279 1246 1280 err: 1247 - htab_unlock_bucket(htab, b, hash, flags); 1281 + htab_unlock_bucket(b, flags); 1248 1282 1249 1283 err_lock_bucket: 1250 1284 if (ret) ··· 1281 1315 b = __select_bucket(htab, hash); 1282 1316 head = &b->head; 1283 1317 1284 - ret = htab_lock_bucket(htab, b, hash, &flags); 1318 + ret = htab_lock_bucket(b, &flags); 1285 1319 if (ret) 1286 1320 return ret; 1287 1321 ··· 1306 1340 } 1307 1341 ret = 0; 1308 1342 err: 1309 - htab_unlock_bucket(htab, b, hash, flags); 1343 + htab_unlock_bucket(b, flags); 1310 1344 return ret; 1311 1345 } 1312 1346 ··· 1347 1381 return -ENOMEM; 1348 1382 } 1349 1383 1350 - ret = htab_lock_bucket(htab, b, hash, &flags); 1384 + ret = htab_lock_bucket(b, &flags); 1351 1385 if (ret) 1352 1386 goto err_lock_bucket; 1353 1387 ··· 1371 1405 } 1372 1406 ret = 0; 1373 1407 err: 1374 - htab_unlock_bucket(htab, b, hash, flags); 1408 + htab_unlock_bucket(b, flags); 1375 1409 err_lock_bucket: 1376 1410 if (l_new) { 1377 1411 bpf_map_dec_elem_count(&htab->map); ··· 1413 1447 b = __select_bucket(htab, hash); 1414 1448 head = &b->head; 1415 1449 1416 - ret = htab_lock_bucket(htab, b, hash, &flags); 1450 + ret = htab_lock_bucket(b, &flags); 1417 1451 if (ret) 1418 1452 return ret; 1419 1453 ··· 1423 1457 else 1424 1458 ret = -ENOENT; 1425 1459 1426 - htab_unlock_bucket(htab, b, hash, flags); 1460 + htab_unlock_bucket(b, flags); 1427 1461 1428 1462 if (l) 1429 1463 free_htab_elem(htab, l); ··· 1449 1483 b = __select_bucket(htab, hash); 1450 1484 head = &b->head; 1451 1485 1452 - ret = htab_lock_bucket(htab, b, hash, &flags); 1486 + ret = htab_lock_bucket(b, &flags); 1453 1487 if (ret) 1454 1488 return ret; 1455 1489 ··· 1460 1494 else 1461 1495 ret = -ENOENT; 1462 1496 1463 - htab_unlock_bucket(htab, b, hash, 
flags); 1497 + htab_unlock_bucket(b, flags); 1464 1498 if (l) 1465 1499 htab_lru_push_free(htab, l); 1466 1500 return ret; ··· 1527 1561 static void htab_map_free(struct bpf_map *map) 1528 1562 { 1529 1563 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1530 - int i; 1531 1564 1532 1565 /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. 1533 1566 * bpf_free_used_maps() is called after bpf prog is no longer executing. ··· 1551 1586 bpf_mem_alloc_destroy(&htab->ma); 1552 1587 if (htab->use_percpu_counter) 1553 1588 percpu_counter_destroy(&htab->pcount); 1554 - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 1555 - free_percpu(htab->map_locked[i]); 1556 - lockdep_unregister_key(&htab->lockdep_key); 1557 1589 bpf_map_area_free(htab); 1558 1590 } 1559 1591 ··· 1593 1631 b = __select_bucket(htab, hash); 1594 1632 head = &b->head; 1595 1633 1596 - ret = htab_lock_bucket(htab, b, hash, &bflags); 1634 + ret = htab_lock_bucket(b, &bflags); 1597 1635 if (ret) 1598 1636 return ret; 1599 1637 ··· 1630 1668 hlist_nulls_del_rcu(&l->hash_node); 1631 1669 1632 1670 out_unlock: 1633 - htab_unlock_bucket(htab, b, hash, bflags); 1671 + htab_unlock_bucket(b, bflags); 1634 1672 1635 1673 if (l) { 1636 1674 if (is_lru_map) ··· 1752 1790 head = &b->head; 1753 1791 /* do not grab the lock unless need it (bucket_cnt > 0). */ 1754 1792 if (locked) { 1755 - ret = htab_lock_bucket(htab, b, batch, &flags); 1793 + ret = htab_lock_bucket(b, &flags); 1756 1794 if (ret) { 1757 1795 rcu_read_unlock(); 1758 1796 bpf_enable_instrumentation(); ··· 1775 1813 /* Note that since bucket_cnt > 0 here, it is implicit 1776 1814 * that the locked was grabbed, so release it. 1777 1815 */ 1778 - htab_unlock_bucket(htab, b, batch, flags); 1816 + htab_unlock_bucket(b, flags); 1779 1817 rcu_read_unlock(); 1780 1818 bpf_enable_instrumentation(); 1781 1819 goto after_loop; ··· 1786 1824 /* Note that since bucket_cnt > 0 here, it is implicit 1787 1825 * that the locked was grabbed, so release it. 1788 1826 */ 1789 - htab_unlock_bucket(htab, b, batch, flags); 1827 + htab_unlock_bucket(b, flags); 1790 1828 rcu_read_unlock(); 1791 1829 bpf_enable_instrumentation(); 1792 1830 kvfree(keys); ··· 1849 1887 dst_val += value_size; 1850 1888 } 1851 1889 1852 - htab_unlock_bucket(htab, b, batch, flags); 1890 + htab_unlock_bucket(b, flags); 1853 1891 locked = false; 1854 1892 1855 1893 while (node_to_free) {
+14 -11
kernel/bpf/lpm_trie.c
··· 15 15 #include <net/ipv6.h> 16 16 #include <uapi/linux/btf.h> 17 17 #include <linux/btf_ids.h> 18 + #include <asm/rqspinlock.h> 18 19 #include <linux/bpf_mem_alloc.h> 19 20 20 21 /* Intermediate node */ ··· 37 36 size_t n_entries; 38 37 size_t max_prefixlen; 39 38 size_t data_size; 40 - raw_spinlock_t lock; 39 + rqspinlock_t lock; 41 40 }; 42 41 43 42 /* This trie implements a longest prefix match algorithm that can be used to ··· 343 342 if (!new_node) 344 343 return -ENOMEM; 345 344 346 - raw_spin_lock_irqsave(&trie->lock, irq_flags); 345 + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); 346 + if (ret) 347 + goto out_free; 347 348 348 349 new_node->prefixlen = key->prefixlen; 349 350 RCU_INIT_POINTER(new_node->child[0], NULL); ··· 359 356 */ 360 357 slot = &trie->root; 361 358 362 - while ((node = rcu_dereference_protected(*slot, 363 - lockdep_is_held(&trie->lock)))) { 359 + while ((node = rcu_dereference(*slot))) { 364 360 matchlen = longest_prefix_match(trie, node, key); 365 361 366 362 if (node->prefixlen != matchlen || ··· 444 442 rcu_assign_pointer(*slot, im_node); 445 443 446 444 out: 447 - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 448 - 445 + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); 446 + out_free: 449 447 if (ret) 450 448 bpf_mem_cache_free(&trie->ma, new_node); 451 449 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 469 467 if (key->prefixlen > trie->max_prefixlen) 470 468 return -EINVAL; 471 469 472 - raw_spin_lock_irqsave(&trie->lock, irq_flags); 470 + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); 471 + if (ret) 472 + return ret; 473 473 474 474 /* Walk the tree looking for an exact key/length match and keeping 475 475 * track of the path we traverse. We will need to know the node ··· 482 478 trim = &trie->root; 483 479 trim2 = trim; 484 480 parent = NULL; 485 - while ((node = rcu_dereference_protected( 486 - *trim, lockdep_is_held(&trie->lock)))) { 481 + while ((node = rcu_dereference(*trim))) { 487 482 matchlen = longest_prefix_match(trie, node, key); 488 483 489 484 if (node->prefixlen != matchlen || ··· 546 543 free_node = node; 547 544 548 545 out: 549 - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 546 + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); 550 547 551 548 bpf_mem_cache_free_rcu(&trie->ma, free_parent); 552 549 bpf_mem_cache_free_rcu(&trie->ma, free_node); ··· 595 592 offsetof(struct bpf_lpm_trie_key_u8, data); 596 593 trie->max_prefixlen = trie->data_size * 8; 597 594 598 - raw_spin_lock_init(&trie->lock); 595 + raw_res_spin_lock_init(&trie->lock); 599 596 600 597 /* Allocate intermediate and leaf nodes from the same allocator */ 601 598 leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+28 -91
kernel/bpf/percpu_freelist.c
··· 14 14 for_each_possible_cpu(cpu) { 15 15 struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); 16 16 17 - raw_spin_lock_init(&head->lock); 17 + raw_res_spin_lock_init(&head->lock); 18 18 head->first = NULL; 19 19 } 20 - raw_spin_lock_init(&s->extralist.lock); 21 - s->extralist.first = NULL; 22 20 return 0; 23 21 } 24 22 ··· 32 34 WRITE_ONCE(head->first, node); 33 35 } 34 36 35 - static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, 37 + static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, 36 38 struct pcpu_freelist_node *node) 37 39 { 38 - raw_spin_lock(&head->lock); 39 - pcpu_freelist_push_node(head, node); 40 - raw_spin_unlock(&head->lock); 41 - } 42 - 43 - static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, 44 - struct pcpu_freelist_node *node) 45 - { 46 - if (!raw_spin_trylock(&s->extralist.lock)) 40 + if (raw_res_spin_lock(&head->lock)) 47 41 return false; 48 - 49 - pcpu_freelist_push_node(&s->extralist, node); 50 - raw_spin_unlock(&s->extralist.lock); 42 + pcpu_freelist_push_node(head, node); 43 + raw_res_spin_unlock(&head->lock); 51 44 return true; 52 - } 53 - 54 - static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, 55 - struct pcpu_freelist_node *node) 56 - { 57 - int cpu, orig_cpu; 58 - 59 - orig_cpu = raw_smp_processor_id(); 60 - while (1) { 61 - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { 62 - struct pcpu_freelist_head *head; 63 - 64 - head = per_cpu_ptr(s->freelist, cpu); 65 - if (raw_spin_trylock(&head->lock)) { 66 - pcpu_freelist_push_node(head, node); 67 - raw_spin_unlock(&head->lock); 68 - return; 69 - } 70 - } 71 - 72 - /* cannot lock any per cpu lock, try extralist */ 73 - if (pcpu_freelist_try_push_extra(s, node)) 74 - return; 75 - } 76 45 } 77 46 78 47 void __pcpu_freelist_push(struct pcpu_freelist *s, 79 48 struct pcpu_freelist_node *node) 80 49 { 81 - if (in_nmi()) 82 - ___pcpu_freelist_push_nmi(s, node); 83 - else 84 - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); 50 + struct pcpu_freelist_head *head; 51 + int cpu; 52 + 53 + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) 54 + return; 55 + 56 + while (true) { 57 + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 58 + if (cpu == raw_smp_processor_id()) 59 + continue; 60 + head = per_cpu_ptr(s->freelist, cpu); 61 + if (raw_res_spin_lock(&head->lock)) 62 + continue; 63 + pcpu_freelist_push_node(head, node); 64 + raw_res_spin_unlock(&head->lock); 65 + return; 66 + } 67 + } 85 68 } 86 69 87 70 void pcpu_freelist_push(struct pcpu_freelist *s, ··· 99 120 100 121 static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) 101 122 { 123 + struct pcpu_freelist_node *node = NULL; 102 124 struct pcpu_freelist_head *head; 103 - struct pcpu_freelist_node *node; 104 125 int cpu; 105 126 106 127 for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 107 128 head = per_cpu_ptr(s->freelist, cpu); 108 129 if (!READ_ONCE(head->first)) 109 130 continue; 110 - raw_spin_lock(&head->lock); 131 + if (raw_res_spin_lock(&head->lock)) 132 + continue; 111 133 node = head->first; 112 134 if (node) { 113 135 WRITE_ONCE(head->first, node->next); 114 - raw_spin_unlock(&head->lock); 136 + raw_res_spin_unlock(&head->lock); 115 137 return node; 116 138 } 117 - raw_spin_unlock(&head->lock); 139 + raw_res_spin_unlock(&head->lock); 118 140 } 119 - 120 - /* per cpu lists are all empty, try extralist */ 121 - if (!READ_ONCE(s->extralist.first)) 122 - return NULL; 123 - 
raw_spin_lock(&s->extralist.lock); 124 - node = s->extralist.first; 125 - if (node) 126 - WRITE_ONCE(s->extralist.first, node->next); 127 - raw_spin_unlock(&s->extralist.lock); 128 - return node; 129 - } 130 - 131 - static struct pcpu_freelist_node * 132 - ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) 133 - { 134 - struct pcpu_freelist_head *head; 135 - struct pcpu_freelist_node *node; 136 - int cpu; 137 - 138 - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { 139 - head = per_cpu_ptr(s->freelist, cpu); 140 - if (!READ_ONCE(head->first)) 141 - continue; 142 - if (raw_spin_trylock(&head->lock)) { 143 - node = head->first; 144 - if (node) { 145 - WRITE_ONCE(head->first, node->next); 146 - raw_spin_unlock(&head->lock); 147 - return node; 148 - } 149 - raw_spin_unlock(&head->lock); 150 - } 151 - } 152 - 153 - /* cannot pop from per cpu lists, try extralist */ 154 - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) 155 - return NULL; 156 - node = s->extralist.first; 157 - if (node) 158 - WRITE_ONCE(s->extralist.first, node->next); 159 - raw_spin_unlock(&s->extralist.lock); 160 141 return node; 161 142 } 162 143 163 144 struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) 164 145 { 165 - if (in_nmi()) 166 - return ___pcpu_freelist_pop_nmi(s); 167 146 return ___pcpu_freelist_pop(s); 168 147 } 169 148
+2 -2
kernel/bpf/percpu_freelist.h
··· 5 5 #define __PERCPU_FREELIST_H__ 6 6 #include <linux/spinlock.h> 7 7 #include <linux/percpu.h> 8 + #include <asm/rqspinlock.h> 8 9 9 10 struct pcpu_freelist_head { 10 11 struct pcpu_freelist_node *first; 11 - raw_spinlock_t lock; 12 + rqspinlock_t lock; 12 13 }; 13 14 14 15 struct pcpu_freelist { 15 16 struct pcpu_freelist_head __percpu *freelist; 16 - struct pcpu_freelist_head extralist; 17 17 }; 18 18 19 19 struct pcpu_freelist_node {
+737
kernel/bpf/rqspinlock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Resilient Queued Spin Lock 4 + * 5 + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 6 + * (C) Copyright 2013-2014,2018 Red Hat, Inc. 7 + * (C) Copyright 2015 Intel Corp. 8 + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP 9 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 10 + * 11 + * Authors: Waiman Long <longman@redhat.com> 12 + * Peter Zijlstra <peterz@infradead.org> 13 + * Kumar Kartikeya Dwivedi <memxor@gmail.com> 14 + */ 15 + 16 + #include <linux/smp.h> 17 + #include <linux/bug.h> 18 + #include <linux/bpf.h> 19 + #include <linux/err.h> 20 + #include <linux/cpumask.h> 21 + #include <linux/percpu.h> 22 + #include <linux/hardirq.h> 23 + #include <linux/mutex.h> 24 + #include <linux/prefetch.h> 25 + #include <asm/byteorder.h> 26 + #ifdef CONFIG_QUEUED_SPINLOCKS 27 + #include <asm/qspinlock.h> 28 + #endif 29 + #include <trace/events/lock.h> 30 + #include <asm/rqspinlock.h> 31 + #include <linux/timekeeping.h> 32 + 33 + /* 34 + * Include queued spinlock definitions and statistics code 35 + */ 36 + #ifdef CONFIG_QUEUED_SPINLOCKS 37 + #include "../locking/qspinlock.h" 38 + #include "../locking/lock_events.h" 39 + #include "rqspinlock.h" 40 + #include "../locking/mcs_spinlock.h" 41 + #endif 42 + 43 + /* 44 + * The basic principle of a queue-based spinlock can best be understood 45 + * by studying a classic queue-based spinlock implementation called the 46 + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable 47 + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and 48 + * Scott") is available at 49 + * 50 + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 51 + * 52 + * This queued spinlock implementation is based on the MCS lock, however to 53 + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its 54 + * existing API, we must modify it somehow. 55 + * 56 + * In particular; where the traditional MCS lock consists of a tail pointer 57 + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to 58 + * unlock the next pending (next->locked), we compress both these: {tail, 59 + * next->locked} into a single u32 value. 60 + * 61 + * Since a spinlock disables recursion of its own context and there is a limit 62 + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there 63 + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now 64 + * we can encode the tail by combining the 2-bit nesting level with the cpu 65 + * number. With one byte for the lock value and 3 bytes for the tail, only a 66 + * 32-bit word is now needed. Even though we only need 1 bit for the lock, 67 + * we extend it to a full byte to achieve better performance for architectures 68 + * that support atomic byte write. 69 + * 70 + * We also change the first spinner to spin on the lock bit instead of its 71 + * node; whereby avoiding the need to carry a node from lock to unlock, and 72 + * preserving existing lock API. This also makes the unlock code simpler and 73 + * faster. 74 + * 75 + * N.B. The current implementation only supports architectures that allow 76 + * atomic operations on smaller 8-bit and 16-bit data types. 
77 + * 78 + */ 79 + 80 + struct rqspinlock_timeout { 81 + u64 timeout_end; 82 + u64 duration; 83 + u64 cur; 84 + u16 spin; 85 + }; 86 + 87 + #define RES_TIMEOUT_VAL 2 88 + 89 + DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); 90 + EXPORT_SYMBOL_GPL(rqspinlock_held_locks); 91 + 92 + static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) 93 + { 94 + if (!(atomic_read_acquire(&lock->val) & (mask))) 95 + return true; 96 + return false; 97 + } 98 + 99 + static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, 100 + struct rqspinlock_timeout *ts) 101 + { 102 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 103 + int cnt = min(RES_NR_HELD, rqh->cnt); 104 + 105 + /* 106 + * Return an error if we hold the lock we are attempting to acquire. 107 + * We'll iterate over max 32 locks; no need to do is_lock_released. 108 + */ 109 + for (int i = 0; i < cnt - 1; i++) { 110 + if (rqh->locks[i] == lock) 111 + return -EDEADLK; 112 + } 113 + return 0; 114 + } 115 + 116 + /* 117 + * This focuses on the most common case of ABBA deadlocks (or ABBA involving 118 + * more locks, which reduce to ABBA). This is not exhaustive, and we rely on 119 + * timeouts as the final line of defense. 120 + */ 121 + static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, 122 + struct rqspinlock_timeout *ts) 123 + { 124 + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); 125 + int rqh_cnt = min(RES_NR_HELD, rqh->cnt); 126 + void *remote_lock; 127 + int cpu; 128 + 129 + /* 130 + * Find the CPU holding the lock that we want to acquire. If there is a 131 + * deadlock scenario, we will read a stable set on the remote CPU and 132 + * find the target. This would be a constant time operation instead of 133 + * O(NR_CPUS) if we could determine the owning CPU from a lock value, but 134 + * that requires increasing the size of the lock word. 135 + */ 136 + for_each_possible_cpu(cpu) { 137 + struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); 138 + int real_cnt = READ_ONCE(rqh_cpu->cnt); 139 + int cnt = min(RES_NR_HELD, real_cnt); 140 + 141 + /* 142 + * Let's ensure to break out of this loop if the lock is available for 143 + * us to potentially acquire. 144 + */ 145 + if (is_lock_released(lock, mask, ts)) 146 + return 0; 147 + 148 + /* 149 + * Skip ourselves, and CPUs whose count is less than 2, as they need at 150 + * least one held lock and one acquisition attempt (reflected as top 151 + * most entry) to participate in an ABBA deadlock. 152 + * 153 + * If cnt is more than RES_NR_HELD, it means the current lock being 154 + * acquired won't appear in the table, and other locks in the table are 155 + * already held, so we can't determine ABBA. 156 + */ 157 + if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) 158 + continue; 159 + 160 + /* 161 + * Obtain the entry at the top, this corresponds to the lock the 162 + * remote CPU is attempting to acquire in a deadlock situation, 163 + * and would be one of the locks we hold on the current CPU. 164 + */ 165 + remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); 166 + /* 167 + * If it is NULL, we've raced and cannot determine a deadlock 168 + * conclusively, skip this CPU. 169 + */ 170 + if (!remote_lock) 171 + continue; 172 + /* 173 + * Find if the lock we're attempting to acquire is held by this CPU. 174 + * Don't consider the topmost entry, as that must be the latest lock 175 + * being held or acquired. 
For a deadlock, the target CPU must also 176 + * attempt to acquire a lock we hold, so for this search only 'cnt - 1' 177 + * entries are important. 178 + */ 179 + for (int i = 0; i < cnt - 1; i++) { 180 + if (READ_ONCE(rqh_cpu->locks[i]) != lock) 181 + continue; 182 + /* 183 + * We found our lock as held on the remote CPU. Is the 184 + * acquisition attempt on the remote CPU for a lock held 185 + * by us? If so, we have a deadlock situation, and need 186 + * to recover. 187 + */ 188 + for (int i = 0; i < rqh_cnt - 1; i++) { 189 + if (rqh->locks[i] == remote_lock) 190 + return -EDEADLK; 191 + } 192 + /* 193 + * Inconclusive; retry again later. 194 + */ 195 + return 0; 196 + } 197 + } 198 + return 0; 199 + } 200 + 201 + static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, 202 + struct rqspinlock_timeout *ts) 203 + { 204 + int ret; 205 + 206 + ret = check_deadlock_AA(lock, mask, ts); 207 + if (ret) 208 + return ret; 209 + ret = check_deadlock_ABBA(lock, mask, ts); 210 + if (ret) 211 + return ret; 212 + 213 + return 0; 214 + } 215 + 216 + static noinline int check_timeout(rqspinlock_t *lock, u32 mask, 217 + struct rqspinlock_timeout *ts) 218 + { 219 + u64 time = ktime_get_mono_fast_ns(); 220 + u64 prev = ts->cur; 221 + 222 + if (!ts->timeout_end) { 223 + ts->cur = time; 224 + ts->timeout_end = time + ts->duration; 225 + return 0; 226 + } 227 + 228 + if (time > ts->timeout_end) 229 + return -ETIMEDOUT; 230 + 231 + /* 232 + * A millisecond interval passed from last time? Trigger deadlock 233 + * checks. 234 + */ 235 + if (prev + NSEC_PER_MSEC < time) { 236 + ts->cur = time; 237 + return check_deadlock(lock, mask, ts); 238 + } 239 + 240 + return 0; 241 + } 242 + 243 + /* 244 + * Do not amortize with spins when res_smp_cond_load_acquire is defined, 245 + * as the macro does internal amortization for us. 246 + */ 247 + #ifndef res_smp_cond_load_acquire 248 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 249 + ({ \ 250 + if (!(ts).spin++) \ 251 + (ret) = check_timeout((lock), (mask), &(ts)); \ 252 + (ret); \ 253 + }) 254 + #else 255 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 256 + ({ (ret) = check_timeout(&(ts)); }) 257 + #endif 258 + 259 + /* 260 + * Initialize the 'spin' member. 261 + * Set spin member to 0 to trigger AA/ABBA checks immediately. 262 + */ 263 + #define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) 264 + 265 + /* 266 + * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. 267 + * Duration is defined for each spin attempt, so set it here. 268 + */ 269 + #define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) 270 + 271 + /* 272 + * Provide a test-and-set fallback for cases when queued spin lock support is 273 + * absent from the architecture. 274 + */ 275 + int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) 276 + { 277 + struct rqspinlock_timeout ts; 278 + int val, ret = 0; 279 + 280 + RES_INIT_TIMEOUT(ts); 281 + grab_held_lock_entry(lock); 282 + 283 + /* 284 + * Since the waiting loop's time is dependent on the amount of 285 + * contention, a short timeout unlike rqspinlock waiting loops 286 + * isn't enough. Choose a second as the timeout value. 
287 + */ 288 + RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); 289 + retry: 290 + val = atomic_read(&lock->val); 291 + 292 + if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { 293 + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) 294 + goto out; 295 + cpu_relax(); 296 + goto retry; 297 + } 298 + 299 + return 0; 300 + out: 301 + release_held_lock_entry(); 302 + return ret; 303 + } 304 + EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); 305 + 306 + #ifdef CONFIG_QUEUED_SPINLOCKS 307 + 308 + /* 309 + * Per-CPU queue node structures; we can never have more than 4 nested 310 + * contexts: task, softirq, hardirq, nmi. 311 + * 312 + * Exactly fits one 64-byte cacheline on a 64-bit architecture. 313 + */ 314 + static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); 315 + 316 + #ifndef res_smp_cond_load_acquire 317 + #define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) 318 + #endif 319 + 320 + #define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) 321 + 322 + /** 323 + * resilient_queued_spin_lock_slowpath - acquire the queued spinlock 324 + * @lock: Pointer to queued spinlock structure 325 + * @val: Current value of the queued spinlock 32-bit word 326 + * 327 + * Return: 328 + * * 0 - Lock was acquired successfully. 329 + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. 330 + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 331 + * 332 + * (queue tail, pending bit, lock value) 333 + * 334 + * fast : slow : unlock 335 + * : : 336 + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) 337 + * : | ^--------.------. / : 338 + * : v \ \ | : 339 + * pending : (0,1,1) +--> (0,1,0) \ | : 340 + * : | ^--' | | : 341 + * : v | | : 342 + * uncontended : (n,x,y) +--> (n,0,0) --' | : 343 + * queue : | ^--' | : 344 + * : v | : 345 + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : 346 + * queue : ^--' : 347 + */ 348 + int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) 349 + { 350 + struct mcs_spinlock *prev, *next, *node; 351 + struct rqspinlock_timeout ts; 352 + int idx, ret = 0; 353 + u32 old, tail; 354 + 355 + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); 356 + 357 + if (resilient_virt_spin_lock_enabled()) 358 + return resilient_virt_spin_lock(lock); 359 + 360 + RES_INIT_TIMEOUT(ts); 361 + 362 + /* 363 + * Wait for in-progress pending->locked hand-overs with a bounded 364 + * number of spins so that we guarantee forward progress. 365 + * 366 + * 0,1,0 -> 0,0,1 367 + */ 368 + if (val == _Q_PENDING_VAL) { 369 + int cnt = _Q_PENDING_LOOPS; 370 + val = atomic_cond_read_relaxed(&lock->val, 371 + (VAL != _Q_PENDING_VAL) || !cnt--); 372 + } 373 + 374 + /* 375 + * If we observe any contention; queue. 376 + */ 377 + if (val & ~_Q_LOCKED_MASK) 378 + goto queue; 379 + 380 + /* 381 + * trylock || pending 382 + * 383 + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock 384 + */ 385 + val = queued_fetch_set_pending_acquire(lock); 386 + 387 + /* 388 + * If we observe contention, there is a concurrent locker. 389 + * 390 + * Undo and queue; our setting of PENDING might have made the 391 + * n,0,0 -> 0,0,0 transition fail and it will now be waiting 392 + * on @next to become !NULL. 393 + */ 394 + if (unlikely(val & ~_Q_LOCKED_MASK)) { 395 + 396 + /* Undo PENDING if we set it. */ 397 + if (!(val & _Q_PENDING_MASK)) 398 + clear_pending(lock); 399 + 400 + goto queue; 401 + } 402 + 403 + /* 404 + * Grab an entry in the held locks array, to enable deadlock detection. 
405 + */ 406 + grab_held_lock_entry(lock); 407 + 408 + /* 409 + * We're pending, wait for the owner to go away. 410 + * 411 + * 0,1,1 -> *,1,0 412 + * 413 + * this wait loop must be a load-acquire such that we match the 414 + * store-release that clears the locked bit and create lock 415 + * sequentiality; this is because not all 416 + * clear_pending_set_locked() implementations imply full 417 + * barriers. 418 + */ 419 + if (val & _Q_LOCKED_MASK) { 420 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); 421 + res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); 422 + } 423 + 424 + if (ret) { 425 + /* 426 + * We waited for the locked bit to go back to 0, as the pending 427 + * waiter, but timed out. We need to clear the pending bit since 428 + * we own it. Once a stuck owner has been recovered, the lock 429 + * must be restored to a valid state, hence removing the pending 430 + * bit is necessary. 431 + * 432 + * *,1,* -> *,0,* 433 + */ 434 + clear_pending(lock); 435 + lockevent_inc(rqspinlock_lock_timeout); 436 + goto err_release_entry; 437 + } 438 + 439 + /* 440 + * take ownership and clear the pending bit. 441 + * 442 + * 0,1,0 -> 0,0,1 443 + */ 444 + clear_pending_set_locked(lock); 445 + lockevent_inc(lock_pending); 446 + return 0; 447 + 448 + /* 449 + * End of pending bit optimistic spinning and beginning of MCS 450 + * queuing. 451 + */ 452 + queue: 453 + lockevent_inc(lock_slowpath); 454 + /* 455 + * Grab deadlock detection entry for the queue path. 456 + */ 457 + grab_held_lock_entry(lock); 458 + 459 + node = this_cpu_ptr(&rqnodes[0].mcs); 460 + idx = node->count++; 461 + tail = encode_tail(smp_processor_id(), idx); 462 + 463 + trace_contention_begin(lock, LCB_F_SPIN); 464 + 465 + /* 466 + * 4 nodes are allocated based on the assumption that there will 467 + * not be nested NMIs taking spinlocks. That may not be true in 468 + * some architectures even though the chance of needing more than 469 + * 4 nodes will still be extremely unlikely. When that happens, 470 + * we fall back to spinning on the lock directly without using 471 + * any MCS node. This is not the most elegant solution, but is 472 + * simple enough. 473 + */ 474 + if (unlikely(idx >= _Q_MAX_NODES)) { 475 + lockevent_inc(lock_no_node); 476 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); 477 + while (!queued_spin_trylock(lock)) { 478 + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { 479 + lockevent_inc(rqspinlock_lock_timeout); 480 + goto err_release_node; 481 + } 482 + cpu_relax(); 483 + } 484 + goto release; 485 + } 486 + 487 + node = grab_mcs_node(node, idx); 488 + 489 + /* 490 + * Keep counts of non-zero index values: 491 + */ 492 + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); 493 + 494 + /* 495 + * Ensure that we increment the head node->count before initialising 496 + * the actual node. If the compiler is kind enough to reorder these 497 + * stores, then an IRQ could overwrite our assignments. 498 + */ 499 + barrier(); 500 + 501 + node->locked = 0; 502 + node->next = NULL; 503 + 504 + /* 505 + * We touched a (possibly) cold cacheline in the per-cpu queue node; 506 + * attempt the trylock once more in the hope someone let go while we 507 + * weren't watching. 508 + */ 509 + if (queued_spin_trylock(lock)) 510 + goto release; 511 + 512 + /* 513 + * Ensure that the initialisation of @node is complete before we 514 + * publish the updated tail via xchg_tail() and potentially link 515 + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. 
516 + */ 517 + smp_wmb(); 518 + 519 + /* 520 + * Publish the updated tail. 521 + * We have already touched the queueing cacheline; don't bother with 522 + * pending stuff. 523 + * 524 + * p,*,* -> n,*,* 525 + */ 526 + old = xchg_tail(lock, tail); 527 + next = NULL; 528 + 529 + /* 530 + * if there was a previous node; link it and wait until reaching the 531 + * head of the waitqueue. 532 + */ 533 + if (old & _Q_TAIL_MASK) { 534 + int val; 535 + 536 + prev = decode_tail(old, rqnodes); 537 + 538 + /* Link @node into the waitqueue. */ 539 + WRITE_ONCE(prev->next, node); 540 + 541 + val = arch_mcs_spin_lock_contended(&node->locked); 542 + if (val == RES_TIMEOUT_VAL) { 543 + ret = -EDEADLK; 544 + goto waitq_timeout; 545 + } 546 + 547 + /* 548 + * While waiting for the MCS lock, the next pointer may have 549 + * been set by another lock waiter. We optimistically load 550 + * the next pointer & prefetch the cacheline for writing 551 + * to reduce latency in the upcoming MCS unlock operation. 552 + */ 553 + next = READ_ONCE(node->next); 554 + if (next) 555 + prefetchw(next); 556 + } 557 + 558 + /* 559 + * we're at the head of the waitqueue, wait for the owner & pending to 560 + * go away. 561 + * 562 + * *,x,y -> *,0,0 563 + * 564 + * this wait loop must use a load-acquire such that we match the 565 + * store-release that clears the locked bit and create lock 566 + * sequentiality; this is because the set_locked() function below 567 + * does not imply a full barrier. 568 + * 569 + * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is 570 + * meant to span maximum allowed time per critical section, and we may 571 + * have both the owner of the lock and the pending bit waiter ahead of 572 + * us. 573 + */ 574 + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); 575 + val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || 576 + RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); 577 + 578 + waitq_timeout: 579 + if (ret) { 580 + /* 581 + * If the tail is still pointing to us, then we are the final waiter, 582 + * and are responsible for resetting the tail back to 0. Otherwise, if 583 + * the cmpxchg operation fails, we signal the next waiter to take exit 584 + * and try the same. For a waiter with tail node 'n': 585 + * 586 + * n,*,* -> 0,*,* 587 + * 588 + * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is 589 + * possible locked/pending bits keep changing and we see failures even 590 + * when we remain the head of wait queue. However, eventually, 591 + * pending bit owner will unset the pending bit, and new waiters 592 + * will queue behind us. This will leave the lock owner in 593 + * charge, and it will eventually either set locked bit to 0, or 594 + * leave it as 1, allowing us to make progress. 595 + * 596 + * We terminate the whole wait queue for two reasons. Firstly, 597 + * we eschew per-waiter timeouts with one applied at the head of 598 + * the wait queue. This allows everyone to break out faster 599 + * once we've seen the owner / pending waiter not responding for 600 + * the timeout duration from the head. Secondly, it avoids 601 + * complicated synchronization, because when not leaving in FIFO 602 + * order, prev's next pointer needs to be fixed up etc. 
603 + */ 604 + if (!try_cmpxchg_tail(lock, tail, 0)) { 605 + next = smp_cond_load_relaxed(&node->next, VAL); 606 + WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); 607 + } 608 + lockevent_inc(rqspinlock_lock_timeout); 609 + goto err_release_node; 610 + } 611 + 612 + /* 613 + * claim the lock: 614 + * 615 + * n,0,0 -> 0,0,1 : lock, uncontended 616 + * *,*,0 -> *,*,1 : lock, contended 617 + * 618 + * If the queue head is the only one in the queue (lock value == tail) 619 + * and nobody is pending, clear the tail code and grab the lock. 620 + * Otherwise, we only need to grab the lock. 621 + */ 622 + 623 + /* 624 + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the 625 + * above wait condition, therefore any concurrent setting of 626 + * PENDING will make the uncontended transition fail. 627 + */ 628 + if ((val & _Q_TAIL_MASK) == tail) { 629 + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) 630 + goto release; /* No contention */ 631 + } 632 + 633 + /* 634 + * Either somebody is queued behind us or _Q_PENDING_VAL got set 635 + * which will then detect the remaining tail and queue behind us 636 + * ensuring we'll see a @next. 637 + */ 638 + set_locked(lock); 639 + 640 + /* 641 + * contended path; wait for next if not observed yet, release. 642 + */ 643 + if (!next) 644 + next = smp_cond_load_relaxed(&node->next, (VAL)); 645 + 646 + arch_mcs_spin_unlock_contended(&next->locked); 647 + 648 + release: 649 + trace_contention_end(lock, 0); 650 + 651 + /* 652 + * release the node 653 + */ 654 + __this_cpu_dec(rqnodes[0].mcs.count); 655 + return ret; 656 + err_release_node: 657 + trace_contention_end(lock, ret); 658 + __this_cpu_dec(rqnodes[0].mcs.count); 659 + err_release_entry: 660 + release_held_lock_entry(); 661 + return ret; 662 + } 663 + EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); 664 + 665 + #endif /* CONFIG_QUEUED_SPINLOCKS */ 666 + 667 + __bpf_kfunc_start_defs(); 668 + 669 + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) 670 + { 671 + int ret; 672 + 673 + BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); 674 + BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); 675 + 676 + preempt_disable(); 677 + ret = res_spin_lock((rqspinlock_t *)lock); 678 + if (unlikely(ret)) { 679 + preempt_enable(); 680 + return ret; 681 + } 682 + return 0; 683 + } 684 + 685 + __bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) 686 + { 687 + res_spin_unlock((rqspinlock_t *)lock); 688 + preempt_enable(); 689 + } 690 + 691 + __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 692 + { 693 + u64 *ptr = (u64 *)flags__irq_flag; 694 + unsigned long flags; 695 + int ret; 696 + 697 + preempt_disable(); 698 + local_irq_save(flags); 699 + ret = res_spin_lock((rqspinlock_t *)lock); 700 + if (unlikely(ret)) { 701 + local_irq_restore(flags); 702 + preempt_enable(); 703 + return ret; 704 + } 705 + *ptr = flags; 706 + return 0; 707 + } 708 + 709 + __bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) 710 + { 711 + u64 *ptr = (u64 *)flags__irq_flag; 712 + unsigned long flags = *ptr; 713 + 714 + res_spin_unlock((rqspinlock_t *)lock); 715 + local_irq_restore(flags); 716 + preempt_enable(); 717 + } 718 + 719 + __bpf_kfunc_end_defs(); 720 + 721 + BTF_KFUNCS_START(rqspinlock_kfunc_ids) 722 + BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) 723 + BTF_ID_FLAGS(func, bpf_res_spin_unlock) 724 + 
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) 725 + BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore) 726 + BTF_KFUNCS_END(rqspinlock_kfunc_ids) 727 + 728 + static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { 729 + .owner = THIS_MODULE, 730 + .set = &rqspinlock_kfunc_ids, 731 + }; 732 + 733 + static __init int rqspinlock_register_kfuncs(void) 734 + { 735 + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); 736 + } 737 + late_initcall(rqspinlock_register_kfuncs);
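For reference, a kernel-side caller is expected to follow the same pattern as the kfunc wrappers above: the lock call can fail with -EDEADLK or -ETIMEDOUT, so the error must be propagated instead of assuming the lock was taken. A minimal sketch, with the bucket structure and function purely illustrative and not part of this series:

        #include <asm/rqspinlock.h>

        struct example_bucket {         /* illustrative container */
                rqspinlock_t lock;
        };

        static int example_update(struct example_bucket *b)
        {
                int ret;

                preempt_disable();
                ret = res_spin_lock(&b->lock);
                if (ret) {
                        /* -EDEADLK or -ETIMEDOUT: back off instead of spinning forever */
                        preempt_enable();
                        return ret;
                }
                /* ... critical section ... */
                res_spin_unlock(&b->lock);
                preempt_enable();
                return 0;
        }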
+48
kernel/bpf/rqspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Resilient Queued Spin Lock defines 4 + * 5 + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 6 + * 7 + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> 8 + */ 9 + #ifndef __LINUX_RQSPINLOCK_H 10 + #define __LINUX_RQSPINLOCK_H 11 + 12 + #include "../locking/qspinlock.h" 13 + 14 + /* 15 + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value 16 + * @lock: Pointer to queued spinlock structure 17 + * @tail: The tail to compare against 18 + * @new_tail: The new queue tail code word 19 + * Return: Bool to indicate whether the cmpxchg operation succeeded 20 + * 21 + * This is used by the head of the wait queue to clean up the queue. 22 + * Provides relaxed ordering, since observers only rely on initialized 23 + * state of the node which was made visible through the xchg_tail operation, 24 + * i.e. through the smp_wmb preceding xchg_tail. 25 + * 26 + * We avoid using 16-bit cmpxchg, which is not available on all architectures. 27 + */ 28 + static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) 29 + { 30 + u32 old, new; 31 + 32 + old = atomic_read(&lock->val); 33 + do { 34 + /* 35 + * Is the tail part we compare to already stale? Fail. 36 + */ 37 + if ((old & _Q_TAIL_MASK) != tail) 38 + return false; 39 + /* 40 + * Encode latest locked/pending state for new tail. 41 + */ 42 + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; 43 + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 44 + 45 + return true; 46 + } 47 + 48 + #endif /* __LINUX_RQSPINLOCK_H */
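The queue head calls this helper when it gives up waiting: if the tail still points to it, the tail is reset to zero, otherwise the next waiter is signalled to bail out, exactly as the slow path above does:

        if (!try_cmpxchg_tail(lock, tail, 0)) {
                next = smp_cond_load_relaxed(&node->next, VAL);
                WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
        }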
+5 -1
kernel/bpf/syscall.c
··· 648 648 case BPF_RB_ROOT: 649 649 case BPF_RB_NODE: 650 650 case BPF_SPIN_LOCK: 651 + case BPF_RES_SPIN_LOCK: 651 652 case BPF_TIMER: 652 653 case BPF_REFCOUNT: 653 654 case BPF_WORKQUEUE: ··· 701 700 case BPF_RB_ROOT: 702 701 case BPF_RB_NODE: 703 702 case BPF_SPIN_LOCK: 703 + case BPF_RES_SPIN_LOCK: 704 704 case BPF_TIMER: 705 705 case BPF_REFCOUNT: 706 706 case BPF_WORKQUEUE: ··· 779 777 780 778 switch (fields[i].type) { 781 779 case BPF_SPIN_LOCK: 780 + case BPF_RES_SPIN_LOCK: 782 781 break; 783 782 case BPF_TIMER: 784 783 bpf_timer_cancel_and_free(field_ptr); ··· 1215 1212 return -EINVAL; 1216 1213 1217 1214 map->record = btf_parse_fields(btf, value_type, 1218 - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1215 + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1219 1216 BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, 1220 1217 map->value_size); 1221 1218 if (!IS_ERR_OR_NULL(map->record)) { ··· 1234 1231 case 0: 1235 1232 continue; 1236 1233 case BPF_SPIN_LOCK: 1234 + case BPF_RES_SPIN_LOCK: 1237 1235 if (map->map_type != BPF_MAP_TYPE_HASH && 1238 1236 map->map_type != BPF_MAP_TYPE_ARRAY && 1239 1237 map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
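With BPF_RES_SPIN_LOCK now handled alongside BPF_SPIN_LOCK, a map value can embed the new lock type and have btf_parse_fields() record it. A minimal BPF-side declaration, mirroring the selftests further down (the names are illustrative):

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        struct arr_elem {
                struct bpf_res_spin_lock lock;  /* recorded as a BPF_RES_SPIN_LOCK field */
        };

        struct {
                __uint(type, BPF_MAP_TYPE_ARRAY);
                __uint(max_entries, 1);
                __type(key, int);
                __type(value, struct arr_elem);
        } arrmap SEC(".maps");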
+206 -42
kernel/bpf/verifier.c
··· 456 456 457 457 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) 458 458 { 459 - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); 459 + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); 460 460 } 461 461 462 462 static bool type_is_rdonly_mem(u32 type) ··· 1155 1155 1156 1156 static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, 1157 1157 struct bpf_kfunc_call_arg_meta *meta, 1158 - struct bpf_reg_state *reg, int insn_idx) 1158 + struct bpf_reg_state *reg, int insn_idx, 1159 + int kfunc_class) 1159 1160 { 1160 1161 struct bpf_func_state *state = func(env, reg); 1161 1162 struct bpf_stack_state *slot; ··· 1178 1177 st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ 1179 1178 st->live |= REG_LIVE_WRITTEN; 1180 1179 st->ref_obj_id = id; 1180 + st->irq.kfunc_class = kfunc_class; 1181 1181 1182 1182 for (i = 0; i < BPF_REG_SIZE; i++) 1183 1183 slot->slot_type[i] = STACK_IRQ_FLAG; ··· 1187 1185 return 0; 1188 1186 } 1189 1187 1190 - static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) 1188 + static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, 1189 + int kfunc_class) 1191 1190 { 1192 1191 struct bpf_func_state *state = func(env, reg); 1193 1192 struct bpf_stack_state *slot; ··· 1201 1198 1202 1199 slot = &state->stack[spi]; 1203 1200 st = &slot->spilled_ptr; 1201 + 1202 + if (st->irq.kfunc_class != kfunc_class) { 1203 + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; 1204 + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; 1205 + 1206 + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", 1207 + flag_kfunc, used_kfunc); 1208 + return -EINVAL; 1209 + } 1204 1210 1205 1211 err = release_irq_state(env->cur_state, st->ref_obj_id); 1206 1212 WARN_ON_ONCE(err && err != -EACCES); ··· 1428 1416 dst->active_preempt_locks = src->active_preempt_locks; 1429 1417 dst->active_rcu_lock = src->active_rcu_lock; 1430 1418 dst->active_irq_id = src->active_irq_id; 1419 + dst->active_lock_id = src->active_lock_id; 1420 + dst->active_lock_ptr = src->active_lock_ptr; 1431 1421 return 0; 1432 1422 } 1433 1423 ··· 1529 1515 s->ptr = ptr; 1530 1516 1531 1517 state->active_locks++; 1518 + state->active_lock_id = id; 1519 + state->active_lock_ptr = ptr; 1532 1520 return 0; 1533 1521 } 1534 1522 ··· 1581 1565 1582 1566 static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) 1583 1567 { 1568 + void *prev_ptr = NULL; 1569 + u32 prev_id = 0; 1584 1570 int i; 1585 1571 1586 1572 for (i = 0; i < state->acquired_refs; i++) { 1587 - if (state->refs[i].type != type) 1588 - continue; 1589 - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { 1573 + if (state->refs[i].type == type && state->refs[i].id == id && 1574 + state->refs[i].ptr == ptr) { 1590 1575 release_reference_state(state, i); 1591 1576 state->active_locks--; 1577 + /* Reassign active lock (id, ptr). 
*/ 1578 + state->active_lock_id = prev_id; 1579 + state->active_lock_ptr = prev_ptr; 1592 1580 return 0; 1581 + } 1582 + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { 1583 + prev_id = state->refs[i].id; 1584 + prev_ptr = state->refs[i].ptr; 1593 1585 } 1594 1586 } 1595 1587 return -EINVAL; ··· 1633 1609 for (i = 0; i < state->acquired_refs; i++) { 1634 1610 struct bpf_reference_state *s = &state->refs[i]; 1635 1611 1636 - if (s->type != type) 1612 + if (!(s->type & type)) 1637 1613 continue; 1638 1614 1639 1615 if (s->id == id && s->ptr == ptr) ··· 8240 8216 return err; 8241 8217 } 8242 8218 8219 + enum { 8220 + PROCESS_SPIN_LOCK = (1 << 0), 8221 + PROCESS_RES_LOCK = (1 << 1), 8222 + PROCESS_LOCK_IRQ = (1 << 2), 8223 + }; 8224 + 8243 8225 /* Implementation details: 8244 8226 * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. 8245 8227 * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. ··· 8268 8238 * env->cur_state->active_locks remembers which map value element or allocated 8269 8239 * object got locked and clears it after bpf_spin_unlock. 8270 8240 */ 8271 - static int process_spin_lock(struct bpf_verifier_env *env, int regno, 8272 - bool is_lock) 8241 + static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) 8273 8242 { 8243 + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; 8244 + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; 8274 8245 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8275 8246 struct bpf_verifier_state *cur = env->cur_state; 8276 8247 bool is_const = tnum_is_const(reg->var_off); 8248 + bool is_irq = flags & PROCESS_LOCK_IRQ; 8277 8249 u64 val = reg->var_off.value; 8278 8250 struct bpf_map *map = NULL; 8279 8251 struct btf *btf = NULL; 8280 8252 struct btf_record *rec; 8253 + u32 spin_lock_off; 8281 8254 int err; 8282 8255 8283 8256 if (!is_const) { 8284 8257 verbose(env, 8285 - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", 8286 - regno); 8258 + "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", 8259 + regno, lock_str); 8287 8260 return -EINVAL; 8288 8261 } 8289 8262 if (reg->type == PTR_TO_MAP_VALUE) { 8290 8263 map = reg->map_ptr; 8291 8264 if (!map->btf) { 8292 8265 verbose(env, 8293 - "map '%s' has to have BTF in order to use bpf_spin_lock\n", 8294 - map->name); 8266 + "map '%s' has to have BTF in order to use %s_lock\n", 8267 + map->name, lock_str); 8295 8268 return -EINVAL; 8296 8269 } 8297 8270 } else { ··· 8302 8269 } 8303 8270 8304 8271 rec = reg_btf_record(reg); 8305 - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { 8306 - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", 8307 - map ? map->name : "kptr"); 8272 + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { 8273 + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", 8274 + map ? map->name : "kptr", lock_str); 8308 8275 return -EINVAL; 8309 8276 } 8310 - if (rec->spin_lock_off != val + reg->off) { 8311 - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", 8312 - val + reg->off, rec->spin_lock_off); 8277 + spin_lock_off = is_res_lock ? 
rec->res_spin_lock_off : rec->spin_lock_off; 8278 + if (spin_lock_off != val + reg->off) { 8279 + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", 8280 + val + reg->off, lock_str, spin_lock_off); 8313 8281 return -EINVAL; 8314 8282 } 8315 8283 if (is_lock) { 8316 8284 void *ptr; 8285 + int type; 8317 8286 8318 8287 if (map) 8319 8288 ptr = map; 8320 8289 else 8321 8290 ptr = btf; 8322 8291 8323 - if (cur->active_locks) { 8324 - verbose(env, 8325 - "Locking two bpf_spin_locks are not allowed\n"); 8326 - return -EINVAL; 8292 + if (!is_res_lock && cur->active_locks) { 8293 + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { 8294 + verbose(env, 8295 + "Locking two bpf_spin_locks are not allowed\n"); 8296 + return -EINVAL; 8297 + } 8298 + } else if (is_res_lock && cur->active_locks) { 8299 + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { 8300 + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); 8301 + return -EINVAL; 8302 + } 8327 8303 } 8328 - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); 8304 + 8305 + if (is_res_lock && is_irq) 8306 + type = REF_TYPE_RES_LOCK_IRQ; 8307 + else if (is_res_lock) 8308 + type = REF_TYPE_RES_LOCK; 8309 + else 8310 + type = REF_TYPE_LOCK; 8311 + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); 8329 8312 if (err < 0) { 8330 8313 verbose(env, "Failed to acquire lock state\n"); 8331 8314 return err; 8332 8315 } 8333 8316 } else { 8334 8317 void *ptr; 8318 + int type; 8335 8319 8336 8320 if (map) 8337 8321 ptr = map; ··· 8356 8306 ptr = btf; 8357 8307 8358 8308 if (!cur->active_locks) { 8359 - verbose(env, "bpf_spin_unlock without taking a lock\n"); 8309 + verbose(env, "%s_unlock without taking a lock\n", lock_str); 8360 8310 return -EINVAL; 8361 8311 } 8362 8312 8363 - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { 8364 - verbose(env, "bpf_spin_unlock of different lock\n"); 8313 + if (is_res_lock && is_irq) 8314 + type = REF_TYPE_RES_LOCK_IRQ; 8315 + else if (is_res_lock) 8316 + type = REF_TYPE_RES_LOCK; 8317 + else 8318 + type = REF_TYPE_LOCK; 8319 + if (!find_lock_state(cur, type, reg->id, ptr)) { 8320 + verbose(env, "%s_unlock of different lock\n", lock_str); 8321 + return -EINVAL; 8322 + } 8323 + if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) { 8324 + verbose(env, "%s_unlock cannot be out of order\n", lock_str); 8325 + return -EINVAL; 8326 + } 8327 + if (release_lock_state(cur, type, reg->id, ptr)) { 8328 + verbose(env, "%s_unlock of different lock\n", lock_str); 8365 8329 return -EINVAL; 8366 8330 } 8367 8331 ··· 9701 9637 return -EACCES; 9702 9638 } 9703 9639 if (meta->func_id == BPF_FUNC_spin_lock) { 9704 - err = process_spin_lock(env, regno, true); 9640 + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); 9705 9641 if (err) 9706 9642 return err; 9707 9643 } else if (meta->func_id == BPF_FUNC_spin_unlock) { 9708 - err = process_spin_lock(env, regno, false); 9644 + err = process_spin_lock(env, regno, 0); 9709 9645 if (err) 9710 9646 return err; 9711 9647 } else { ··· 11587 11523 regs[BPF_REG_0].map_uid = meta.map_uid; 11588 11524 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; 11589 11525 if (!type_may_be_null(ret_flag) && 11590 - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { 11526 + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { 11591 11527 regs[BPF_REG_0].id = ++env->id_gen; 11592 11528 } 11593 11529 break; 
··· 11759 11695 /* mark_btf_func_reg_size() is used when the reg size is determined by 11760 11696 * the BTF func_proto's return value size and argument. 11761 11697 */ 11762 - static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 11763 - size_t reg_size) 11698 + static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, 11699 + u32 regno, size_t reg_size) 11764 11700 { 11765 - struct bpf_reg_state *reg = &cur_regs(env)[regno]; 11701 + struct bpf_reg_state *reg = &regs[regno]; 11766 11702 11767 11703 if (regno == BPF_REG_0) { 11768 11704 /* Function return value */ ··· 11778 11714 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); 11779 11715 } 11780 11716 } 11717 + } 11718 + 11719 + static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 11720 + size_t reg_size) 11721 + { 11722 + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); 11781 11723 } 11782 11724 11783 11725 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) ··· 11923 11853 KF_ARG_RB_ROOT_ID, 11924 11854 KF_ARG_RB_NODE_ID, 11925 11855 KF_ARG_WORKQUEUE_ID, 11856 + KF_ARG_RES_SPIN_LOCK_ID, 11926 11857 }; 11927 11858 11928 11859 BTF_ID_LIST(kf_arg_btf_ids) ··· 11933 11862 BTF_ID(struct, bpf_rb_root) 11934 11863 BTF_ID(struct, bpf_rb_node) 11935 11864 BTF_ID(struct, bpf_wq) 11865 + BTF_ID(struct, bpf_res_spin_lock) 11936 11866 11937 11867 static bool __is_kfunc_ptr_arg_type(const struct btf *btf, 11938 11868 const struct btf_param *arg, int type) ··· 11980 11908 static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) 11981 11909 { 11982 11910 return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); 11911 + } 11912 + 11913 + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) 11914 + { 11915 + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); 11983 11916 } 11984 11917 11985 11918 static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, ··· 12058 11981 KF_ARG_PTR_TO_MAP, 12059 11982 KF_ARG_PTR_TO_WORKQUEUE, 12060 11983 KF_ARG_PTR_TO_IRQ_FLAG, 11984 + KF_ARG_PTR_TO_RES_SPIN_LOCK, 12061 11985 }; 12062 11986 12063 11987 enum special_kfunc_type { ··· 12097 12019 KF_bpf_iter_num_destroy, 12098 12020 KF_bpf_set_dentry_xattr, 12099 12021 KF_bpf_remove_dentry_xattr, 12022 + KF_bpf_res_spin_lock, 12023 + KF_bpf_res_spin_unlock, 12024 + KF_bpf_res_spin_lock_irqsave, 12025 + KF_bpf_res_spin_unlock_irqrestore, 12100 12026 }; 12101 12027 12102 12028 BTF_SET_START(special_kfunc_set) ··· 12190 12108 BTF_ID_UNUSED 12191 12109 BTF_ID_UNUSED 12192 12110 #endif 12111 + BTF_ID(func, bpf_res_spin_lock) 12112 + BTF_ID(func, bpf_res_spin_unlock) 12113 + BTF_ID(func, bpf_res_spin_lock_irqsave) 12114 + BTF_ID(func, bpf_res_spin_unlock_irqrestore) 12193 12115 12194 12116 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) 12195 12117 { ··· 12286 12200 12287 12201 if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) 12288 12202 return KF_ARG_PTR_TO_IRQ_FLAG; 12203 + 12204 + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) 12205 + return KF_ARG_PTR_TO_RES_SPIN_LOCK; 12289 12206 12290 12207 if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { 12291 12208 if (!btf_type_is_struct(ref_t)) { ··· 12397 12308 struct bpf_kfunc_call_arg_meta *meta) 12398 12309 { 12399 12310 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 12311 + int err, kfunc_class = IRQ_NATIVE_KFUNC; 
12400 12312 bool irq_save; 12401 - int err; 12402 12313 12403 - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { 12314 + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || 12315 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { 12404 12316 irq_save = true; 12405 - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { 12317 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) 12318 + kfunc_class = IRQ_LOCK_KFUNC; 12319 + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || 12320 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { 12406 12321 irq_save = false; 12322 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) 12323 + kfunc_class = IRQ_LOCK_KFUNC; 12407 12324 } else { 12408 12325 verbose(env, "verifier internal error: unknown irq flags kfunc\n"); 12409 12326 return -EFAULT; ··· 12425 12330 if (err) 12426 12331 return err; 12427 12332 12428 - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); 12333 + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); 12429 12334 if (err) 12430 12335 return err; 12431 12336 } else { ··· 12439 12344 if (err) 12440 12345 return err; 12441 12346 12442 - err = unmark_stack_slot_irq_flag(env, reg); 12347 + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); 12443 12348 if (err) 12444 12349 return err; 12445 12350 } ··· 12566 12471 12567 12472 if (!env->cur_state->active_locks) 12568 12473 return -EINVAL; 12569 - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); 12474 + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); 12570 12475 if (!s) { 12571 12476 verbose(env, "held lock and object are not in the same allocation\n"); 12572 12477 return -EINVAL; ··· 12602 12507 btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; 12603 12508 } 12604 12509 12510 + static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) 12511 + { 12512 + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || 12513 + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || 12514 + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || 12515 + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; 12516 + } 12517 + 12605 12518 static bool kfunc_spin_allowed(u32 btf_id) 12606 12519 { 12607 - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); 12520 + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || 12521 + is_bpf_res_spin_lock_kfunc(btf_id); 12608 12522 } 12609 12523 12610 12524 static bool is_sync_callback_calling_kfunc(u32 btf_id) ··· 13045 12941 case KF_ARG_PTR_TO_CONST_STR: 13046 12942 case KF_ARG_PTR_TO_WORKQUEUE: 13047 12943 case KF_ARG_PTR_TO_IRQ_FLAG: 12944 + case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13048 12945 break; 13049 12946 default: 13050 12947 WARN_ON_ONCE(1); ··· 13344 13239 if (ret < 0) 13345 13240 return ret; 13346 13241 break; 13242 + case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13243 + { 13244 + int flags = PROCESS_RES_LOCK; 13245 + 13246 + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { 13247 + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); 13248 + return -EINVAL; 13249 + } 13250 + 13251 + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) 13252 + return -EFAULT; 13253 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || 13254 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) 13255 + flags |= 
PROCESS_SPIN_LOCK; 13256 + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || 13257 + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) 13258 + flags |= PROCESS_LOCK_IRQ; 13259 + ret = process_spin_lock(env, regno, flags); 13260 + if (ret < 0) 13261 + return ret; 13262 + break; 13263 + } 13347 13264 } 13348 13265 } 13349 13266 ··· 13450 13323 insn_aux = &env->insn_aux_data[insn_idx]; 13451 13324 13452 13325 insn_aux->is_iter_next = is_iter_next_kfunc(&meta); 13326 + 13327 + if (!insn->off && 13328 + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || 13329 + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { 13330 + struct bpf_verifier_state *branch; 13331 + struct bpf_reg_state *regs; 13332 + 13333 + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); 13334 + if (!branch) { 13335 + verbose(env, "failed to push state for failed lock acquisition\n"); 13336 + return -ENOMEM; 13337 + } 13338 + 13339 + regs = branch->frame[branch->curframe]->regs; 13340 + 13341 + /* Clear r0-r5 registers in forked state */ 13342 + for (i = 0; i < CALLER_SAVED_REGS; i++) 13343 + mark_reg_not_init(env, regs, caller_saved[i]); 13344 + 13345 + mark_reg_unknown(env, regs, BPF_REG_0); 13346 + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); 13347 + if (err) { 13348 + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); 13349 + return err; 13350 + } 13351 + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); 13352 + } 13453 13353 13454 13354 if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { 13455 13355 verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); ··· 13648 13494 13649 13495 if (btf_type_is_scalar(t)) { 13650 13496 mark_reg_unknown(env, regs, BPF_REG_0); 13497 + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || 13498 + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) 13499 + __mark_reg_const_zero(env, &regs[BPF_REG_0]); 13651 13500 mark_btf_func_reg_size(env, BPF_REG_0, t->size); 13652 13501 } else if (btf_type_is_ptr(t)) { 13653 13502 ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); ··· 18586 18429 case STACK_IRQ_FLAG: 18587 18430 old_reg = &old->stack[spi].spilled_ptr; 18588 18431 cur_reg = &cur->stack[spi].spilled_ptr; 18589 - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) 18432 + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || 18433 + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) 18590 18434 return false; 18591 18435 break; 18592 18436 case STACK_MISC: ··· 18622 18464 if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) 18623 18465 return false; 18624 18466 18467 + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || 18468 + old->active_lock_ptr != cur->active_lock_ptr) 18469 + return false; 18470 + 18625 18471 for (i = 0; i < old->acquired_refs; i++) { 18626 18472 if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || 18627 18473 old->refs[i].type != cur->refs[i].type) ··· 18635 18473 case REF_TYPE_IRQ: 18636 18474 break; 18637 18475 case REF_TYPE_LOCK: 18476 + case REF_TYPE_RES_LOCK: 18477 + case REF_TYPE_RES_LOCK_IRQ: 18638 18478 if (old->refs[i].ptr != cur->refs[i].ptr) 18639 18479 return false; 18640 18480 break; ··· 19922 19758 } 19923 19759 } 19924 19760 19925 - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { 19761 + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | 
BPF_RES_SPIN_LOCK)) { 19926 19762 if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { 19927 19763 verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); 19928 19764 return -EINVAL;
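At each res_spin_lock kfunc call site the verifier pushes a forked state that resumes after the call with a negative errno in r0 and no lock held, while the fall-through state continues with r0 == 0 and the lock acquired, so programs must branch on the return value before entering the critical section. A sketch of the accepted pattern, reusing the map declaration above (it mirrors the selftests below):

        SEC("tc")
        int use_res_lock(struct __sk_buff *ctx)
        {
                struct arr_elem *elem;

                elem = bpf_map_lookup_elem(&arrmap, &(int){0});
                if (!elem)
                        return 0;
                if (bpf_res_spin_lock(&elem->lock))     /* may fail: -EDEADLK or -ETIMEDOUT */
                        return 0;                       /* failure path: lock is not held */
                /* success path: lock is held and must be released before exit */
                bpf_res_spin_unlock(&elem->lock);
                return 0;
        }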
+5
kernel/locking/lock_events_list.h
··· 50 50 #endif /* CONFIG_QUEUED_SPINLOCKS */ 51 51 52 52 /* 53 + * Locking events for Resilient Queued Spin Lock 54 + */ 55 + LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ 56 + 57 + /* 53 58 * Locking events for rwsem 54 59 */ 55 60 LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+57
kernel/locking/locktorture.c
··· 362 362 .name = "raw_spin_lock_irq" 363 363 }; 364 364 365 + #ifdef CONFIG_BPF_SYSCALL 366 + 367 + #include <asm/rqspinlock.h> 368 + static rqspinlock_t rqspinlock; 369 + 370 + static int torture_raw_res_spin_write_lock(int tid __maybe_unused) 371 + { 372 + raw_res_spin_lock(&rqspinlock); 373 + return 0; 374 + } 375 + 376 + static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) 377 + { 378 + raw_res_spin_unlock(&rqspinlock); 379 + } 380 + 381 + static struct lock_torture_ops raw_res_spin_lock_ops = { 382 + .writelock = torture_raw_res_spin_write_lock, 383 + .write_delay = torture_spin_lock_write_delay, 384 + .task_boost = torture_rt_boost, 385 + .writeunlock = torture_raw_res_spin_write_unlock, 386 + .readlock = NULL, 387 + .read_delay = NULL, 388 + .readunlock = NULL, 389 + .name = "raw_res_spin_lock" 390 + }; 391 + 392 + static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) 393 + { 394 + unsigned long flags; 395 + 396 + raw_res_spin_lock_irqsave(&rqspinlock, flags); 397 + cxt.cur_ops->flags = flags; 398 + return 0; 399 + } 400 + 401 + static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) 402 + { 403 + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); 404 + } 405 + 406 + static struct lock_torture_ops raw_res_spin_lock_irq_ops = { 407 + .writelock = torture_raw_res_spin_write_lock_irq, 408 + .write_delay = torture_spin_lock_write_delay, 409 + .task_boost = torture_rt_boost, 410 + .writeunlock = torture_raw_res_spin_write_unlock_irq, 411 + .readlock = NULL, 412 + .read_delay = NULL, 413 + .readunlock = NULL, 414 + .name = "raw_res_spin_lock_irq" 415 + }; 416 + 417 + #endif 418 + 365 419 static DEFINE_RWLOCK(torture_rwlock); 366 420 367 421 static int torture_rwlock_write_lock(int tid __maybe_unused) ··· 1222 1168 &lock_busted_ops, 1223 1169 &spin_lock_ops, &spin_lock_irq_ops, 1224 1170 &raw_spin_lock_ops, &raw_spin_lock_irq_ops, 1171 + #ifdef CONFIG_BPF_SYSCALL 1172 + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, 1173 + #endif 1225 1174 &rw_lock_ops, &rw_lock_irq_ops, 1226 1175 &mutex_lock_ops, 1227 1176 &ww_mutex_lock_ops,
+1 -9
kernel/locking/mcs_spinlock.h
··· 15 15 16 16 #include <asm/mcs_spinlock.h> 17 17 18 - struct mcs_spinlock { 19 - struct mcs_spinlock *next; 20 - int locked; /* 1 if lock acquired */ 21 - int count; /* nesting count, see qspinlock.c */ 22 - }; 23 - 24 18 #ifndef arch_mcs_spin_lock_contended 25 19 /* 26 20 * Using smp_cond_load_acquire() provides the acquire semantics ··· 24 30 * spinning, and smp_cond_load_acquire() provides that behavior. 25 31 */ 26 32 #define arch_mcs_spin_lock_contended(l) \ 27 - do { \ 28 - smp_cond_load_acquire(l, VAL); \ 29 - } while (0) 33 + smp_cond_load_acquire(l, VAL) 30 34 #endif 31 35 32 36 #ifndef arch_mcs_spin_unlock_contended
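arch_mcs_spin_lock_contended() becomes an expression so that it yields the value which terminated the wait; the rqspinlock slow path relies on this to tell a normal hand-off apart from a timeout signalled by the waiter ahead of it, as in the check it performs:

        val = arch_mcs_spin_lock_contended(&node->locked);
        if (val == RES_TIMEOUT_VAL) {   /* previous waiter gave up and told us to exit */
                ret = -EDEADLK;
                goto waitq_timeout;
        }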
+5 -188
kernel/locking/qspinlock.c
··· 25 25 #include <trace/events/lock.h> 26 26 27 27 /* 28 - * Include queued spinlock statistics code 28 + * Include queued spinlock definitions and statistics code 29 29 */ 30 + #include "qspinlock.h" 30 31 #include "qspinlock_stat.h" 31 32 32 33 /* ··· 68 67 */ 69 68 70 69 #include "mcs_spinlock.h" 71 - #define MAX_NODES 4 72 - 73 - /* 74 - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in 75 - * size and four of them will fit nicely in one 64-byte cacheline. For 76 - * pvqspinlock, however, we need more space for extra data. To accommodate 77 - * that, we insert two more long words to pad it up to 32 bytes. IOW, only 78 - * two of them can fit in a cacheline in this case. That is OK as it is rare 79 - * to have more than 2 levels of slowpath nesting in actual use. We don't 80 - * want to penalize pvqspinlocks to optimize for a rare case in native 81 - * qspinlocks. 82 - */ 83 - struct qnode { 84 - struct mcs_spinlock mcs; 85 - #ifdef CONFIG_PARAVIRT_SPINLOCKS 86 - long reserved[2]; 87 - #endif 88 - }; 89 - 90 - /* 91 - * The pending bit spinning loop count. 92 - * This heuristic is used to limit the number of lockword accesses 93 - * made by atomic_cond_read_relaxed when waiting for the lock to 94 - * transition out of the "== _Q_PENDING_VAL" state. We don't spin 95 - * indefinitely because there's no guarantee that we'll make forward 96 - * progress. 97 - */ 98 - #ifndef _Q_PENDING_LOOPS 99 - #define _Q_PENDING_LOOPS 1 100 - #endif 101 70 102 71 /* 103 72 * Per-CPU queue node structures; we can never have more than 4 nested ··· 77 106 * 78 107 * PV doubles the storage and uses the second cacheline for PV state. 79 108 */ 80 - static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); 81 - 82 - /* 83 - * We must be able to distinguish between no-tail and the tail at 0:0, 84 - * therefore increment the cpu number by one. 85 - */ 86 - 87 - static inline __pure u32 encode_tail(int cpu, int idx) 88 - { 89 - u32 tail; 90 - 91 - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 92 - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 93 - 94 - return tail; 95 - } 96 - 97 - static inline __pure struct mcs_spinlock *decode_tail(u32 tail) 98 - { 99 - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 100 - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 101 - 102 - return per_cpu_ptr(&qnodes[idx].mcs, cpu); 103 - } 104 - 105 - static inline __pure 106 - struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) 107 - { 108 - return &((struct qnode *)base + idx)->mcs; 109 - } 110 - 111 - #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 112 - 113 - #if _Q_PENDING_BITS == 8 114 - /** 115 - * clear_pending - clear the pending bit. 116 - * @lock: Pointer to queued spinlock structure 117 - * 118 - * *,1,* -> *,0,* 119 - */ 120 - static __always_inline void clear_pending(struct qspinlock *lock) 121 - { 122 - WRITE_ONCE(lock->pending, 0); 123 - } 124 - 125 - /** 126 - * clear_pending_set_locked - take ownership and clear the pending bit. 127 - * @lock: Pointer to queued spinlock structure 128 - * 129 - * *,1,0 -> *,0,1 130 - * 131 - * Lock stealing is not allowed if this function is used. 
132 - */ 133 - static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 134 - { 135 - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 136 - } 137 - 138 - /* 139 - * xchg_tail - Put in the new queue tail code word & retrieve previous one 140 - * @lock : Pointer to queued spinlock structure 141 - * @tail : The new queue tail code word 142 - * Return: The previous queue tail code word 143 - * 144 - * xchg(lock, tail), which heads an address dependency 145 - * 146 - * p,*,* -> n,*,* ; prev = xchg(lock, node) 147 - */ 148 - static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 149 - { 150 - /* 151 - * We can use relaxed semantics since the caller ensures that the 152 - * MCS node is properly initialized before updating the tail. 153 - */ 154 - return (u32)xchg_relaxed(&lock->tail, 155 - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 156 - } 157 - 158 - #else /* _Q_PENDING_BITS == 8 */ 159 - 160 - /** 161 - * clear_pending - clear the pending bit. 162 - * @lock: Pointer to queued spinlock structure 163 - * 164 - * *,1,* -> *,0,* 165 - */ 166 - static __always_inline void clear_pending(struct qspinlock *lock) 167 - { 168 - atomic_andnot(_Q_PENDING_VAL, &lock->val); 169 - } 170 - 171 - /** 172 - * clear_pending_set_locked - take ownership and clear the pending bit. 173 - * @lock: Pointer to queued spinlock structure 174 - * 175 - * *,1,0 -> *,0,1 176 - */ 177 - static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 178 - { 179 - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); 180 - } 181 - 182 - /** 183 - * xchg_tail - Put in the new queue tail code word & retrieve previous one 184 - * @lock : Pointer to queued spinlock structure 185 - * @tail : The new queue tail code word 186 - * Return: The previous queue tail code word 187 - * 188 - * xchg(lock, tail) 189 - * 190 - * p,*,* -> n,*,* ; prev = xchg(lock, node) 191 - */ 192 - static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 193 - { 194 - u32 old, new; 195 - 196 - old = atomic_read(&lock->val); 197 - do { 198 - new = (old & _Q_LOCKED_PENDING_MASK) | tail; 199 - /* 200 - * We can use relaxed semantics since the caller ensures that 201 - * the MCS node is properly initialized before updating the 202 - * tail. 203 - */ 204 - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 205 - 206 - return old; 207 - } 208 - #endif /* _Q_PENDING_BITS == 8 */ 209 - 210 - /** 211 - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending 212 - * @lock : Pointer to queued spinlock structure 213 - * Return: The previous lock value 214 - * 215 - * *,*,* -> *,1,* 216 - */ 217 - #ifndef queued_fetch_set_pending_acquire 218 - static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) 219 - { 220 - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); 221 - } 222 - #endif 223 - 224 - /** 225 - * set_locked - Set the lock bit and own the lock 226 - * @lock: Pointer to queued spinlock structure 227 - * 228 - * *,*,0 -> *,0,1 229 - */ 230 - static __always_inline void set_locked(struct qspinlock *lock) 231 - { 232 - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 233 - } 234 - 109 + static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); 235 110 236 111 /* 237 112 * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for ··· 227 410 * any MCS node. This is not the most elegant solution, but is 228 411 * simple enough. 
229 412 */ 230 - if (unlikely(idx >= MAX_NODES)) { 413 + if (unlikely(idx >= _Q_MAX_NODES)) { 231 414 lockevent_inc(lock_no_node); 232 415 while (!queued_spin_trylock(lock)) 233 416 cpu_relax(); ··· 282 465 * head of the waitqueue. 283 466 */ 284 467 if (old & _Q_TAIL_MASK) { 285 - prev = decode_tail(old); 468 + prev = decode_tail(old, qnodes); 286 469 287 470 /* Link @node into the waitqueue. */ 288 471 WRITE_ONCE(prev->next, node);
+201
kernel/locking/qspinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Queued spinlock defines 4 + * 5 + * This file contains macro definitions and functions shared between different 6 + * qspinlock slow path implementations. 7 + */ 8 + #ifndef __LINUX_QSPINLOCK_H 9 + #define __LINUX_QSPINLOCK_H 10 + 11 + #include <asm-generic/percpu.h> 12 + #include <linux/percpu-defs.h> 13 + #include <asm-generic/qspinlock.h> 14 + #include <asm-generic/mcs_spinlock.h> 15 + 16 + #define _Q_MAX_NODES 4 17 + 18 + /* 19 + * The pending bit spinning loop count. 20 + * This heuristic is used to limit the number of lockword accesses 21 + * made by atomic_cond_read_relaxed when waiting for the lock to 22 + * transition out of the "== _Q_PENDING_VAL" state. We don't spin 23 + * indefinitely because there's no guarantee that we'll make forward 24 + * progress. 25 + */ 26 + #ifndef _Q_PENDING_LOOPS 27 + #define _Q_PENDING_LOOPS 1 28 + #endif 29 + 30 + /* 31 + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in 32 + * size and four of them will fit nicely in one 64-byte cacheline. For 33 + * pvqspinlock, however, we need more space for extra data. To accommodate 34 + * that, we insert two more long words to pad it up to 32 bytes. IOW, only 35 + * two of them can fit in a cacheline in this case. That is OK as it is rare 36 + * to have more than 2 levels of slowpath nesting in actual use. We don't 37 + * want to penalize pvqspinlocks to optimize for a rare case in native 38 + * qspinlocks. 39 + */ 40 + struct qnode { 41 + struct mcs_spinlock mcs; 42 + #ifdef CONFIG_PARAVIRT_SPINLOCKS 43 + long reserved[2]; 44 + #endif 45 + }; 46 + 47 + /* 48 + * We must be able to distinguish between no-tail and the tail at 0:0, 49 + * therefore increment the cpu number by one. 50 + */ 51 + 52 + static inline __pure u32 encode_tail(int cpu, int idx) 53 + { 54 + u32 tail; 55 + 56 + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; 57 + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ 58 + 59 + return tail; 60 + } 61 + 62 + static inline __pure struct mcs_spinlock *decode_tail(u32 tail, 63 + struct qnode __percpu *qnodes) 64 + { 65 + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 66 + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 67 + 68 + return per_cpu_ptr(&qnodes[idx].mcs, cpu); 69 + } 70 + 71 + static inline __pure 72 + struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) 73 + { 74 + return &((struct qnode *)base + idx)->mcs; 75 + } 76 + 77 + #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) 78 + 79 + #if _Q_PENDING_BITS == 8 80 + /** 81 + * clear_pending - clear the pending bit. 82 + * @lock: Pointer to queued spinlock structure 83 + * 84 + * *,1,* -> *,0,* 85 + */ 86 + static __always_inline void clear_pending(struct qspinlock *lock) 87 + { 88 + WRITE_ONCE(lock->pending, 0); 89 + } 90 + 91 + /** 92 + * clear_pending_set_locked - take ownership and clear the pending bit. 93 + * @lock: Pointer to queued spinlock structure 94 + * 95 + * *,1,0 -> *,0,1 96 + * 97 + * Lock stealing is not allowed if this function is used. 
98 + */ 99 + static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 100 + { 101 + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); 102 + } 103 + 104 + /* 105 + * xchg_tail - Put in the new queue tail code word & retrieve previous one 106 + * @lock : Pointer to queued spinlock structure 107 + * @tail : The new queue tail code word 108 + * Return: The previous queue tail code word 109 + * 110 + * xchg(lock, tail), which heads an address dependency 111 + * 112 + * p,*,* -> n,*,* ; prev = xchg(lock, node) 113 + */ 114 + static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 115 + { 116 + /* 117 + * We can use relaxed semantics since the caller ensures that the 118 + * MCS node is properly initialized before updating the tail. 119 + */ 120 + return (u32)xchg_relaxed(&lock->tail, 121 + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 122 + } 123 + 124 + #else /* _Q_PENDING_BITS == 8 */ 125 + 126 + /** 127 + * clear_pending - clear the pending bit. 128 + * @lock: Pointer to queued spinlock structure 129 + * 130 + * *,1,* -> *,0,* 131 + */ 132 + static __always_inline void clear_pending(struct qspinlock *lock) 133 + { 134 + atomic_andnot(_Q_PENDING_VAL, &lock->val); 135 + } 136 + 137 + /** 138 + * clear_pending_set_locked - take ownership and clear the pending bit. 139 + * @lock: Pointer to queued spinlock structure 140 + * 141 + * *,1,0 -> *,0,1 142 + */ 143 + static __always_inline void clear_pending_set_locked(struct qspinlock *lock) 144 + { 145 + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); 146 + } 147 + 148 + /** 149 + * xchg_tail - Put in the new queue tail code word & retrieve previous one 150 + * @lock : Pointer to queued spinlock structure 151 + * @tail : The new queue tail code word 152 + * Return: The previous queue tail code word 153 + * 154 + * xchg(lock, tail) 155 + * 156 + * p,*,* -> n,*,* ; prev = xchg(lock, node) 157 + */ 158 + static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) 159 + { 160 + u32 old, new; 161 + 162 + old = atomic_read(&lock->val); 163 + do { 164 + new = (old & _Q_LOCKED_PENDING_MASK) | tail; 165 + /* 166 + * We can use relaxed semantics since the caller ensures that 167 + * the MCS node is properly initialized before updating the 168 + * tail. 169 + */ 170 + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); 171 + 172 + return old; 173 + } 174 + #endif /* _Q_PENDING_BITS == 8 */ 175 + 176 + /** 177 + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending 178 + * @lock : Pointer to queued spinlock structure 179 + * Return: The previous lock value 180 + * 181 + * *,*,* -> *,1,* 182 + */ 183 + #ifndef queued_fetch_set_pending_acquire 184 + static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) 185 + { 186 + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); 187 + } 188 + #endif 189 + 190 + /** 191 + * set_locked - Set the lock bit and own the lock 192 + * @lock: Pointer to queued spinlock structure 193 + * 194 + * *,*,0 -> *,0,1 195 + */ 196 + static __always_inline void set_locked(struct qspinlock *lock) 197 + { 198 + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); 199 + } 200 + 201 + #endif /* __LINUX_QSPINLOCK_H */
+98
tools/testing/selftests/bpf/prog_tests/res_spin_lock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <network_helpers.h> 5 + #include <sys/sysinfo.h> 6 + 7 + #include "res_spin_lock.skel.h" 8 + #include "res_spin_lock_fail.skel.h" 9 + 10 + void test_res_spin_lock_failure(void) 11 + { 12 + RUN_TESTS(res_spin_lock_fail); 13 + } 14 + 15 + static volatile int skip; 16 + 17 + static void *spin_lock_thread(void *arg) 18 + { 19 + int err, prog_fd = *(u32 *) arg; 20 + LIBBPF_OPTS(bpf_test_run_opts, topts, 21 + .data_in = &pkt_v4, 22 + .data_size_in = sizeof(pkt_v4), 23 + .repeat = 10000, 24 + ); 25 + 26 + while (!READ_ONCE(skip)) { 27 + err = bpf_prog_test_run_opts(prog_fd, &topts); 28 + ASSERT_OK(err, "test_run"); 29 + ASSERT_OK(topts.retval, "test_run retval"); 30 + } 31 + pthread_exit(arg); 32 + } 33 + 34 + void test_res_spin_lock_success(void) 35 + { 36 + LIBBPF_OPTS(bpf_test_run_opts, topts, 37 + .data_in = &pkt_v4, 38 + .data_size_in = sizeof(pkt_v4), 39 + .repeat = 1, 40 + ); 41 + struct res_spin_lock *skel; 42 + pthread_t thread_id[16]; 43 + int prog_fd, i, err; 44 + void *ret; 45 + 46 + if (get_nprocs() < 2) { 47 + test__skip(); 48 + return; 49 + } 50 + 51 + skel = res_spin_lock__open_and_load(); 52 + if (!ASSERT_OK_PTR(skel, "res_spin_lock__open_and_load")) 53 + return; 54 + /* AA deadlock */ 55 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test); 56 + err = bpf_prog_test_run_opts(prog_fd, &topts); 57 + ASSERT_OK(err, "error"); 58 + ASSERT_OK(topts.retval, "retval"); 59 + 60 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test_held_lock_max); 61 + err = bpf_prog_test_run_opts(prog_fd, &topts); 62 + ASSERT_OK(err, "error"); 63 + ASSERT_OK(topts.retval, "retval"); 64 + 65 + /* Multi-threaded ABBA deadlock. */ 66 + 67 + prog_fd = bpf_program__fd(skel->progs.res_spin_lock_test_AB); 68 + for (i = 0; i < 16; i++) { 69 + int err; 70 + 71 + err = pthread_create(&thread_id[i], NULL, &spin_lock_thread, &prog_fd); 72 + if (!ASSERT_OK(err, "pthread_create")) 73 + goto end; 74 + } 75 + 76 + topts.retval = 0; 77 + topts.repeat = 1000; 78 + int fd = bpf_program__fd(skel->progs.res_spin_lock_test_BA); 79 + while (!topts.retval && !err && !READ_ONCE(skel->bss->err)) { 80 + err = bpf_prog_test_run_opts(fd, &topts); 81 + } 82 + 83 + WRITE_ONCE(skip, true); 84 + 85 + for (i = 0; i < 16; i++) { 86 + if (!ASSERT_OK(pthread_join(thread_id[i], &ret), "pthread_join")) 87 + goto end; 88 + if (!ASSERT_EQ(ret, &prog_fd, "ret == prog_fd")) 89 + goto end; 90 + } 91 + 92 + ASSERT_EQ(READ_ONCE(skel->bss->err), -EDEADLK, "timeout err"); 93 + ASSERT_OK(err, "err"); 94 + ASSERT_EQ(topts.retval, -EDEADLK, "timeout"); 95 + end: 96 + res_spin_lock__destroy(skel); 97 + return; 98 + }
+53
tools/testing/selftests/bpf/progs/irq.c
··· 11 11 extern void bpf_local_irq_restore(unsigned long *) __weak __ksym; 12 12 extern int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void *unsafe_ptr__ign, u64 flags) __weak __ksym; 13 13 14 + struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); 15 + struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); 16 + 14 17 SEC("?tc") 15 18 __failure __msg("arg#0 doesn't point to an irq flag on stack") 16 19 int irq_save_bad_arg(struct __sk_buff *ctx) ··· 510 507 bpf_local_irq_save(&flags); 511 508 global_subprog_calling_sleepable_global(0); 512 509 bpf_local_irq_restore(&flags); 510 + return 0; 511 + } 512 + 513 + SEC("?tc") 514 + __failure __msg("cannot restore irq state out of order") 515 + int irq_ooo_lock_cond_inv(struct __sk_buff *ctx) 516 + { 517 + unsigned long flags1, flags2; 518 + 519 + if (bpf_res_spin_lock_irqsave(&lockA, &flags1)) 520 + return 0; 521 + if (bpf_res_spin_lock_irqsave(&lockB, &flags2)) { 522 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 523 + return 0; 524 + } 525 + 526 + bpf_res_spin_unlock_irqrestore(&lockB, &flags1); 527 + bpf_res_spin_unlock_irqrestore(&lockA, &flags2); 528 + return 0; 529 + } 530 + 531 + SEC("?tc") 532 + __failure __msg("function calls are not allowed") 533 + int irq_wrong_kfunc_class_1(struct __sk_buff *ctx) 534 + { 535 + unsigned long flags1; 536 + 537 + if (bpf_res_spin_lock_irqsave(&lockA, &flags1)) 538 + return 0; 539 + /* For now, bpf_local_irq_restore is not allowed in critical section, 540 + * but this test ensures error will be caught with kfunc_class when it's 541 + * opened up. Tested by temporarily permitting this kfunc in critical 542 + * section. 543 + */ 544 + bpf_local_irq_restore(&flags1); 545 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 546 + return 0; 547 + } 548 + 549 + SEC("?tc") 550 + __failure __msg("function calls are not allowed") 551 + int irq_wrong_kfunc_class_2(struct __sk_buff *ctx) 552 + { 553 + unsigned long flags1, flags2; 554 + 555 + bpf_local_irq_save(&flags1); 556 + if (bpf_res_spin_lock_irqsave(&lockA, &flags2)) 557 + return 0; 558 + bpf_local_irq_restore(&flags2); 559 + bpf_res_spin_unlock_irqrestore(&lockA, &flags1); 513 560 return 0; 514 561 } 515 562
+143
tools/testing/selftests/bpf/progs/res_spin_lock.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include "bpf_misc.h" 7 + 8 + #define EDEADLK 35 9 + #define ETIMEDOUT 110 10 + 11 + struct arr_elem { 12 + struct bpf_res_spin_lock lock; 13 + }; 14 + 15 + struct { 16 + __uint(type, BPF_MAP_TYPE_ARRAY); 17 + __uint(max_entries, 64); 18 + __type(key, int); 19 + __type(value, struct arr_elem); 20 + } arrmap SEC(".maps"); 21 + 22 + struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); 23 + struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); 24 + 25 + SEC("tc") 26 + int res_spin_lock_test(struct __sk_buff *ctx) 27 + { 28 + struct arr_elem *elem1, *elem2; 29 + int r; 30 + 31 + elem1 = bpf_map_lookup_elem(&arrmap, &(int){0}); 32 + if (!elem1) 33 + return -1; 34 + elem2 = bpf_map_lookup_elem(&arrmap, &(int){0}); 35 + if (!elem2) 36 + return -1; 37 + 38 + r = bpf_res_spin_lock(&elem1->lock); 39 + if (r) 40 + return r; 41 + if (!bpf_res_spin_lock(&elem2->lock)) { 42 + bpf_res_spin_unlock(&elem2->lock); 43 + bpf_res_spin_unlock(&elem1->lock); 44 + return -1; 45 + } 46 + bpf_res_spin_unlock(&elem1->lock); 47 + return 0; 48 + } 49 + 50 + SEC("tc") 51 + int res_spin_lock_test_AB(struct __sk_buff *ctx) 52 + { 53 + int r; 54 + 55 + r = bpf_res_spin_lock(&lockA); 56 + if (r) 57 + return !r; 58 + /* Only unlock if we took the lock. */ 59 + if (!bpf_res_spin_lock(&lockB)) 60 + bpf_res_spin_unlock(&lockB); 61 + bpf_res_spin_unlock(&lockA); 62 + return 0; 63 + } 64 + 65 + int err; 66 + 67 + SEC("tc") 68 + int res_spin_lock_test_BA(struct __sk_buff *ctx) 69 + { 70 + int r; 71 + 72 + r = bpf_res_spin_lock(&lockB); 73 + if (r) 74 + return !r; 75 + if (!bpf_res_spin_lock(&lockA)) 76 + bpf_res_spin_unlock(&lockA); 77 + else 78 + err = -EDEADLK; 79 + bpf_res_spin_unlock(&lockB); 80 + return err ?: 0; 81 + } 82 + 83 + SEC("tc") 84 + int res_spin_lock_test_held_lock_max(struct __sk_buff *ctx) 85 + { 86 + struct bpf_res_spin_lock *locks[48] = {}; 87 + struct arr_elem *e; 88 + u64 time_beg, time; 89 + int ret = 0, i; 90 + 91 + _Static_assert(ARRAY_SIZE(((struct rqspinlock_held){}).locks) == 31, 92 + "RES_NR_HELD assumed to be 31"); 93 + 94 + for (i = 0; i < 34; i++) { 95 + int key = i; 96 + 97 + /* We cannot pass in i as it will get spilled/filled by the compiler and 98 + * loses bounds in verifier state. 99 + */ 100 + e = bpf_map_lookup_elem(&arrmap, &key); 101 + if (!e) 102 + return 1; 103 + locks[i] = &e->lock; 104 + } 105 + 106 + for (; i < 48; i++) { 107 + int key = i - 2; 108 + 109 + /* We cannot pass in i as it will get spilled/filled by the compiler and 110 + * loses bounds in verifier state. 111 + */ 112 + e = bpf_map_lookup_elem(&arrmap, &key); 113 + if (!e) 114 + return 1; 115 + locks[i] = &e->lock; 116 + } 117 + 118 + time_beg = bpf_ktime_get_ns(); 119 + for (i = 0; i < 34; i++) { 120 + if (bpf_res_spin_lock(locks[i])) 121 + goto end; 122 + } 123 + 124 + /* Trigger AA, after exhausting entries in the held lock table. This 125 + * time, only the timeout can save us, as AA detection won't succeed. 126 + */ 127 + if (!bpf_res_spin_lock(locks[34])) { 128 + bpf_res_spin_unlock(locks[34]); 129 + ret = 1; 130 + goto end; 131 + } 132 + 133 + end: 134 + for (i = i - 1; i >= 0; i--) 135 + bpf_res_spin_unlock(locks[i]); 136 + time = bpf_ktime_get_ns() - time_beg; 137 + /* Time spent should be easily above our limit (1/4 s), since AA 138 + * detection won't be expedited due to lack of held lock entry. 
139 + */ 140 + return ret ?: (time > 1000000000 / 4 ? 0 : 1); 141 + } 142 + 143 + char _license[] SEC("license") = "GPL";
+244
tools/testing/selftests/bpf/progs/res_spin_lock_fail.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_core_read.h> 7 + #include "bpf_misc.h" 8 + #include "bpf_experimental.h" 9 + 10 + struct arr_elem { 11 + struct bpf_res_spin_lock lock; 12 + }; 13 + 14 + struct { 15 + __uint(type, BPF_MAP_TYPE_ARRAY); 16 + __uint(max_entries, 1); 17 + __type(key, int); 18 + __type(value, struct arr_elem); 19 + } arrmap SEC(".maps"); 20 + 21 + long value; 22 + 23 + struct bpf_spin_lock lock __hidden SEC(".data.A"); 24 + struct bpf_res_spin_lock res_lock __hidden SEC(".data.B"); 25 + 26 + SEC("?tc") 27 + __failure __msg("point to map value or allocated object") 28 + int res_spin_lock_arg(struct __sk_buff *ctx) 29 + { 30 + struct arr_elem *elem; 31 + 32 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 33 + if (!elem) 34 + return 0; 35 + bpf_res_spin_lock((struct bpf_res_spin_lock *)bpf_core_cast(&elem->lock, struct __sk_buff)); 36 + bpf_res_spin_lock(&elem->lock); 37 + return 0; 38 + } 39 + 40 + SEC("?tc") 41 + __failure __msg("AA deadlock detected") 42 + int res_spin_lock_AA(struct __sk_buff *ctx) 43 + { 44 + struct arr_elem *elem; 45 + 46 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 47 + if (!elem) 48 + return 0; 49 + bpf_res_spin_lock(&elem->lock); 50 + bpf_res_spin_lock(&elem->lock); 51 + return 0; 52 + } 53 + 54 + SEC("?tc") 55 + __failure __msg("AA deadlock detected") 56 + int res_spin_lock_cond_AA(struct __sk_buff *ctx) 57 + { 58 + struct arr_elem *elem; 59 + 60 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 61 + if (!elem) 62 + return 0; 63 + if (bpf_res_spin_lock(&elem->lock)) 64 + return 0; 65 + bpf_res_spin_lock(&elem->lock); 66 + return 0; 67 + } 68 + 69 + SEC("?tc") 70 + __failure __msg("unlock of different lock") 71 + int res_spin_lock_mismatch_1(struct __sk_buff *ctx) 72 + { 73 + struct arr_elem *elem; 74 + 75 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 76 + if (!elem) 77 + return 0; 78 + if (bpf_res_spin_lock(&elem->lock)) 79 + return 0; 80 + bpf_res_spin_unlock(&res_lock); 81 + return 0; 82 + } 83 + 84 + SEC("?tc") 85 + __failure __msg("unlock of different lock") 86 + int res_spin_lock_mismatch_2(struct __sk_buff *ctx) 87 + { 88 + struct arr_elem *elem; 89 + 90 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 91 + if (!elem) 92 + return 0; 93 + if (bpf_res_spin_lock(&res_lock)) 94 + return 0; 95 + bpf_res_spin_unlock(&elem->lock); 96 + return 0; 97 + } 98 + 99 + SEC("?tc") 100 + __failure __msg("unlock of different lock") 101 + int res_spin_lock_irq_mismatch_1(struct __sk_buff *ctx) 102 + { 103 + struct arr_elem *elem; 104 + unsigned long f1; 105 + 106 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 107 + if (!elem) 108 + return 0; 109 + bpf_local_irq_save(&f1); 110 + if (bpf_res_spin_lock(&res_lock)) 111 + return 0; 112 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 113 + return 0; 114 + } 115 + 116 + SEC("?tc") 117 + __failure __msg("unlock of different lock") 118 + int res_spin_lock_irq_mismatch_2(struct __sk_buff *ctx) 119 + { 120 + struct arr_elem *elem; 121 + unsigned long f1; 122 + 123 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 124 + if (!elem) 125 + return 0; 126 + if (bpf_res_spin_lock_irqsave(&res_lock, &f1)) 127 + return 0; 128 + bpf_res_spin_unlock(&res_lock); 129 + return 0; 130 + } 131 + 132 + SEC("?tc") 133 + __success 134 + int res_spin_lock_ooo(struct __sk_buff *ctx) 135 + { 136 + struct arr_elem *elem; 137 + 138 + elem = 
bpf_map_lookup_elem(&arrmap, &(int){0}); 139 + if (!elem) 140 + return 0; 141 + if (bpf_res_spin_lock(&res_lock)) 142 + return 0; 143 + if (bpf_res_spin_lock(&elem->lock)) { 144 + bpf_res_spin_unlock(&res_lock); 145 + return 0; 146 + } 147 + bpf_res_spin_unlock(&elem->lock); 148 + bpf_res_spin_unlock(&res_lock); 149 + return 0; 150 + } 151 + 152 + SEC("?tc") 153 + __success 154 + int res_spin_lock_ooo_irq(struct __sk_buff *ctx) 155 + { 156 + struct arr_elem *elem; 157 + unsigned long f1, f2; 158 + 159 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 160 + if (!elem) 161 + return 0; 162 + if (bpf_res_spin_lock_irqsave(&res_lock, &f1)) 163 + return 0; 164 + if (bpf_res_spin_lock_irqsave(&elem->lock, &f2)) { 165 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 166 + /* We won't have a unreleased IRQ flag error here. */ 167 + return 0; 168 + } 169 + bpf_res_spin_unlock_irqrestore(&elem->lock, &f2); 170 + bpf_res_spin_unlock_irqrestore(&res_lock, &f1); 171 + return 0; 172 + } 173 + 174 + struct bpf_res_spin_lock lock1 __hidden SEC(".data.OO1"); 175 + struct bpf_res_spin_lock lock2 __hidden SEC(".data.OO2"); 176 + 177 + SEC("?tc") 178 + __failure __msg("bpf_res_spin_unlock cannot be out of order") 179 + int res_spin_lock_ooo_unlock(struct __sk_buff *ctx) 180 + { 181 + if (bpf_res_spin_lock(&lock1)) 182 + return 0; 183 + if (bpf_res_spin_lock(&lock2)) { 184 + bpf_res_spin_unlock(&lock1); 185 + return 0; 186 + } 187 + bpf_res_spin_unlock(&lock1); 188 + bpf_res_spin_unlock(&lock2); 189 + return 0; 190 + } 191 + 192 + SEC("?tc") 193 + __failure __msg("off 1 doesn't point to 'struct bpf_res_spin_lock' that is at 0") 194 + int res_spin_lock_bad_off(struct __sk_buff *ctx) 195 + { 196 + struct arr_elem *elem; 197 + 198 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 199 + if (!elem) 200 + return 0; 201 + bpf_res_spin_lock((void *)&elem->lock + 1); 202 + return 0; 203 + } 204 + 205 + SEC("?tc") 206 + __failure __msg("R1 doesn't have constant offset. bpf_res_spin_lock has to be at the constant offset") 207 + int res_spin_lock_var_off(struct __sk_buff *ctx) 208 + { 209 + struct arr_elem *elem; 210 + u64 val = value; 211 + 212 + elem = bpf_map_lookup_elem(&arrmap, &(int){0}); 213 + if (!elem) { 214 + // FIXME: Only inline assembly use in assert macro doesn't emit 215 + // BTF definition. 216 + bpf_throw(0); 217 + return 0; 218 + } 219 + bpf_assert_range(val, 0, 40); 220 + bpf_res_spin_lock((void *)&value + val); 221 + return 0; 222 + } 223 + 224 + SEC("?tc") 225 + __failure __msg("map 'res_spin.bss' has no valid bpf_res_spin_lock") 226 + int res_spin_lock_no_lock_map(struct __sk_buff *ctx) 227 + { 228 + bpf_res_spin_lock((void *)&value + 1); 229 + return 0; 230 + } 231 + 232 + SEC("?tc") 233 + __failure __msg("local 'kptr' has no valid bpf_res_spin_lock") 234 + int res_spin_lock_no_lock_kptr(struct __sk_buff *ctx) 235 + { 236 + struct { int i; } *p = bpf_obj_new(typeof(*p)); 237 + 238 + if (!p) 239 + return 0; 240 + bpf_res_spin_lock((void *)p); 241 + return 0; 242 + } 243 + 244 + char _license[] SEC("license") = "GPL";