Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v6.15-rc7 (533 lines, 14 kB)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/* These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true in some
	 * architectures even though the chance of needing more than 4 nodes
	 * will still be extremely unlikely. When that happens, we simply return
	 * an error. Original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
	 * decrements count it incremented before returning, thus we're fine.
	 * For contexts interrupting us, they either observe our dec or not.
	 * Just ensure the compiler doesn't reorder this statement, as a
	 * this_cpu_dec implicitly implied that.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * instead of checking it against negative, for passing the
 * BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *	    or
 *	if (arena_spin_lock(...) == 0) // success
 *	    or
 *	if (arena_spin_lock(...)) // failure
 *	    or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}

#define arena_spin_lock_irqsave(lock, flags)             \
	({                                               \
		int __ret;                               \
		bpf_local_irq_save(&(flags));            \
		__ret = arena_spin_lock((lock));         \
		if (__ret)                               \
			bpf_local_irq_restore(&(flags)); \
		(__ret);                                 \
	})

#define arena_spin_unlock_irqrestore(lock, flags) \
	({                                        \
		arena_spin_unlock((lock));        \
		bpf_local_irq_restore(&(flags));  \
	})

#endif

#endif /* BPF_ARENA_SPIN_LOCK_H */
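For context, a minimal usage sketch (not part of the header above): a BPF program that protects a shared arena-resident counter with this lock, following the header's rule of testing the return value against zero. The arena map sizing, section name, and variable names below are illustrative assumptions in the style of the BPF selftests; the file would also need to be built with ENABLE_ATOMICS_TESTS defined and a compiler providing __BPF_FEATURE_ADDR_SPACE_CAST, otherwise the lock API above is compiled out.

/* Illustrative sketch only: map layout and names are assumptions,
 * not part of bpf_arena_spin_lock.h itself. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_arena_common.h"
#include "bpf_arena_spin_lock.h"

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 100); /* number of arena pages */
} arena SEC(".maps");

/* The lock and the data it protects both live in the arena address space. */
arena_spinlock_t __arena counter_lock;
int __arena counter;

SEC("tc")
int update_counter(void *ctx)
{
	/* Test against zero for success, as documented in the header. */
	if (arena_spin_lock(&counter_lock))
		return 0; /* lock not acquired: leave shared state alone */

	counter++;

	arena_spin_unlock(&counter_lock);
	return 0;
}

char _license[] SEC("license") = "GPL";

The arena_spin_lock_irqsave()/arena_spin_unlock_irqrestore() wrappers at the end of the header follow the same pattern, additionally saving and restoring IRQ state around the critical section.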