Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v6.15-rc7 (533 lines, 14 kB)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/* These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true in some
	 * architectures even though the chance of needing more than 4 nodes
	 * will still be extremely unlikely. When that happens, we simply return
	 * an error. Original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
	 * decrements count it incremented before returning, thus we're fine.
	 * For contexts interrupting us, they either observe our dec or not.
	 * Just ensure the compiler doesn't reorder this statement, as a
	 * this_cpu_dec implicitly implied that.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * instead of checking it against negative, for passing the
 * BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *	    or
 *	if (arena_spin_lock(...) == 0) // success
 *	    or
 *	if (arena_spin_lock(...)) // failure
 *	    or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}

#define arena_spin_lock_irqsave(lock, flags)             \
	({                                               \
		int __ret;                               \
		bpf_local_irq_save(&(flags));            \
		__ret = arena_spin_lock((lock));         \
		if (__ret)                               \
			bpf_local_irq_restore(&(flags)); \
		(__ret);                                 \
	})

#define arena_spin_unlock_irqrestore(lock, flags) \
	({                                        \
		arena_spin_unlock((lock));        \
		bpf_local_irq_restore(&(flags));  \
	})

#endif

#endif /* BPF_ARENA_SPIN_LOCK_H */
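For context, a minimal usage sketch (not part of the header above): a BPF program that protects a shared arena-resident counter with this lock, following the header's rule of testing the return value against zero. The arena map sizing, section name, and variable names below are illustrative assumptions in the style of the BPF selftests; the file would also need to be built with ENABLE_ATOMICS_TESTS defined and a compiler providing __BPF_FEATURE_ADDR_SPACE_CAST, otherwise the lock API above is compiled out.

/* Illustrative sketch only: map layout and names are assumptions,
 * not part of bpf_arena_spin_lock.h itself. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_arena_common.h"
#include "bpf_arena_spin_lock.h"

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 100); /* number of arena pages */
} arena SEC(".maps");

/* The lock and the data it protects both live in the arena address space. */
arena_spinlock_t __arena counter_lock;
int __arena counter;

SEC("tc")
int update_counter(void *ctx)
{
	/* Test against zero for success, as documented in the header. */
	if (arena_spin_lock(&counter_lock))
		return 0; /* lock not acquired: leave shared state alone */

	counter++;

	arena_spin_unlock(&counter_lock);
	return 0;
}

char _license[] SEC("license") = "GPL";

The arena_spin_lock_irqsave()/arena_spin_unlock_irqrestore() wrappers at the end of the header follow the same pattern, additionally saving and restoring IRQ state around the critical section.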