Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'af_xdp-smp_mb-fixes'

Magnus Karlsson says:

====================
This patch set fixes one bug and removes two dependencies on Linux
kernel headers from the XDP socket code in libbpf. A number of people
have pointed out that these two dependencies make it hard to build the
XDP socket part of libbpf without any kernel header dependencies. The
two removed dependencies are:

* Remove the usage of likely and unlikely (compiler.h) in xsk.h. It
has been reported that the use of these actually decreases the
performance of the ring access code due to an increase in
instruction cache misses, so let us just remove these.

* Remove the dependency on barrier.h as it brings in a lot of kernel
headers. As the XDP socket code only uses two simple functions from
it, we can reimplement these. As a bonus, the new implementation is
faster as it uses the same barrier primitives as the kernel does
when the same code is compiled there. Without this patch, the user
land code uses lfence and sfence on x86, which are unnecessarily
harsh/thorough.

In the process of removing these dependencies a missing barrier
function for at least PPC64 was discovered. For a full explanation on
the missing barrier, please refer to patch 1. So the patch set now
starts with two patches fixing this. I have also added a patch at the
end removing this full memory barrier for x86 only, as it is not
needed there.

Structure of the patch set:
Patch 1-2: Adds the missing barrier function in kernel and user space.
Patch 3-4: Removes the dependencies
Patch 5: Optimizes the added barrier from patch 2 so that it does not
do unnecessary work on x86.

v2 -> v3:
* Added missing memory barrier in ring code
* Added an explanation on the three barriers we use in the code
* Moved barrier functions from xsk.h to libbpf_util.h
* Added comment on why we have these functions in libbpf_util.h
* Added a new barrier function in user space that makes it possible to
remove the full memory barrier on x86.

v1 -> v2:
* Added comment about validity of ARM 32-bit barriers.
Only armv7 and above.

/Magnus
====================

Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+98 -10
+52 -4
net/xdp/xsk_queue.h
··· 43 43 u64 invalid_descs; 44 44 }; 45 45 46 + /* The structure of the shared state of the rings are the same as the 47 + * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion 48 + * ring, the kernel is the producer and user space is the consumer. For 49 + * the Tx and fill rings, the kernel is the consumer and user space is 50 + * the producer. 51 + * 52 + * producer consumer 53 + * 54 + * if (LOAD ->consumer) { LOAD ->producer 55 + * (A) smp_rmb() (C) 56 + * STORE $data LOAD $data 57 + * smp_wmb() (B) smp_mb() (D) 58 + * STORE ->producer STORE ->consumer 59 + * } 60 + * 61 + * (A) pairs with (D), and (B) pairs with (C). 62 + * 63 + * Starting with (B), it protects the data from being written after 64 + * the producer pointer. If this barrier was missing, the consumer 65 + * could observe the producer pointer being set and thus load the data 66 + * before the producer has written the new data. The consumer would in 67 + * this case load the old data. 68 + * 69 + * (C) protects the consumer from speculatively loading the data before 70 + * the producer pointer actually has been read. If we do not have this 71 + * barrier, some architectures could load old data as speculative loads 72 + * are not discarded as the CPU does not know there is a dependency 73 + * between ->producer and data. 74 + * 75 + * (A) is a control dependency that separates the load of ->consumer 76 + * from the stores of $data. In case ->consumer indicates there is no 77 + * room in the buffer to store $data we do not. So no barrier is needed. 78 + * 79 + * (D) protects the load of the data to be observed to happen after the 80 + * store of the consumer pointer. If we did not have this memory 81 + * barrier, the producer could observe the consumer pointer being set 82 + * and overwrite the data with a new value before the consumer got the 83 + * chance to read the old value. 
The consumer would thus miss reading 84 + * the old entry and very likely read the new entry twice, once right 85 + * now and again after circling through the ring. 86 + */ 87 + 46 88 /* Common functions operating for both RXTX and umem queues */ 47 89 48 90 static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) ··· 148 106 static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) 149 107 { 150 108 if (q->cons_tail == q->cons_head) { 109 + smp_mb(); /* D, matches A */ 151 110 WRITE_ONCE(q->ring->consumer, q->cons_tail); 152 111 q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); 153 112 ··· 171 128 if (xskq_nb_free(q, q->prod_tail, 1) == 0) 172 129 return -ENOSPC; 173 130 131 + /* A, matches D */ 174 132 ring->desc[q->prod_tail++ & q->ring_mask] = addr; 175 133 176 134 /* Order producer and data */ 177 - smp_wmb(); 135 + smp_wmb(); /* B, matches C */ 178 136 179 137 WRITE_ONCE(q->ring->producer, q->prod_tail); 180 138 return 0; ··· 188 144 if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) 189 145 return -ENOSPC; 190 146 147 + /* A, matches D */ 191 148 ring->desc[q->prod_head++ & q->ring_mask] = addr; 192 149 return 0; 193 150 } ··· 197 152 u32 nb_entries) 198 153 { 199 154 /* Order producer and data */ 200 - smp_wmb(); 155 + smp_wmb(); /* B, matches C */ 201 156 202 157 q->prod_tail += nb_entries; 203 158 WRITE_ONCE(q->ring->producer, q->prod_tail); ··· 208 163 if (xskq_nb_free(q, q->prod_head, 1) == 0) 209 164 return -ENOSPC; 210 165 166 + /* A, matches D */ 211 167 q->prod_head++; 212 168 return 0; 213 169 } ··· 250 204 struct xdp_desc *desc) 251 205 { 252 206 if (q->cons_tail == q->cons_head) { 207 + smp_mb(); /* D, matches A */ 253 208 WRITE_ONCE(q->ring->consumer, q->cons_tail); 254 209 q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); 255 210 256 211 /* Order consumer and data */ 257 - smp_rmb(); 212 + smp_rmb(); /* C, matches B */ 258 213 } 259 214 260 215 return xskq_validate_desc(q, desc); ··· 275 228 if 
(xskq_nb_free(q, q->prod_head, 1) == 0) 276 229 return -ENOSPC; 277 230 231 + /* A, matches D */ 278 232 idx = (q->prod_head++) & q->ring_mask; 279 233 ring->desc[idx].addr = addr; 280 234 ring->desc[idx].len = len; ··· 286 238 static inline void xskq_produce_flush_desc(struct xsk_queue *q) 287 239 { 288 240 /* Order producer and data */ 289 - smp_wmb(); 241 + smp_wmb(); /* B, matches C */ 290 242 291 243 q->prod_tail = q->prod_head, 292 244 WRITE_ONCE(q->ring->producer, q->prod_tail);
+30
tools/lib/bpf/libbpf_util.h
··· 23 23 #define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) 24 24 #define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) 25 25 26 + /* Use these barrier functions instead of smp_[rw]mb() when they are 27 + * used in a libbpf header file. That way they can be built into the 28 + * application that uses libbpf. 29 + */ 30 + #if defined(__i386__) || defined(__x86_64__) 31 + # define libbpf_smp_rmb() asm volatile("" : : : "memory") 32 + # define libbpf_smp_wmb() asm volatile("" : : : "memory") 33 + # define libbpf_smp_mb() \ 34 + asm volatile("lock; addl $0,-4(%%rsp)" : : : "memory", "cc") 35 + /* Hinders stores to be observed before older loads. */ 36 + # define libbpf_smp_rwmb() asm volatile("" : : : "memory") 37 + #elif defined(__aarch64__) 38 + # define libbpf_smp_rmb() asm volatile("dmb ishld" : : : "memory") 39 + # define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") 40 + # define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") 41 + # define libbpf_smp_rwmb() libbpf_smp_mb() 42 + #elif defined(__arm__) 43 + /* These are only valid for armv7 and above */ 44 + # define libbpf_smp_rmb() asm volatile("dmb ish" : : : "memory") 45 + # define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") 46 + # define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") 47 + # define libbpf_smp_rwmb() libbpf_smp_mb() 48 + #else 49 + # warning Architecture missing native barrier functions in libbpf_util.h. 50 + # define libbpf_smp_rmb() __sync_synchronize() 51 + # define libbpf_smp_wmb() __sync_synchronize() 52 + # define libbpf_smp_mb() __sync_synchronize() 53 + # define libbpf_smp_rwmb() __sync_synchronize() 54 + #endif 55 + 26 56 #ifdef __cplusplus 27 57 } /* extern "C" */ 28 58 #endif
+16 -6
tools/lib/bpf/xsk.h
··· 16 16 #include <linux/if_xdp.h> 17 17 18 18 #include "libbpf.h" 19 + #include "libbpf_util.h" 19 20 20 21 #ifdef __cplusplus 21 22 extern "C" { ··· 36 35 37 36 DEFINE_XSK_RING(xsk_ring_prod); 38 37 DEFINE_XSK_RING(xsk_ring_cons); 38 + 39 + /* For a detailed explanation on the memory barriers associated with the 40 + * ring, please take a look at net/xdp/xsk_queue.h. 41 + */ 39 42 40 43 struct xsk_umem; 41 44 struct xsk_socket; ··· 110 105 static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, 111 106 size_t nb, __u32 *idx) 112 107 { 113 - if (unlikely(xsk_prod_nb_free(prod, nb) < nb)) 108 + if (xsk_prod_nb_free(prod, nb) < nb) 114 109 return 0; 115 110 116 111 *idx = prod->cached_prod; ··· 121 116 122 117 static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) 123 118 { 124 - /* Make sure everything has been written to the ring before signalling 125 - * this to the kernel. 119 + /* Make sure everything has been written to the ring before indicating 120 + * this to the kernel by writing the producer pointer. 126 121 */ 127 - smp_wmb(); 122 + libbpf_smp_wmb(); 128 123 129 124 *prod->producer += nb; 130 125 } ··· 134 129 { 135 130 size_t entries = xsk_cons_nb_avail(cons, nb); 136 131 137 - if (likely(entries > 0)) { 132 + if (entries > 0) { 138 133 /* Make sure we do not speculatively read the data before 139 134 * we have received the packet buffers from the ring. 140 135 */ 141 - smp_rmb(); 136 + libbpf_smp_rmb(); 142 137 143 138 *idx = cons->cached_cons; 144 139 cons->cached_cons += entries; ··· 149 144 150 145 static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) 151 146 { 147 + /* Make sure data has been read before indicating we are done 148 + * with the entries by updating the consumer pointer. 149 + */ 150 + libbpf_smp_rwmb(); 151 + 152 152 *cons->consumer += nb; 153 153 } 154 154