Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nft_set_pipapo: Introduce AVX2-based lookup implementation

If the AVX2 instruction set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.

In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.

That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.

Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.

However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.

---------------.-----------------------------------.------------.
AMD Epyc 7402  |          baselines, Mpps          | this patch |
1 thread       |___________________________________|____________|
3.35GHz        |        |        |        |        |            |
768KiB L1D$    | netdev |  hash  | rbtree |        |            |
---------------|  hook  |   no   | single |        |   pipapo   |
type   entries |  drop  | ranges | field  | pipapo |    AVX2    |
---------------|--------|--------|--------|--------|------------|
net,port       |        |        |        |        |            |
         1000  |   19.0 |   10.4 |    3.8 |    4.0 | 7.5   +87% |
---------------|--------|--------|--------|--------|------------|
port,net       |        |        |        |        |            |
          100  |   18.8 |   10.3 |    5.8 |    6.3 | 8.1   +29% |
---------------|--------|--------|--------|--------|------------|
net6,port      |        |        |        |        |            |
         1000  |   16.4 |    7.6 |    1.8 |    2.1 | 4.8  +128% |
---------------|--------|--------|--------|--------|------------|
port,proto     |        |        |        |        |            |
        30000  |   19.6 |   11.6 |    3.9 |    0.5 | 2.6  +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac  |        |        |        |        |            |
           10  |   16.5 |    5.4 |    4.3 |    3.4 | 4.7   +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, |        |        |        |        |            |
proto    1000  |   16.5 |    5.7 |    1.9 |    1.4 | 3.6   +26% |
---------------|--------|--------|--------|--------|------------|
net,mac        |        |        |        |        |            |
         1000  |   19.0 |    8.4 |    3.9 |    2.5 | 6.4  +156% |
---------------'--------'--------'--------'--------'------------'

A similar strategy could be easily reused to implement specialised
versions for other SIMD instruction sets, and I plan to post at least
a NEON version at a later time.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Stefano Brivio and committed by
Pablo Neira Ayuso
7400b063 8683f4b9

+1270
+1
include/net/netfilter/nf_tables_core.h
··· 75 75 extern const struct nft_set_type nft_set_rbtree_type; 76 76 extern const struct nft_set_type nft_set_bitmap_type; 77 77 extern const struct nft_set_type nft_set_pipapo_type; 78 + extern const struct nft_set_type nft_set_pipapo_avx2_type; 78 79 79 80 struct nft_expr; 80 81 struct nft_regs;
+6
net/netfilter/Makefile
··· 82 82 nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ 83 83 nft_set_pipapo.o 84 84 85 + ifdef CONFIG_X86_64 86 + ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS))) 87 + nf_tables-objs += nft_set_pipapo_avx2.o 88 + endif 89 + endif 90 + 85 91 obj-$(CONFIG_NF_TABLES) += nf_tables.o 86 92 obj-$(CONFIG_NFT_COMPAT) += nft_compat.o 87 93 obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
+3
net/netfilter/nf_tables_api.c
··· 3272 3272 &nft_set_rhash_type, 3273 3273 &nft_set_bitmap_type, 3274 3274 &nft_set_rbtree_type, 3275 + #if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2) 3276 + &nft_set_pipapo_avx2_type, 3277 + #endif 3275 3278 &nft_set_pipapo_type, 3276 3279 }; 3277 3280
+24
net/netfilter/nft_set_pipapo.c
··· 339 339 #include <linux/bitmap.h> 340 340 #include <linux/bitops.h> 341 341 342 + #include "nft_set_pipapo_avx2.h" 342 343 #include "nft_set_pipapo.h" 343 344 344 345 /* Current working bitmap index, toggled between field matches */ ··· 2175 2174 .elemsize = offsetof(struct nft_pipapo_elem, ext), 2176 2175 }, 2177 2176 }; 2177 + 2178 + #if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2) 2179 + const struct nft_set_type nft_set_pipapo_avx2_type = { 2180 + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | 2181 + NFT_SET_TIMEOUT, 2182 + .ops = { 2183 + .lookup = nft_pipapo_avx2_lookup, 2184 + .insert = nft_pipapo_insert, 2185 + .activate = nft_pipapo_activate, 2186 + .deactivate = nft_pipapo_deactivate, 2187 + .flush = nft_pipapo_flush, 2188 + .remove = nft_pipapo_remove, 2189 + .walk = nft_pipapo_walk, 2190 + .get = nft_pipapo_get, 2191 + .privsize = nft_pipapo_privsize, 2192 + .estimate = nft_pipapo_avx2_estimate, 2193 + .init = nft_pipapo_init, 2194 + .destroy = nft_pipapo_destroy, 2195 + .gc_init = nft_pipapo_gc_init, 2196 + .elemsize = offsetof(struct nft_pipapo_elem, ext), 2197 + }, 2198 + }; 2199 + #endif
+1222
net/netfilter/nft_set_pipapo_avx2.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + 3 + /* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines 4 + * 5 + * Copyright (c) 2019-2020 Red Hat GmbH 6 + * 7 + * Author: Stefano Brivio <sbrivio@redhat.com> 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/init.h> 12 + #include <linux/module.h> 13 + #include <linux/netlink.h> 14 + #include <linux/netfilter.h> 15 + #include <linux/netfilter/nf_tables.h> 16 + #include <net/netfilter/nf_tables_core.h> 17 + #include <uapi/linux/netfilter/nf_tables.h> 18 + #include <linux/bitmap.h> 19 + #include <linux/bitops.h> 20 + 21 + #include <linux/compiler.h> 22 + #include <asm/fpu/api.h> 23 + 24 + #include "nft_set_pipapo_avx2.h" 25 + #include "nft_set_pipapo.h" 26 + 27 + #define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) 28 + 29 + /* Load from memory into YMM register with non-temporal hint ("stream load"), 30 + * that is, don't fetch lines from memory into the cache. This avoids pushing 31 + * precious packet data out of the cache hierarchy, and is appropriate when: 32 + * 33 + * - loading buckets from lookup tables, as they are not going to be used 34 + * again before packets are entirely classified 35 + * 36 + * - loading the result bitmap from the previous field, as it's never used 37 + * again 38 + */ 39 + #define NFT_PIPAPO_AVX2_LOAD(reg, loc) \ 40 + asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc)) 41 + 42 + /* Stream a single lookup table bucket into YMM register given lookup table, 43 + * group index, value of packet bits, bucket size. 
44 + */ 45 + #define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \ 46 + NFT_PIPAPO_AVX2_LOAD(reg, \ 47 + lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ 48 + (v)) * (bsize)]) 49 + #define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \ 50 + NFT_PIPAPO_AVX2_LOAD(reg, \ 51 + lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ 52 + (v)) * (bsize)]) 53 + 54 + /* Bitwise AND: the staple operation of this algorithm */ 55 + #define NFT_PIPAPO_AVX2_AND(dst, a, b) \ 56 + asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst) 57 + 58 + /* Jump to label if @reg is zero */ 59 + #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ 60 + asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ 61 + "je %l[" #label "]" : : : : label) 62 + 63 + /* Store 256 bits from YMM register into memory. Contrary to bucket load 64 + * operation, we don't bypass the cache here, as stored matching results 65 + * are always used shortly after. 66 + */ 67 + #define NFT_PIPAPO_AVX2_STORE(loc, reg) \ 68 + asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)) 69 + 70 + /* Zero out a complete YMM register, @reg */ 71 + #define NFT_PIPAPO_AVX2_ZERO(reg) \ 72 + asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) 73 + 74 + /* Current working bitmap index, toggled between field matches */ 75 + static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); 76 + 77 + /** 78 + * nft_pipapo_avx2_prepare() - Prepare before main algorithm body 79 + * 80 + * This zeroes out ymm15, which is later used whenever we need to clear a 81 + * memory location, by storing its content into memory. 82 + */ 83 + static void nft_pipapo_avx2_prepare(void) 84 + { 85 + NFT_PIPAPO_AVX2_ZERO(15); 86 + } 87 + 88 + /** 89 + * nft_pipapo_avx2_fill() - Fill a bitmap region with ones 90 + * @data: Base memory area 91 + * @start: First bit to set 92 + * @len: Count of bits to fill 93 + * 94 + * This is nothing else than a version of bitmap_set(), as used e.g. 
by 95 + * pipapo_refill(), tailored for the microarchitectures using it and better 96 + * suited for the specific usage: it's very likely that we'll set a small number 97 + * of bits, not crossing a word boundary, and correct branch prediction is 98 + * critical here. 99 + * 100 + * This function doesn't actually use any AVX2 instruction. 101 + */ 102 + static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) 103 + { 104 + int offset = start % BITS_PER_LONG; 105 + unsigned long mask; 106 + 107 + data += start / BITS_PER_LONG; 108 + 109 + if (likely(len == 1)) { 110 + *data |= BIT(offset); 111 + return; 112 + } 113 + 114 + if (likely(len < BITS_PER_LONG || offset)) { 115 + if (likely(len + offset <= BITS_PER_LONG)) { 116 + *data |= GENMASK(len - 1 + offset, offset); 117 + return; 118 + } 119 + 120 + *data |= ~0UL << offset; 121 + len -= BITS_PER_LONG - offset; 122 + data++; 123 + 124 + if (len <= BITS_PER_LONG) { 125 + mask = ~0UL >> (BITS_PER_LONG - len); 126 + *data |= mask; 127 + return; 128 + } 129 + } 130 + 131 + memset(data, 0xff, len / BITS_PER_BYTE); 132 + data += len / BITS_PER_LONG; 133 + 134 + len %= BITS_PER_LONG; 135 + if (len) 136 + *data |= ~0UL >> (BITS_PER_LONG - len); 137 + } 138 + 139 + /** 140 + * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits 141 + * @offset: Start from given bitmap (equivalent to bucket) offset, in longs 142 + * @map: Bitmap to be scanned for set bits 143 + * @dst: Destination bitmap 144 + * @mt: Mapping table containing bit set specifiers 145 + * @len: Length of bitmap in longs 146 + * @last: Return index of first set bit, if this is the last field 147 + * 148 + * This is an alternative implementation of pipapo_refill() suitable for usage 149 + * with AVX2 lookup routines: we know there are four words to be scanned, at 150 + * a given offset inside the map, for each matching iteration. 151 + * 152 + * This function doesn't actually use any AVX2 instruction. 
153 + * 154 + * Return: first set bit index if @last, index of first filled word otherwise. 155 + */ 156 + static int nft_pipapo_avx2_refill(int offset, unsigned long *map, 157 + unsigned long *dst, 158 + union nft_pipapo_map_bucket *mt, bool last) 159 + { 160 + int ret = -1; 161 + 162 + #define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \ 163 + do { \ 164 + while (map[(x)]) { \ 165 + int r = __builtin_ctzl(map[(x)]); \ 166 + int i = (offset + (x)) * BITS_PER_LONG + r; \ 167 + \ 168 + if (last) \ 169 + return i; \ 170 + \ 171 + nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \ 172 + \ 173 + if (ret == -1) \ 174 + ret = mt[i].to; \ 175 + \ 176 + map[(x)] &= ~(1UL << r); \ 177 + } \ 178 + } while (0) 179 + 180 + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0); 181 + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1); 182 + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2); 183 + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3); 184 + #undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD 185 + 186 + return ret; 187 + } 188 + 189 + /** 190 + * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups 191 + * @map: Previous match result, used as initial bitmap 192 + * @fill: Destination bitmap to be filled with current match result 193 + * @f: Field, containing lookup and mapping tables 194 + * @offset: Ignore buckets before the given index, no bits are filled there 195 + * @pkt: Packet data, pointer to input nftables register 196 + * @first: If this is the first field, don't source previous result 197 + * @last: Last field: stop at the first match and return bit index 198 + * 199 + * Load buckets from lookup table corresponding to the values of each 4-bit 200 + * group of packet bytes, and perform a bitwise intersection between them. If 201 + * this is the first field in the set, simply AND the buckets together 202 + * (equivalent to using an all-ones starting bitmap), use the provided starting 203 + * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next 204 + * working bitmap, @fill. 
205 + * 206 + * This is used for 8-bit fields (i.e. protocol numbers). 207 + * 208 + * Out-of-order (and superscalar) execution is vital here, so it's critical to 209 + * avoid false data dependencies. CPU and compiler could (mostly) take care of 210 + * this on their own, but the operation ordering is explicitly given here with 211 + * a likely execution order in mind, to highlight possible stalls. That's why 212 + * a number of logically distinct operations (i.e. loading buckets, intersecting 213 + * buckets) are interleaved. 214 + * 215 + * Return: -1 on no match, rule index of match if @last, otherwise first long 216 + * word index to be checked next (i.e. first filled word). 217 + */ 218 + static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, 219 + struct nft_pipapo_field *f, int offset, 220 + const u8 *pkt, bool first, bool last) 221 + { 222 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 223 + u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; 224 + unsigned long *lt = f->lt, bsize = f->bsize; 225 + 226 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 227 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 228 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 229 + 230 + if (first) { 231 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); 232 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); 233 + NFT_PIPAPO_AVX2_AND(4, 0, 1); 234 + } else { 235 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); 236 + NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]); 237 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); 238 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing); 239 + NFT_PIPAPO_AVX2_AND(3, 0, 1); 240 + NFT_PIPAPO_AVX2_AND(4, 2, 3); 241 + } 242 + 243 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); 244 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); 245 + 246 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); 247 + if (last) 248 + return b; 249 + 250 + if (unlikely(ret == -1)) 251 + ret = b / XSAVE_YMM_SIZE; 
252 + 253 + continue; 254 + nomatch: 255 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 256 + nothing: 257 + ; 258 + } 259 + 260 + return ret; 261 + } 262 + 263 + /** 264 + * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups 265 + * @map: Previous match result, used as initial bitmap 266 + * @fill: Destination bitmap to be filled with current match result 267 + * @f: Field, containing lookup and mapping tables 268 + * @offset: Ignore buckets before the given index, no bits are filled there 269 + * @pkt: Packet data, pointer to input nftables register 270 + * @first: If this is the first field, don't source previous result 271 + * @last: Last field: stop at the first match and return bit index 272 + * 273 + * See nft_pipapo_avx2_lookup_4b_2(). 274 + * 275 + * This is used for 16-bit fields (i.e. ports). 276 + * 277 + * Return: -1 on no match, rule index of match if @last, otherwise first long 278 + * word index to be checked next (i.e. first filled word). 279 + */ 280 + static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, 281 + struct nft_pipapo_field *f, int offset, 282 + const u8 *pkt, bool first, bool last) 283 + { 284 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 285 + u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; 286 + unsigned long *lt = f->lt, bsize = f->bsize; 287 + 288 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 289 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 290 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 291 + 292 + if (first) { 293 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); 294 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); 295 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); 296 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); 297 + NFT_PIPAPO_AVX2_AND(4, 0, 1); 298 + NFT_PIPAPO_AVX2_AND(5, 2, 3); 299 + NFT_PIPAPO_AVX2_AND(7, 4, 5); 300 + } else { 301 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], 
bsize); 302 + 303 + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); 304 + 305 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); 306 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); 307 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); 308 + NFT_PIPAPO_AVX2_AND(5, 0, 1); 309 + 310 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); 311 + 312 + NFT_PIPAPO_AVX2_AND(6, 2, 3); 313 + NFT_PIPAPO_AVX2_AND(7, 4, 5); 314 + /* Stall */ 315 + NFT_PIPAPO_AVX2_AND(7, 6, 7); 316 + } 317 + 318 + /* Stall */ 319 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch); 320 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 7); 321 + 322 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); 323 + if (last) 324 + return b; 325 + 326 + if (unlikely(ret == -1)) 327 + ret = b / XSAVE_YMM_SIZE; 328 + 329 + continue; 330 + nomatch: 331 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 332 + nothing: 333 + ; 334 + } 335 + 336 + return ret; 337 + } 338 + 339 + /** 340 + * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups 341 + * @map: Previous match result, used as initial bitmap 342 + * @fill: Destination bitmap to be filled with current match result 343 + * @f: Field, containing lookup and mapping tables 344 + * @offset: Ignore buckets before the given index, no bits are filled there 345 + * @pkt: Packet data, pointer to input nftables register 346 + * @first: If this is the first field, don't source previous result 347 + * @last: Last field: stop at the first match and return bit index 348 + * 349 + * See nft_pipapo_avx2_lookup_4b_2(). 350 + * 351 + * This is used for 32-bit fields (i.e. IPv4 addresses). 352 + * 353 + * Return: -1 on no match, rule index of match if @last, otherwise first long 354 + * word index to be checked next (i.e. first filled word). 
355 + */ 356 + static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, 357 + struct nft_pipapo_field *f, int offset, 358 + const u8 *pkt, bool first, bool last) 359 + { 360 + u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, 361 + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, 362 + }; 363 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 364 + unsigned long *lt = f->lt, bsize = f->bsize; 365 + 366 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 367 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 368 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 369 + 370 + if (first) { 371 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); 372 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); 373 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); 374 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); 375 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize); 376 + NFT_PIPAPO_AVX2_AND(5, 0, 1); 377 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize); 378 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize); 379 + NFT_PIPAPO_AVX2_AND(8, 2, 3); 380 + NFT_PIPAPO_AVX2_AND(9, 4, 5); 381 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); 382 + NFT_PIPAPO_AVX2_AND(11, 6, 7); 383 + NFT_PIPAPO_AVX2_AND(12, 8, 9); 384 + NFT_PIPAPO_AVX2_AND(13, 10, 11); 385 + 386 + /* Stall */ 387 + NFT_PIPAPO_AVX2_AND(1, 12, 13); 388 + } else { 389 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); 390 + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); 391 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); 392 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); 393 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); 394 + 395 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); 396 + 397 + NFT_PIPAPO_AVX2_AND(5, 0, 1); 398 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); 399 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); 400 + NFT_PIPAPO_AVX2_AND(8, 2, 3); 401 + 
NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); 402 + NFT_PIPAPO_AVX2_AND(10, 4, 5); 403 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); 404 + NFT_PIPAPO_AVX2_AND(12, 6, 7); 405 + NFT_PIPAPO_AVX2_AND(13, 8, 9); 406 + NFT_PIPAPO_AVX2_AND(14, 10, 11); 407 + 408 + /* Stall */ 409 + NFT_PIPAPO_AVX2_AND(1, 12, 13); 410 + NFT_PIPAPO_AVX2_AND(1, 1, 14); 411 + } 412 + 413 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch); 414 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 1); 415 + 416 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); 417 + if (last) 418 + return b; 419 + 420 + if (unlikely(ret == -1)) 421 + ret = b / XSAVE_YMM_SIZE; 422 + 423 + continue; 424 + 425 + nomatch: 426 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 427 + nothing: 428 + ; 429 + } 430 + 431 + return ret; 432 + } 433 + 434 + /** 435 + * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups 436 + * @map: Previous match result, used as initial bitmap 437 + * @fill: Destination bitmap to be filled with current match result 438 + * @f: Field, containing lookup and mapping tables 439 + * @offset: Ignore buckets before the given index, no bits are filled there 440 + * @pkt: Packet data, pointer to input nftables register 441 + * @first: If this is the first field, don't source previous result 442 + * @last: Last field: stop at the first match and return bit index 443 + * 444 + * See nft_pipapo_avx2_lookup_4b_2(). 445 + * 446 + * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). 447 + * 448 + * Return: -1 on no match, rule index of match if @last, otherwise first long 449 + * word index to be checked next (i.e. first filled word). 
450 + */ 451 + static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, 452 + struct nft_pipapo_field *f, int offset, 453 + const u8 *pkt, bool first, bool last) 454 + { 455 + u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, 456 + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, 457 + pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, 458 + }; 459 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 460 + unsigned long *lt = f->lt, bsize = f->bsize; 461 + 462 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 463 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 464 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 465 + 466 + if (!first) 467 + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); 468 + 469 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); 470 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); 471 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); 472 + 473 + if (!first) { 474 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); 475 + NFT_PIPAPO_AVX2_AND(1, 1, 0); 476 + } 477 + 478 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); 479 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize); 480 + NFT_PIPAPO_AVX2_AND(6, 2, 3); 481 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); 482 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize); 483 + NFT_PIPAPO_AVX2_AND(9, 1, 4); 484 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); 485 + NFT_PIPAPO_AVX2_AND(11, 5, 6); 486 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize); 487 + NFT_PIPAPO_AVX2_AND(13, 7, 8); 488 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize); 489 + 490 + NFT_PIPAPO_AVX2_AND(0, 9, 10); 491 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize); 492 + NFT_PIPAPO_AVX2_AND(2, 11, 12); 493 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); 494 + NFT_PIPAPO_AVX2_AND(4, 13, 14); 495 + NFT_PIPAPO_AVX2_AND(5, 0, 1); 496 + 497 + NFT_PIPAPO_AVX2_AND(6, 2, 3); 498 + 499 + /* Stalls */ 500 
+ NFT_PIPAPO_AVX2_AND(7, 4, 5); 501 + NFT_PIPAPO_AVX2_AND(8, 6, 7); 502 + 503 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch); 504 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 8); 505 + 506 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); 507 + if (last) 508 + return b; 509 + 510 + if (unlikely(ret == -1)) 511 + ret = b / XSAVE_YMM_SIZE; 512 + 513 + continue; 514 + nomatch: 515 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 516 + nothing: 517 + ; 518 + } 519 + 520 + return ret; 521 + } 522 + 523 + /** 524 + * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups 525 + * @map: Previous match result, used as initial bitmap 526 + * @fill: Destination bitmap to be filled with current match result 527 + * @f: Field, containing lookup and mapping tables 528 + * @offset: Ignore buckets before the given index, no bits are filled there 529 + * @pkt: Packet data, pointer to input nftables register 530 + * @first: If this is the first field, don't source previous result 531 + * @last: Last field: stop at the first match and return bit index 532 + * 533 + * See nft_pipapo_avx2_lookup_4b_2(). 534 + * 535 + * This is used for 128-bit fields (i.e. IPv6 addresses). 536 + * 537 + * Return: -1 on no match, rule index of match if @last, otherwise first long 538 + * word index to be checked next (i.e. first filled word). 
539 + */ 540 + static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, 541 + struct nft_pipapo_field *f, int offset, 542 + const u8 *pkt, bool first, bool last) 543 + { 544 + u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, 545 + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, 546 + pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, 547 + pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf, 548 + pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf, 549 + pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf, 550 + pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf, 551 + pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf, 552 + }; 553 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 554 + unsigned long *lt = f->lt, bsize = f->bsize; 555 + 556 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 557 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 558 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 559 + 560 + if (!first) 561 + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); 562 + 563 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); 564 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); 565 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); 566 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); 567 + if (!first) { 568 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); 569 + NFT_PIPAPO_AVX2_AND(1, 1, 0); 570 + } 571 + 572 + NFT_PIPAPO_AVX2_AND(5, 2, 3); 573 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); 574 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); 575 + NFT_PIPAPO_AVX2_AND(8, 1, 4); 576 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); 577 + NFT_PIPAPO_AVX2_AND(10, 5, 6); 578 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); 579 + NFT_PIPAPO_AVX2_AND(12, 7, 8); 580 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize); 581 + NFT_PIPAPO_AVX2_AND(14, 9, 10); 582 + 583 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize); 
584 + NFT_PIPAPO_AVX2_AND(1, 11, 12); 585 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize); 586 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); 587 + NFT_PIPAPO_AVX2_AND(4, 13, 14); 588 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize); 589 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize); 590 + NFT_PIPAPO_AVX2_AND(7, 0, 1); 591 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize); 592 + NFT_PIPAPO_AVX2_AND(9, 2, 3); 593 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize); 594 + NFT_PIPAPO_AVX2_AND(11, 4, 5); 595 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize); 596 + NFT_PIPAPO_AVX2_AND(13, 6, 7); 597 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize); 598 + 599 + NFT_PIPAPO_AVX2_AND(0, 8, 9); 600 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize); 601 + NFT_PIPAPO_AVX2_AND(2, 10, 11); 602 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize); 603 + NFT_PIPAPO_AVX2_AND(4, 12, 13); 604 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize); 605 + NFT_PIPAPO_AVX2_AND(6, 14, 0); 606 + NFT_PIPAPO_AVX2_AND(7, 1, 2); 607 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize); 608 + NFT_PIPAPO_AVX2_AND(9, 3, 4); 609 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize); 610 + NFT_PIPAPO_AVX2_AND(11, 5, 6); 611 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize); 612 + NFT_PIPAPO_AVX2_AND(13, 7, 8); 613 + 614 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize); 615 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize); 616 + NFT_PIPAPO_AVX2_AND(1, 9, 10); 617 + NFT_PIPAPO_AVX2_AND(2, 11, 12); 618 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize); 619 + NFT_PIPAPO_AVX2_AND(4, 13, 14); 620 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize); 621 + NFT_PIPAPO_AVX2_AND(6, 0, 1); 622 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize); 623 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize); 624 + NFT_PIPAPO_AVX2_AND(9, 2, 3); 625 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, 
lt, 30, pg[30], bsize); 626 + NFT_PIPAPO_AVX2_AND(11, 4, 5); 627 + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize); 628 + 629 + NFT_PIPAPO_AVX2_AND(0, 6, 7); 630 + NFT_PIPAPO_AVX2_AND(1, 8, 9); 631 + NFT_PIPAPO_AVX2_AND(2, 10, 11); 632 + NFT_PIPAPO_AVX2_AND(3, 12, 0); 633 + 634 + /* Stalls */ 635 + NFT_PIPAPO_AVX2_AND(4, 1, 2); 636 + NFT_PIPAPO_AVX2_AND(5, 3, 4); 637 + 638 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch); 639 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 5); 640 + 641 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); 642 + if (last) 643 + return b; 644 + 645 + if (unlikely(ret == -1)) 646 + ret = b / XSAVE_YMM_SIZE; 647 + 648 + continue; 649 + nomatch: 650 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 651 + nothing: 652 + ; 653 + } 654 + 655 + return ret; 656 + } 657 + 658 + /** 659 + * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group 660 + * @map: Previous match result, used as initial bitmap 661 + * @fill: Destination bitmap to be filled with current match result 662 + * @f: Field, containing lookup and mapping tables 663 + * @offset: Ignore buckets before the given index, no bits are filled there 664 + * @pkt: Packet data, pointer to input nftables register 665 + * @first: If this is the first field, don't source previous result 666 + * @last: Last field: stop at the first match and return bit index 667 + * 668 + * See nft_pipapo_avx2_lookup_4b_2(). 669 + * 670 + * This is used for 8-bit fields (i.e. protocol numbers). 671 + * 672 + * Return: -1 on no match, rule index of match if @last, otherwise first long 673 + * word index to be checked next (i.e. first filled word). 
674 + */ 675 + static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, 676 + struct nft_pipapo_field *f, int offset, 677 + const u8 *pkt, bool first, bool last) 678 + { 679 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 680 + unsigned long *lt = f->lt, bsize = f->bsize; 681 + 682 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 683 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 684 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 685 + 686 + if (first) { 687 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize); 688 + } else { 689 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); 690 + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); 691 + NFT_PIPAPO_AVX2_AND(2, 0, 1); 692 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); 693 + } 694 + 695 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch); 696 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 2); 697 + 698 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); 699 + if (last) 700 + return b; 701 + 702 + if (unlikely(ret == -1)) 703 + ret = b / XSAVE_YMM_SIZE; 704 + 705 + continue; 706 + nomatch: 707 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 708 + nothing: 709 + ; 710 + } 711 + 712 + return ret; 713 + } 714 + 715 + /** 716 + * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups 717 + * @map: Previous match result, used as initial bitmap 718 + * @fill: Destination bitmap to be filled with current match result 719 + * @f: Field, containing lookup and mapping tables 720 + * @offset: Ignore buckets before the given index, no bits are filled there 721 + * @pkt: Packet data, pointer to input nftables register 722 + * @first: If this is the first field, don't source previous result 723 + * @last: Last field: stop at the first match and return bit index 724 + * 725 + * See nft_pipapo_avx2_lookup_4b_2(). 726 + * 727 + * This is used for 16-bit fields (i.e. ports). 
*
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Start at the chunk selected by @offset; one chunk per iteration */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index into @map, in longs, of the chunk handled by this pass */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* Both branches leave the result of this pass in register 4 */
		if (first) {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
		} else {
			/* Register 0 is the previous result loaded from @map */
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(3, 0, 1);
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(4, 3, 2);
		}

		/* Stall */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first value obtained is kept as return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/*
		 * NOTE(review): register 15 is presumably kept all-zero by the
		 * caller's setup, making this clear the chunk -- confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}

/**
 * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data,
pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 32-bit fields (i.e. IPv4 addresses).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Start at the chunk selected by @offset; one chunk per iteration */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index into @map, in longs, of the chunk handled by this pass */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* Both branches leave the result of this pass in register 0 */
		if (first) {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
			NFT_PIPAPO_AVX2_AND(5, 2, 3);
			NFT_PIPAPO_AVX2_AND(0, 4, 5);
		} else {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			/* Register 1 is the previous result loaded from @map */
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);

			NFT_PIPAPO_AVX2_AND(5, 0, 1);
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
			NFT_PIPAPO_AVX2_AND(6, 2, 3);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(7, 4, 5);
			NFT_PIPAPO_AVX2_AND(0, 6, 7);
		}

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first value obtained is kept as return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/*
		 * NOTE(review): register 15 is presumably kept all-zero by the
		 * caller's setup, making this clear the chunk -- confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}

/**
 * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
870 + */ 871 + static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, 872 + struct nft_pipapo_field *f, int offset, 873 + const u8 *pkt, bool first, bool last) 874 + { 875 + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; 876 + unsigned long *lt = f->lt, bsize = f->bsize; 877 + 878 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 879 + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { 880 + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; 881 + 882 + if (first) { 883 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); 884 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); 885 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); 886 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); 887 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize); 888 + 889 + NFT_PIPAPO_AVX2_AND(5, 0, 1); 890 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 6, pkt[5], bsize); 891 + NFT_PIPAPO_AVX2_AND(7, 2, 3); 892 + 893 + /* Stall */ 894 + NFT_PIPAPO_AVX2_AND(0, 4, 5); 895 + NFT_PIPAPO_AVX2_AND(1, 6, 7); 896 + NFT_PIPAPO_AVX2_AND(4, 0, 1); 897 + } else { 898 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); 899 + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); 900 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); 901 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); 902 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); 903 + 904 + NFT_PIPAPO_AVX2_AND(5, 0, 1); 905 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); 906 + 907 + NFT_PIPAPO_AVX2_AND(6, 2, 3); 908 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize); 909 + NFT_PIPAPO_AVX2_AND(0, 4, 5); 910 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize); 911 + NFT_PIPAPO_AVX2_AND(2, 6, 7); 912 + 913 + /* Stall */ 914 + NFT_PIPAPO_AVX2_AND(3, 0, 1); 915 + NFT_PIPAPO_AVX2_AND(4, 2, 3); 916 + } 917 + 918 + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); 919 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); 920 + 921 + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, 
last); 922 + if (last) 923 + return b; 924 + 925 + if (unlikely(ret == -1)) 926 + ret = b / XSAVE_YMM_SIZE; 927 + 928 + continue; 929 + 930 + nomatch: 931 + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); 932 + nothing: 933 + ; 934 + } 935 + 936 + return ret; 937 + } 938 + 939 + /** 940 + * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups 941 + * @map: Previous match result, used as initial bitmap 942 + * @fill: Destination bitmap to be filled with current match result 943 + * @f: Field, containing lookup and mapping tables 944 + * @offset: Ignore buckets before the given index, no bits are filled there 945 + * @pkt: Packet data, pointer to input nftables register 946 + * @first: If this is the first field, don't source previous result 947 + * @last: Last field: stop at the first match and return bit index 948 + * 949 + * See nft_pipapo_avx2_lookup_4b_2(). 950 + * 951 + * This is used for 128-bit fields (i.e. IPv6 addresses). 952 + * 953 + * Return: -1 on no match, rule index of match if @last, otherwise first long 954 + * word index to be checked next (i.e. first filled word). 
*/
static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
					struct nft_pipapo_field *f, int offset,
					const u8 *pkt, bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Start at the chunk selected by @offset; one chunk per iteration */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index into @map, in longs, of the chunk handled by this pass */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* Register 0 is the previous result, sourced only if !@first */
		if (!first)
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);

		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
		if (!first) {
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			/* Fold the previous result into the first bucket */
			NFT_PIPAPO_AVX2_AND(1, 1, 0);
		}
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);

		/*
		 * From here on, loads of the remaining buckets are interleaved
		 * with ANDs of values already loaded, reusing registers 0 to 7
		 * for partial results. The final result ends up in register 6.
		 */
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
		NFT_PIPAPO_AVX2_AND(6, 1, 2);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
		NFT_PIPAPO_AVX2_AND(0, 3, 4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);

		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
		NFT_PIPAPO_AVX2_AND(3, 5, 6);
		NFT_PIPAPO_AVX2_AND(4, 0, 1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);

		NFT_PIPAPO_AVX2_AND(6, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
		NFT_PIPAPO_AVX2_AND(0, 4, 5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
		NFT_PIPAPO_AVX2_AND(2, 6, 7);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
		NFT_PIPAPO_AVX2_AND(4, 0, 1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
		NFT_PIPAPO_AVX2_AND(6, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
		NFT_PIPAPO_AVX2_AND(0, 4, 5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
		NFT_PIPAPO_AVX2_AND(2, 6, 7);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
		NFT_PIPAPO_AVX2_AND(4, 0, 1);

		/* Stall */
		NFT_PIPAPO_AVX2_AND(5, 2, 3);
		NFT_PIPAPO_AVX2_AND(6, 4, 5);

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first value obtained is kept as return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/*
		 * NOTE(review): register 15 is presumably kept all-zero by the
		 * caller's setup, making this clear the chunk -- confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}

/**
 * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * This function should never be called, but is provided for the case the field
 * size doesn't match any of the known data types. Matching rate is
 * substantially lower than AVX2 routines.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
1047 + */ 1048 + static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, 1049 + struct nft_pipapo_field *f, int offset, 1050 + const u8 *pkt, bool first, bool last) 1051 + { 1052 + unsigned long *lt = f->lt, bsize = f->bsize; 1053 + int i, ret = -1, b; 1054 + 1055 + lt += offset * NFT_PIPAPO_LONGS_PER_M256; 1056 + 1057 + if (first) 1058 + memset(map, 0xff, bsize * sizeof(*map)); 1059 + 1060 + for (i = offset; i < bsize; i++) { 1061 + if (f->bb == 8) 1062 + pipapo_and_field_buckets_8bit(f, map, pkt); 1063 + else 1064 + pipapo_and_field_buckets_4bit(f, map, pkt); 1065 + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; 1066 + 1067 + b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last); 1068 + 1069 + if (last) 1070 + return b; 1071 + 1072 + if (ret == -1) 1073 + ret = b / XSAVE_YMM_SIZE; 1074 + } 1075 + 1076 + return ret; 1077 + } 1078 + 1079 + /** 1080 + * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity 1081 + * @desc: Set description, element count and field description used 1082 + * @features: Flags: NFT_SET_INTERVAL needs to be there 1083 + * @est: Storage for estimation data 1084 + * 1085 + * Return: true if set is compatible and AVX2 available, false otherwise. 
1086 + */ 1087 + bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, 1088 + struct nft_set_estimate *est) 1089 + { 1090 + if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) 1091 + return false; 1092 + 1093 + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) 1094 + return false; 1095 + 1096 + est->size = pipapo_estimate_size(desc); 1097 + if (!est->size) 1098 + return false; 1099 + 1100 + est->lookup = NFT_SET_CLASS_O_LOG_N; 1101 + 1102 + est->space = NFT_SET_CLASS_O_N; 1103 + 1104 + return true; 1105 + } 1106 + 1107 + /** 1108 + * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation 1109 + * @net: Network namespace 1110 + * @set: nftables API set representation 1111 + * @elem: nftables API element representation containing key data 1112 + * @ext: nftables API extension pointer, filled with matching reference 1113 + * 1114 + * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. 1115 + * 1116 + * This implementation exploits the repetitive characteristic of the algorithm 1117 + * to provide a fast, vectorised version using the AVX2 SIMD instruction set. 1118 + * 1119 + * Return: true on match, false otherwise. 
*/
bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
			    const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_pipapo *priv = nft_set_priv(set);
	unsigned long *res, *fill, *scratch;
	u8 genmask = nft_genmask_cur(net);
	/* Cursor into the key data, advanced by one field per iteration */
	const u8 *rp = (const u8 *)key;
	struct nft_pipapo_match *m;
	struct nft_pipapo_field *f;
	bool map_index;
	int i, ret = 0;

	m = rcu_dereference(priv->match);

	/* This also protects access to all data related to scratch maps */
	kernel_fpu_begin();

	scratch = *raw_cpu_ptr(m->scratch_aligned);
	if (unlikely(!scratch)) {
		kernel_fpu_end();
		return false;
	}
	map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);

	/*
	 * The result and fill maps are the two parts of the scratch area, at
	 * offsets 0 and bsize_max: the per-CPU index selects which is which.
	 */
	res = scratch + (map_index ? m->bsize_max : 0);
	fill = scratch + (map_index ? 0 : m->bsize_max);

	/* Starting map doesn't need to be set for this implementation */

	nft_pipapo_avx2_prepare();

next_match:
	nft_pipapo_for_each_field(f, i, m) {
		bool last = i == m->field_count - 1, first = !i;

/* Call the routine for b-bit groups, n groups: ret is passed as offset */
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
							 ret, rp,	\
							 first, last))

		if (likely(f->bb == 8)) {
			if (f->groups == 1) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
			} else if (f->groups == 2) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
			} else if (f->groups == 4) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
			} else if (f->groups == 6) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
			} else if (f->groups == 16) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
			} else {
				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
								  ret, rp,
								  first, last);
			}
		} else {
			if (f->groups == 2) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
			} else if (f->groups == 4) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
			} else if (f->groups == 8) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
			} else if (f->groups == 12) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
			} else if (f->groups == 32) {
				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
			} else {
				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
								  ret, rp,
								  first, last);
			}
		}
		NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;

#undef NFT_SET_PIPAPO_AVX2_LOOKUP

		/* Negative return from a field lookup: no match at all */
		if (ret < 0)
			goto out;

		if (last) {
			/* On the last field, ret is used as index into f->mt */
			*ext = &f->mt[ret].e->ext;
			if (unlikely(nft_set_elem_expired(*ext) ||
				     !nft_set_elem_active(*ext, genmask))) {
				/*
				 * NOTE(review): this jumps back before the
				 * field loop, which presumably restarts from
				 * the first field, but rp was already advanced
				 * past the previous fields below and is not
				 * reset, and res/fill stay swapped. Confirm the
				 * retry reads the intended key bytes and can't
				 * select the same entry again.
				 */
				ret = 0;
				goto next_match;
			}

			goto out;
		}

		/* What was filled for this field is the input for the next */
		swap(res, fill);
		rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
	}

out:
	/* Toggle the per-CPU scratch index if we stopped at an odd field index */
	if (i % 2)
		raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
	kernel_fpu_end();

	return ret >= 0;
}
+14
net/netfilter/nft_set_pipapo_avx2.h
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _NFT_SET_PIPAPO_AVX2_H
/* Define the guard macro, so that repeated inclusion is actually skipped */
#define _NFT_SET_PIPAPO_AVX2_H

#ifdef CONFIG_AS_AVX2
#include <asm/fpu/xstate.h>
/*
 * NOTE(review): presumably the size in bytes of a YMM register, used as
 * alignment for AVX2 loads and stores -- confirm against XSAVE_YMM_SIZE.
 */
#define NFT_PIPAPO_ALIGN	(XSAVE_YMM_SIZE / BITS_PER_BYTE)

/* Entry points of the AVX2 implementation, only with CONFIG_AS_AVX2 */
bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
			    const u32 *key, const struct nft_set_ext **ext);
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
			      struct nft_set_estimate *est);
#endif /* CONFIG_AS_AVX2 */

#endif /* _NFT_SET_PIPAPO_AVX2_H */