Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

1) ipset limits the max allocatable memory via kvmalloc() to INT_MAX,
from Jozsef Kadlecsik.

2) Check ip_vs_conn_tab_bits value to be in the range specified
in Kconfig, from Andrea Claudi.

3) Initialize fragment offset in ip6tables, from Jeremy Sowden.

4) Make conntrack hash chain length random, from Florian Westphal.

5) Add zone ID to conntrack and NAT hashtuple again, also from Florian.

6) Add selftests for bidirectional zone support and colliding tuples,
from Florian Westphal.

7) Unlink table before synchronize_rcu when cleaning tables with
owner, from Florian.

8) nf_tables limits the max allocatable memory via kvmalloc() to INT_MAX.

9) Release conntrack entries via workqueue in masquerade, from Florian.

10) Fix bogus net_init in iptables raw table definition, also from Florian.

11) Work around missing softdep in log extensions, from Florian Westphal.

12) Serialize hash resizes and cleanups with mutex, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
netfilter: conntrack: serialize hash resizes and cleanups
netfilter: log: work around missing softdep backend module
netfilter: iptable_raw: drop bogus net_init annotation
netfilter: nf_nat_masquerade: defer conntrack walk to work queue
netfilter: nf_nat_masquerade: make async masq_inet6_event handling generic
netfilter: nf_tables: Fix oversized kvmalloc() calls
netfilter: nf_tables: unlink table before deleting it
selftests: netfilter: add zone stress test with colliding tuples
selftests: netfilter: add selftest for directional zone support
netfilter: nat: include zone id in nat table hash again
netfilter: conntrack: include zone id in tuple hash again
netfilter: conntrack: make max chain length random
netfilter: ip6_tables: zero-initialize fragment offset
ipvs: check that ip_vs_conn_tab_bits is between 8 and 20
netfilter: ipset: Fix oversized kvmalloc() calls
====================

Link: https://lore.kernel.org/r/20210924221113.348767-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+735 -147
+1 -1
net/ipv4/netfilter/iptable_raw.c
··· 42 42 43 43 static struct nf_hook_ops *rawtable_ops __read_mostly; 44 44 45 - static int __net_init iptable_raw_table_init(struct net *net) 45 + static int iptable_raw_table_init(struct net *net) 46 46 { 47 47 struct ipt_replace *repl; 48 48 const struct xt_table *table = &packet_raw;
+1
net/ipv6/netfilter/ip6_tables.c
··· 273 273 * things we don't know, ie. tcp syn flag or ports). If the 274 274 * rule is also a fragment-specific rule, non-fragments won't 275 275 * match it. */ 276 + acpar.fragoff = 0; 276 277 acpar.hotdrop = false; 277 278 acpar.state = state; 278 279
+2 -2
net/netfilter/ipset/ip_set_hash_gen.h
··· 130 130 { 131 131 size_t hsize; 132 132 133 - /* We must fit both into u32 in jhash and size_t */ 133 + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ 134 134 if (hbits > 31) 135 135 return 0; 136 136 hsize = jhash_size(hbits); 137 - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) 137 + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) 138 138 < hsize) 139 139 return 0; 140 140
+4
net/netfilter/ipvs/ip_vs_conn.c
··· 1468 1468 int idx; 1469 1469 1470 1470 /* Compute size and mask */ 1471 + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { 1472 + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); 1473 + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; 1474 + } 1471 1475 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1472 1476 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 1473 1477
+100 -54
net/netfilter/nf_conntrack_core.c
··· 74 74 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 75 75 static __read_mostly bool nf_conntrack_locks_all; 76 76 77 + /* serialize hash resizes and nf_ct_iterate_cleanup */ 78 + static DEFINE_MUTEX(nf_conntrack_mutex); 79 + 77 80 #define GC_SCAN_INTERVAL (120u * HZ) 78 81 #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) 79 82 80 - #define MAX_CHAINLEN 64u 83 + #define MIN_CHAINLEN 8u 84 + #define MAX_CHAINLEN (32u - MIN_CHAINLEN) 81 85 82 86 static struct conntrack_gc_work conntrack_gc_work; 83 87 ··· 192 188 static siphash_key_t nf_conntrack_hash_rnd __read_mostly; 193 189 194 190 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 191 + unsigned int zoneid, 195 192 const struct net *net) 196 193 { 197 194 struct { 198 195 struct nf_conntrack_man src; 199 196 union nf_inet_addr dst_addr; 197 + unsigned int zone; 200 198 u32 net_mix; 201 199 u16 dport; 202 200 u16 proto; ··· 211 205 /* The direction must be ignored, so handle usable members manually. */ 212 206 combined.src = tuple->src; 213 207 combined.dst_addr = tuple->dst.u3; 208 + combined.zone = zoneid; 214 209 combined.net_mix = net_hash_mix(net); 215 210 combined.dport = (__force __u16)tuple->dst.u.all; 216 211 combined.proto = tuple->dst.protonum; ··· 226 219 227 220 static u32 __hash_conntrack(const struct net *net, 228 221 const struct nf_conntrack_tuple *tuple, 222 + unsigned int zoneid, 229 223 unsigned int size) 230 224 { 231 - return reciprocal_scale(hash_conntrack_raw(tuple, net), size); 225 + return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); 232 226 } 233 227 234 228 static u32 hash_conntrack(const struct net *net, 235 - const struct nf_conntrack_tuple *tuple) 229 + const struct nf_conntrack_tuple *tuple, 230 + unsigned int zoneid) 236 231 { 237 - return scale_hash(hash_conntrack_raw(tuple, net)); 232 + return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); 238 233 } 239 234 240 235 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, 
··· 659 650 do { 660 651 sequence = read_seqcount_begin(&nf_conntrack_generation); 661 652 hash = hash_conntrack(net, 662 - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 653 + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 654 + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 663 655 reply_hash = hash_conntrack(net, 664 - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 656 + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 657 + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 665 658 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 666 659 667 660 clean_from_lists(ct); ··· 830 819 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 831 820 const struct nf_conntrack_tuple *tuple) 832 821 { 833 - return __nf_conntrack_find_get(net, zone, tuple, 834 - hash_conntrack_raw(tuple, net)); 822 + unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 823 + struct nf_conntrack_tuple_hash *thash; 824 + 825 + thash = __nf_conntrack_find_get(net, zone, tuple, 826 + hash_conntrack_raw(tuple, zone_id, net)); 827 + 828 + if (thash) 829 + return thash; 830 + 831 + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 832 + if (rid != zone_id) 833 + return __nf_conntrack_find_get(net, zone, tuple, 834 + hash_conntrack_raw(tuple, rid, net)); 835 + return thash; 835 836 } 836 837 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 837 838 ··· 865 842 unsigned int hash, reply_hash; 866 843 struct nf_conntrack_tuple_hash *h; 867 844 struct hlist_nulls_node *n; 845 + unsigned int max_chainlen; 868 846 unsigned int chainlen = 0; 869 847 unsigned int sequence; 870 848 int err = -EEXIST; ··· 876 852 do { 877 853 sequence = read_seqcount_begin(&nf_conntrack_generation); 878 854 hash = hash_conntrack(net, 879 - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 855 + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 856 + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 880 857 reply_hash = hash_conntrack(net, 881 - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 858 + 
&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 859 + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 882 860 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 861 + 862 + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); 883 863 884 864 /* See if there's one in the list already, including reverse */ 885 865 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { ··· 891 863 zone, net)) 892 864 goto out; 893 865 894 - if (chainlen++ > MAX_CHAINLEN) 866 + if (chainlen++ > max_chainlen) 895 867 goto chaintoolong; 896 868 } 897 869 ··· 901 873 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 902 874 zone, net)) 903 875 goto out; 904 - if (chainlen++ > MAX_CHAINLEN) 876 + if (chainlen++ > max_chainlen) 905 877 goto chaintoolong; 906 878 } 907 879 ··· 1131 1103 int 1132 1104 __nf_conntrack_confirm(struct sk_buff *skb) 1133 1105 { 1106 + unsigned int chainlen = 0, sequence, max_chainlen; 1134 1107 const struct nf_conntrack_zone *zone; 1135 - unsigned int chainlen = 0, sequence; 1136 1108 unsigned int hash, reply_hash; 1137 1109 struct nf_conntrack_tuple_hash *h; 1138 1110 struct nf_conn *ct; ··· 1161 1133 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 1162 1134 hash = scale_hash(hash); 1163 1135 reply_hash = hash_conntrack(net, 1164 - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 1165 - 1136 + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1137 + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 1166 1138 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 1167 1139 1168 1140 /* We're not in hash table, and we refuse to set up related ··· 1196 1168 goto dying; 1197 1169 } 1198 1170 1171 + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); 1199 1172 /* See if there's one in the list already, including reverse: 1200 1173 NAT could have grabbed it without realizing, since we're 1201 1174 not in the hash. If there is, we lost race. 
*/ ··· 1204 1175 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1205 1176 zone, net)) 1206 1177 goto out; 1207 - if (chainlen++ > MAX_CHAINLEN) 1178 + if (chainlen++ > max_chainlen) 1208 1179 goto chaintoolong; 1209 1180 } 1210 1181 ··· 1213 1184 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1214 1185 zone, net)) 1215 1186 goto out; 1216 - if (chainlen++ > MAX_CHAINLEN) { 1187 + if (chainlen++ > max_chainlen) { 1217 1188 chaintoolong: 1218 1189 nf_ct_add_to_dying_list(ct); 1219 1190 NF_CT_STAT_INC(net, chaintoolong); ··· 1275 1246 rcu_read_lock(); 1276 1247 begin: 1277 1248 nf_conntrack_get_ht(&ct_hash, &hsize); 1278 - hash = __hash_conntrack(net, tuple, hsize); 1249 + hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); 1279 1250 1280 1251 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { 1281 1252 ct = nf_ct_tuplehash_to_ctrack(h); ··· 1716 1687 struct nf_conntrack_tuple_hash *h; 1717 1688 enum ip_conntrack_info ctinfo; 1718 1689 struct nf_conntrack_zone tmp; 1690 + u32 hash, zone_id, rid; 1719 1691 struct nf_conn *ct; 1720 - u32 hash; 1721 1692 1722 1693 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1723 1694 dataoff, state->pf, protonum, state->net, ··· 1728 1699 1729 1700 /* look for tuple match */ 1730 1701 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1731 - hash = hash_conntrack_raw(&tuple, state->net); 1702 + 1703 + zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 1704 + hash = hash_conntrack_raw(&tuple, zone_id, state->net); 1732 1705 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); 1706 + 1707 + if (!h) { 1708 + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 1709 + if (zone_id != rid) { 1710 + u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); 1711 + 1712 + h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); 1713 + } 1714 + } 1715 + 1733 1716 if (!h) { 1734 1717 h = init_conntrack(state->net, tmpl, &tuple, 1735 1718 skb, dataoff, hash); ··· 2266 2225 
spinlock_t *lockp; 2267 2226 2268 2227 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 2228 + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; 2229 + 2230 + if (hlist_nulls_empty(hslot)) 2231 + continue; 2232 + 2269 2233 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 2270 2234 local_bh_disable(); 2271 2235 nf_conntrack_lock(lockp); 2272 - if (*bucket < nf_conntrack_htable_size) { 2273 - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { 2274 - if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) 2275 - continue; 2276 - /* All nf_conn objects are added to hash table twice, one 2277 - * for original direction tuple, once for the reply tuple. 2278 - * 2279 - * Exception: In the IPS_NAT_CLASH case, only the reply 2280 - * tuple is added (the original tuple already existed for 2281 - * a different object). 2282 - * 2283 - * We only need to call the iterator once for each 2284 - * conntrack, so we just use the 'reply' direction 2285 - * tuple while iterating. 2286 - */ 2287 - ct = nf_ct_tuplehash_to_ctrack(h); 2288 - if (iter(ct, data)) 2289 - goto found; 2290 - } 2236 + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { 2237 + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) 2238 + continue; 2239 + /* All nf_conn objects are added to hash table twice, one 2240 + * for original direction tuple, once for the reply tuple. 2241 + * 2242 + * Exception: In the IPS_NAT_CLASH case, only the reply 2243 + * tuple is added (the original tuple already existed for 2244 + * a different object). 2245 + * 2246 + * We only need to call the iterator once for each 2247 + * conntrack, so we just use the 'reply' direction 2248 + * tuple while iterating. 
2249 + */ 2250 + ct = nf_ct_tuplehash_to_ctrack(h); 2251 + if (iter(ct, data)) 2252 + goto found; 2291 2253 } 2292 2254 spin_unlock(lockp); 2293 2255 local_bh_enable(); ··· 2308 2264 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), 2309 2265 void *data, u32 portid, int report) 2310 2266 { 2311 - unsigned int bucket = 0, sequence; 2267 + unsigned int bucket = 0; 2312 2268 struct nf_conn *ct; 2313 2269 2314 2270 might_sleep(); 2315 2271 2316 - for (;;) { 2317 - sequence = read_seqcount_begin(&nf_conntrack_generation); 2272 + mutex_lock(&nf_conntrack_mutex); 2273 + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { 2274 + /* Time to push up daises... */ 2318 2275 2319 - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { 2320 - /* Time to push up daises... */ 2321 - 2322 - nf_ct_delete(ct, portid, report); 2323 - nf_ct_put(ct); 2324 - cond_resched(); 2325 - } 2326 - 2327 - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) 2328 - break; 2329 - bucket = 0; 2276 + nf_ct_delete(ct, portid, report); 2277 + nf_ct_put(ct); 2278 + cond_resched(); 2330 2279 } 2280 + mutex_unlock(&nf_conntrack_mutex); 2331 2281 } 2332 2282 2333 2283 struct iter_data { ··· 2557 2519 if (!hash) 2558 2520 return -ENOMEM; 2559 2521 2522 + mutex_lock(&nf_conntrack_mutex); 2560 2523 old_size = nf_conntrack_htable_size; 2561 2524 if (old_size == hashsize) { 2525 + mutex_unlock(&nf_conntrack_mutex); 2562 2526 kvfree(hash); 2563 2527 return 0; 2564 2528 } ··· 2577 2537 2578 2538 for (i = 0; i < nf_conntrack_htable_size; i++) { 2579 2539 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2540 + unsigned int zone_id; 2541 + 2580 2542 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2581 2543 struct nf_conntrack_tuple_hash, hnnode); 2582 2544 ct = nf_ct_tuplehash_to_ctrack(h); 2583 2545 hlist_nulls_del_rcu(&h->hnnode); 2546 + 2547 + zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2584 2548 bucket = __hash_conntrack(nf_ct_net(ct), 
2585 - &h->tuple, hashsize); 2549 + &h->tuple, zone_id, hashsize); 2586 2550 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2587 2551 } 2588 2552 } ··· 2599 2555 write_seqcount_end(&nf_conntrack_generation); 2600 2556 nf_conntrack_all_unlock(); 2601 2557 local_bh_enable(); 2558 + 2559 + mutex_unlock(&nf_conntrack_mutex); 2602 2560 2603 2561 synchronize_net(); 2604 2562 kvfree(old_hash);
+12 -5
net/netfilter/nf_nat_core.c
··· 150 150 151 151 /* We keep an extra hash for each conntrack, for fast searching. */ 152 152 static unsigned int 153 - hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) 153 + hash_by_src(const struct net *net, 154 + const struct nf_conntrack_zone *zone, 155 + const struct nf_conntrack_tuple *tuple) 154 156 { 155 157 unsigned int hash; 156 158 struct { 157 159 struct nf_conntrack_man src; 158 160 u32 net_mix; 159 161 u32 protonum; 162 + u32 zone; 160 163 } __aligned(SIPHASH_ALIGNMENT) combined; 161 164 162 165 get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); ··· 168 165 169 166 /* Original src, to ensure we map it consistently if poss. */ 170 167 combined.src = tuple->src; 171 - combined.net_mix = net_hash_mix(n); 168 + combined.net_mix = net_hash_mix(net); 172 169 combined.protonum = tuple->dst.protonum; 170 + 171 + /* Zone ID can be used provided its valid for both directions */ 172 + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) 173 + combined.zone = zone->id; 173 174 174 175 hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); 175 176 ··· 279 272 struct nf_conntrack_tuple *result, 280 273 const struct nf_nat_range2 *range) 281 274 { 282 - unsigned int h = hash_by_src(net, tuple); 275 + unsigned int h = hash_by_src(net, zone, tuple); 283 276 const struct nf_conn *ct; 284 277 285 278 hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { ··· 626 619 unsigned int srchash; 627 620 spinlock_t *lock; 628 621 629 - srchash = hash_by_src(net, 622 + srchash = hash_by_src(net, nf_ct_zone(ct), 630 623 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 631 624 lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; 632 625 spin_lock_bh(lock); ··· 795 788 { 796 789 unsigned int h; 797 790 798 - h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 791 + h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 799 792 spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); 800 793 
hlist_del_rcu(&ct->nat_bysource); 801 794 spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
+97 -71
net/netfilter/nf_nat_masquerade.c
··· 9 9 10 10 #include <net/netfilter/nf_nat_masquerade.h> 11 11 12 + struct masq_dev_work { 13 + struct work_struct work; 14 + struct net *net; 15 + union nf_inet_addr addr; 16 + int ifindex; 17 + int (*iter)(struct nf_conn *i, void *data); 18 + }; 19 + 20 + #define MAX_MASQ_WORKER_COUNT 16 21 + 12 22 static DEFINE_MUTEX(masq_mutex); 13 23 static unsigned int masq_refcnt __read_mostly; 24 + static atomic_t masq_worker_count __read_mostly; 14 25 15 26 unsigned int 16 27 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, ··· 74 63 } 75 64 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); 76 65 77 - static int device_cmp(struct nf_conn *i, void *ifindex) 66 + static void iterate_cleanup_work(struct work_struct *work) 67 + { 68 + struct masq_dev_work *w; 69 + 70 + w = container_of(work, struct masq_dev_work, work); 71 + 72 + nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); 73 + 74 + put_net(w->net); 75 + kfree(w); 76 + atomic_dec(&masq_worker_count); 77 + module_put(THIS_MODULE); 78 + } 79 + 80 + /* Iterate conntrack table in the background and remove conntrack entries 81 + * that use the device/address being removed. 82 + * 83 + * In case too many work items have been queued already or memory allocation 84 + * fails iteration is skipped, conntrack entries will time out eventually. 
85 + */ 86 + static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, 87 + int ifindex, 88 + int (*iter)(struct nf_conn *i, void *data), 89 + gfp_t gfp_flags) 90 + { 91 + struct masq_dev_work *w; 92 + 93 + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) 94 + return; 95 + 96 + net = maybe_get_net(net); 97 + if (!net) 98 + return; 99 + 100 + if (!try_module_get(THIS_MODULE)) 101 + goto err_module; 102 + 103 + w = kzalloc(sizeof(*w), gfp_flags); 104 + if (w) { 105 + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ 106 + atomic_inc(&masq_worker_count); 107 + 108 + INIT_WORK(&w->work, iterate_cleanup_work); 109 + w->ifindex = ifindex; 110 + w->net = net; 111 + w->iter = iter; 112 + if (addr) 113 + w->addr = *addr; 114 + schedule_work(&w->work); 115 + return; 116 + } 117 + 118 + module_put(THIS_MODULE); 119 + err_module: 120 + put_net(net); 121 + } 122 + 123 + static int device_cmp(struct nf_conn *i, void *arg) 78 124 { 79 125 const struct nf_conn_nat *nat = nfct_nat(i); 126 + const struct masq_dev_work *w = arg; 80 127 81 128 if (!nat) 82 129 return 0; 83 - return nat->masq_index == (int)(long)ifindex; 130 + return nat->masq_index == w->ifindex; 84 131 } 85 132 86 133 static int masq_device_event(struct notifier_block *this, ··· 154 85 * and forget them. 
155 86 */ 156 87 157 - nf_ct_iterate_cleanup_net(net, device_cmp, 158 - (void *)(long)dev->ifindex, 0, 0); 88 + nf_nat_masq_schedule(net, NULL, dev->ifindex, 89 + device_cmp, GFP_KERNEL); 159 90 } 160 91 161 92 return NOTIFY_DONE; ··· 163 94 164 95 static int inet_cmp(struct nf_conn *ct, void *ptr) 165 96 { 166 - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; 167 - struct net_device *dev = ifa->ifa_dev->dev; 168 97 struct nf_conntrack_tuple *tuple; 98 + struct masq_dev_work *w = ptr; 169 99 170 - if (!device_cmp(ct, (void *)(long)dev->ifindex)) 100 + if (!device_cmp(ct, ptr)) 171 101 return 0; 172 102 173 103 tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; 174 104 175 - return ifa->ifa_address == tuple->dst.u3.ip; 105 + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); 176 106 } 177 107 178 108 static int masq_inet_event(struct notifier_block *this, 179 109 unsigned long event, 180 110 void *ptr) 181 111 { 182 - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; 183 - struct net *net = dev_net(idev->dev); 112 + const struct in_ifaddr *ifa = ptr; 113 + const struct in_device *idev; 114 + const struct net_device *dev; 115 + union nf_inet_addr addr; 116 + 117 + if (event != NETDEV_DOWN) 118 + return NOTIFY_DONE; 184 119 185 120 /* The masq_dev_notifier will catch the case of the device going 186 121 * down. So if the inetdev is dead and being destroyed we have 187 122 * no work to do. Otherwise this is an individual address removal 188 123 * and we have to perform the flush. 
189 124 */ 125 + idev = ifa->ifa_dev; 190 126 if (idev->dead) 191 127 return NOTIFY_DONE; 192 128 193 - if (event == NETDEV_DOWN) 194 - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); 129 + memset(&addr, 0, sizeof(addr)); 130 + 131 + addr.ip = ifa->ifa_address; 132 + 133 + dev = idev->dev; 134 + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, 135 + inet_cmp, GFP_KERNEL); 195 136 196 137 return NOTIFY_DONE; 197 138 } ··· 215 136 }; 216 137 217 138 #if IS_ENABLED(CONFIG_IPV6) 218 - static atomic_t v6_worker_count __read_mostly; 219 - 220 139 static int 221 140 nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, 222 141 const struct in6_addr *daddr, unsigned int srcprefs, ··· 264 187 } 265 188 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); 266 189 267 - struct masq_dev_work { 268 - struct work_struct work; 269 - struct net *net; 270 - struct in6_addr addr; 271 - int ifindex; 272 - }; 273 - 274 - static int inet6_cmp(struct nf_conn *ct, void *work) 275 - { 276 - struct masq_dev_work *w = (struct masq_dev_work *)work; 277 - struct nf_conntrack_tuple *tuple; 278 - 279 - if (!device_cmp(ct, (void *)(long)w->ifindex)) 280 - return 0; 281 - 282 - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; 283 - 284 - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); 285 - } 286 - 287 - static void iterate_cleanup_work(struct work_struct *work) 288 - { 289 - struct masq_dev_work *w; 290 - 291 - w = container_of(work, struct masq_dev_work, work); 292 - 293 - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); 294 - 295 - put_net(w->net); 296 - kfree(w); 297 - atomic_dec(&v6_worker_count); 298 - module_put(THIS_MODULE); 299 - } 300 - 301 190 /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). 302 191 * 303 192 * Defer it to the system workqueue. 
··· 276 233 { 277 234 struct inet6_ifaddr *ifa = ptr; 278 235 const struct net_device *dev; 279 - struct masq_dev_work *w; 280 - struct net *net; 236 + union nf_inet_addr addr; 281 237 282 - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) 238 + if (event != NETDEV_DOWN) 283 239 return NOTIFY_DONE; 284 240 285 241 dev = ifa->idev->dev; 286 - net = maybe_get_net(dev_net(dev)); 287 - if (!net) 288 - return NOTIFY_DONE; 289 242 290 - if (!try_module_get(THIS_MODULE)) 291 - goto err_module; 243 + memset(&addr, 0, sizeof(addr)); 292 244 293 - w = kmalloc(sizeof(*w), GFP_ATOMIC); 294 - if (w) { 295 - atomic_inc(&v6_worker_count); 245 + addr.in6 = ifa->addr; 296 246 297 - INIT_WORK(&w->work, iterate_cleanup_work); 298 - w->ifindex = dev->ifindex; 299 - w->net = net; 300 - w->addr = ifa->addr; 301 - schedule_work(&w->work); 302 - 303 - return NOTIFY_DONE; 304 - } 305 - 306 - module_put(THIS_MODULE); 307 - err_module: 308 - put_net(net); 247 + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, 248 + GFP_ATOMIC); 309 249 return NOTIFY_DONE; 310 250 } 311 251
+19 -11
net/netfilter/nf_tables_api.c
··· 4336 4336 if (ops->privsize != NULL) 4337 4337 size = ops->privsize(nla, &desc); 4338 4338 alloc_size = sizeof(*set) + size + udlen; 4339 - if (alloc_size < size) 4339 + if (alloc_size < size || alloc_size > INT_MAX) 4340 4340 return -ENOMEM; 4341 4341 set = kvzalloc(alloc_size, GFP_KERNEL); 4342 4342 if (!set) ··· 9599 9599 table->use--; 9600 9600 nf_tables_chain_destroy(&ctx); 9601 9601 } 9602 - list_del(&table->list); 9603 9602 nf_tables_table_destroy(&ctx); 9604 9603 } 9605 9604 ··· 9611 9612 if (nft_table_has_owner(table)) 9612 9613 continue; 9613 9614 9615 + list_del(&table->list); 9616 + 9614 9617 __nft_release_table(net, table); 9615 9618 } 9616 9619 } ··· 9620 9619 static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, 9621 9620 void *ptr) 9622 9621 { 9622 + struct nft_table *table, *to_delete[8]; 9623 9623 struct nftables_pernet *nft_net; 9624 9624 struct netlink_notify *n = ptr; 9625 - struct nft_table *table, *nt; 9626 9625 struct net *net = n->net; 9627 - bool release = false; 9626 + unsigned int deleted; 9627 + bool restart = false; 9628 9628 9629 9629 if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) 9630 9630 return NOTIFY_DONE; 9631 9631 9632 9632 nft_net = nft_pernet(net); 9633 + deleted = 0; 9633 9634 mutex_lock(&nft_net->commit_mutex); 9635 + again: 9634 9636 list_for_each_entry(table, &nft_net->tables, list) { 9635 9637 if (nft_table_has_owner(table) && 9636 9638 n->portid == table->nlpid) { 9637 9639 __nft_release_hook(net, table); 9638 - release = true; 9640 + list_del_rcu(&table->list); 9641 + to_delete[deleted++] = table; 9642 + if (deleted >= ARRAY_SIZE(to_delete)) 9643 + break; 9639 9644 } 9640 9645 } 9641 - if (release) { 9646 + if (deleted) { 9647 + restart = deleted >= ARRAY_SIZE(to_delete); 9642 9648 synchronize_rcu(); 9643 - list_for_each_entry_safe(table, nt, &nft_net->tables, list) { 9644 - if (nft_table_has_owner(table) && 9645 - n->portid == table->nlpid) 9646 - __nft_release_table(net, 
table); 9647 - } 9649 + while (deleted) 9650 + __nft_release_table(net, to_delete[--deleted]); 9651 + 9652 + if (restart) 9653 + goto again; 9648 9654 } 9649 9655 mutex_unlock(&nft_net->commit_mutex); 9650 9656
+16 -1
net/netfilter/nft_compat.c
··· 19 19 #include <linux/netfilter_bridge/ebtables.h> 20 20 #include <linux/netfilter_arp/arp_tables.h> 21 21 #include <net/netfilter/nf_tables.h> 22 + #include <net/netfilter/nf_log.h> 22 23 23 24 /* Used for matches where *info is larger than X byte */ 24 25 #define NFT_MATCH_LARGE_THRESH 192 ··· 258 257 nft_compat_wait_for_destructors(); 259 258 260 259 ret = xt_check_target(&par, size, proto, inv); 261 - if (ret < 0) 260 + if (ret < 0) { 261 + if (ret == -ENOENT) { 262 + const char *modname = NULL; 263 + 264 + if (strcmp(target->name, "LOG") == 0) 265 + modname = "nf_log_syslog"; 266 + else if (strcmp(target->name, "NFLOG") == 0) 267 + modname = "nfnetlink_log"; 268 + 269 + if (modname && 270 + nft_request_module(ctx->net, "%s", modname) == -EAGAIN) 271 + return -EAGAIN; 272 + } 273 + 262 274 return ret; 275 + } 263 276 264 277 /* The standard target cannot be used */ 265 278 if (!target->target)
+9 -1
net/netfilter/xt_LOG.c
··· 44 44 static int log_tg_check(const struct xt_tgchk_param *par) 45 45 { 46 46 const struct xt_log_info *loginfo = par->targinfo; 47 + int ret; 47 48 48 49 if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) 49 50 return -EINVAL; ··· 59 58 return -EINVAL; 60 59 } 61 60 62 - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); 61 + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); 62 + if (ret != 0 && !par->nft_compat) { 63 + request_module("%s", "nf_log_syslog"); 64 + 65 + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); 66 + } 67 + 68 + return ret; 63 69 } 64 70 65 71 static void log_tg_destroy(const struct xt_tgdtor_param *par)
+9 -1
net/netfilter/xt_NFLOG.c
··· 42 42 static int nflog_tg_check(const struct xt_tgchk_param *par) 43 43 { 44 44 const struct xt_nflog_info *info = par->targinfo; 45 + int ret; 45 46 46 47 if (info->flags & ~XT_NFLOG_MASK) 47 48 return -EINVAL; 48 49 if (info->prefix[sizeof(info->prefix) - 1] != '\0') 49 50 return -EINVAL; 50 51 51 - return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); 52 + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); 53 + if (ret != 0 && !par->nft_compat) { 54 + request_module("%s", "nfnetlink_log"); 55 + 56 + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); 57 + } 58 + 59 + return ret; 52 60 } 53 61 54 62 static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
+309
tools/testing/selftests/netfilter/nft_nat_zones.sh
#!/bin/bash
#
# Test connection tracking zone and NAT source port reallocation support.
#

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

# Don't increase too much, 2000 clients should work
# just fine but script can then take several minutes with
# KASAN/debug builds.
maxclients=100

have_iperf=1
ret=0

# client1---.
#            veth1-.
#                  |
#               NAT Gateway --veth0--> Server
#                  |                    |
#            veth2-'                    |
# client2---'                           |
#  ....                                 |
# clientX----vethX----------------------'

# All clients share identical IP address.
# NAT Gateway uses policy routing and conntrack zones to isolate client
# namespaces.  Each client connects to Server, each with colliding tuples:
#   clientsaddr:10000 -> serveraddr:dport
# NAT Gateway is supposed to do port reallocation for each of the
# connections.

sfx=$(mktemp -u "XXXXXXXX")
gw="ns-gw-$sfx"
cl1="ns-cl1-$sfx"
cl2="ns-cl2-$sfx"
srv="ns-srv-$sfx"

# Save current neigh gc thresholds so cleanup() can restore them;
# they get raised below because $maxclients neighbours share one gateway.
v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null)
v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null)
v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null)
v6gc1=$(sysctl -n net.ipv6.neigh.default.gc_thresh1 2>/dev/null)
v6gc2=$(sysctl -n net.ipv6.neigh.default.gc_thresh2 2>/dev/null)
v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null)

cleanup()
{
	ip netns del $gw
	ip netns del $srv
	for i in $(seq 1 $maxclients); do
		ip netns del ns-cl$i-$sfx 2>/dev/null
	done

	sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null
	sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null
	sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null
	sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null
	sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null
	sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null
}

nft --version > /dev/null 2>&1
if [ $? -ne 0 ];then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

conntrack -V > /dev/null 2>&1
if [ $? -ne 0 ];then
	echo "SKIP: Could not run test without conntrack tool"
	exit $ksft_skip
fi

iperf3 -v >/dev/null 2>&1
if [ $? -ne 0 ];then
	have_iperf=0
fi

ip netns add "$gw"
if [ $? -ne 0 ];then
	echo "SKIP: Could not create net namespace $gw"
	exit $ksft_skip
fi
ip -net "$gw" link set lo up

trap cleanup EXIT

ip netns add "$srv"
if [ $? -ne 0 ];then
	echo "SKIP: Could not create server netns $srv"
	exit $ksft_skip
fi

ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv"
ip -net "$gw" link set veth0 up
ip -net "$srv" link set lo up
ip -net "$srv" link set eth0 up

# Raise neigh gc thresholds: the gateway has one neighbour per client.
sysctl -q net.ipv6.neigh.default.gc_thresh1=512  2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh2=1024 2>/dev/null
sysctl -q net.ipv6.neigh.default.gc_thresh3=4096 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh1=512  2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null
sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null

for i in $(seq 1 $maxclients);do
	cl="ns-cl$i-$sfx"

	ip netns add "$cl"
	if [ $? -ne 0 ];then
		echo "SKIP: Could not create client netns $cl"
		exit $ksft_skip
	fi
	ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1
	if [ $? -ne 0 ];then
		echo "SKIP: No virtual ethernet pair device support in kernel"
		exit $ksft_skip
	fi
done

# Emit all per-client configuration commands and feed them to a single
# 'ip -batch' invocation -- much faster than one exec per command.
for i in $(seq 1 $maxclients);do
	cl="ns-cl$i-$sfx"
	echo netns exec "$cl" ip link set lo up
	echo netns exec "$cl" ip link set eth0 up
	echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2
	echo netns exec "$gw" ip link set veth$i up
	echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2
	echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0

	# clients have same IP addresses.
	echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0
	echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0
	echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0
	echo netns exec "$cl" ip route add default via dead:1::2 dev eth0

	# NB: same addresses on client-facing interfaces.
	echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i
	echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i

	# gw: policy routing
	echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i))
	echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i))
	echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i))
	echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i))
	echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i))
done | ip -batch /dev/stdin

ip -net "$gw" addr add 10.3.0.1/24 dev veth0
ip -net "$gw" addr add dead:3::1/64 dev veth0

ip -net "$srv" addr add 10.3.0.99/24 dev eth0
ip -net "$srv" addr add dead:3::99/64 dev eth0

# Gateway ruleset: assign a conntrack zone and an fwmark per ingress
# interface, account flows in dynamic sets, and masquerade towards veth0.
ip netns exec $gw nft -f /dev/stdin<<EOF
table inet raw {
	map iiftomark {
		type ifname : mark
	}

	map iiftozone {
		typeof iifname : ct zone
	}

	set inicmp {
		flags dynamic
		type ipv4_addr . ifname . ipv4_addr
	}
	set inflows {
		flags dynamic
		type ipv4_addr . inet_service . ifname . ipv4_addr . inet_service
	}

	set inflows6 {
		flags dynamic
		type ipv6_addr . inet_service . ifname . ipv6_addr . inet_service
	}

	chain prerouting {
		type filter hook prerouting priority -64000; policy accept;
		ct original zone set meta iifname map @iiftozone
		meta mark set meta iifname map @iiftomark

		tcp flags & (syn|ack) == ack add @inflows { ip saddr . tcp sport . meta iifname . ip daddr . tcp dport counter }
		add @inflows6 { ip6 saddr . tcp sport . meta iifname . ip6 daddr . tcp dport counter }
		ip protocol icmp add @inicmp { ip saddr . meta iifname . ip daddr counter }
	}

	chain nat_postrouting {
		type nat hook postrouting priority 0; policy accept;
		ct mark set meta mark meta oifname veth0 masquerade
	}

	chain mangle_prerouting {
		type filter hook prerouting priority -100; policy accept;
		ct direction reply meta mark set ct mark
	}
}
EOF

# Populate the ifname -> mark and ifname -> zone maps, one entry per veth.
( echo add element inet raw iiftomark \{
	for i in $(seq 1 $((maxclients-1))); do
		echo \"veth$i\" : $i,
	done
	echo \"veth$maxclients\" : $maxclients \}
	echo add element inet raw iiftozone \{
	for i in $(seq 1 $((maxclients-1))); do
		echo \"veth$i\" : $i,
	done
	echo \"veth$maxclients\" : $maxclients \}
) | ip netns exec $gw nft -f /dev/stdin

ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null

# useful for debugging: allows to use 'ping' from clients to gateway.
ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null
ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null

# Launch all pings in parallel, remembering each background pid.
# NOTE: testing "$?" right after "cmd &" only checks the fork, which
# always succeeds -- the old code could never detect a ping failure.
declare -A ping_pid
for i in $(seq 1 $maxclients); do
	cl="ns-cl$i-$sfx"
	ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 &
	ping_pid[$i]=$!
done

# Reap every ping and check its real exit status.
for i in $(seq 1 $maxclients); do
	wait "${ping_pid[$i]}"
	if [ $? -ne 0 ]; then
		echo FAIL: Ping failure from ns-cl$i-$sfx 1>&2
		ret=1
	fi
done

for i in $(seq 1 $maxclients); do
	# Each client sent 3 echo requests of 84 bytes (3 * 84 = 252).
	ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"
	if [ $? -ne 0 ];then
		ret=1
		echo "FAIL: counter icmp mismatch for veth$i" 1>&2
		ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2
		break
	fi
done

# Replies are aggregated on veth0: expect maxclients * (3 pkts, 252 bytes).
ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }"
if [ $? -ne 0 ];then
	ret=1
	echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }"
	# was "10.3.99" -- queried a nonexistent element, so the debug dump was empty.
	ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" 1>&2
fi

if [ $ret -eq 0 ]; then
	echo "PASS: ping test from all $maxclients namespaces"
fi

if [ $have_iperf -eq 0 ];then
	echo "SKIP: iperf3 not installed"
	if [ $ret -ne 0 ];then
		exit $ret
	fi
	exit $ksft_skip
fi

ip netns exec $srv iperf3 -s > /dev/null 2>&1 &
iperfpid=$!
sleep 1

# All clients use the same source port (10000) -> colliding tuples; the
# gateway must reallocate source ports for all but one connection.
for i in $(seq 1 $maxclients); do
	if [ $ret -ne 0 ]; then
		break
	fi
	cl="ns-cl$i-$sfx"
	ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null
	if [ $? -ne 0 ]; then
		echo FAIL: Failure to connect for $cl 1>&2
		ip netns exec $gw conntrack -S 1>&2
		ret=1
	fi
done
if [ $ret -eq 0 ];then
	echo "PASS: iperf3 connections for all $maxclients net namespaces"
fi

kill $iperfpid
wait

for i in $(seq 1 $maxclients); do
	ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null
	if [ $? -ne 0 ];then
		ret=1
		echo "FAIL: can't find expected tcp entry for veth$i" 1>&2
		break
	fi
done
if [ $ret -eq 0 ];then
	echo "PASS: Found client connection for all $maxclients net namespaces"
fi

ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null
if [ $? -ne 0 ];then
	ret=1
	echo "FAIL: cannot find return entry on veth0" 1>&2
fi

exit $ret
+156
tools/testing/selftests/netfilter/nft_zones_many.sh
#!/bin/bash

# Test insertion speed for packets with identical addresses/ports
# that are all placed in distinct conntrack zones.

sfx=$(mktemp -u "XXXXXXXX")
ns="ns-$sfx"

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

zones=20000
have_ct_tool=0
ret=0

cleanup()
{
	ip netns del $ns
}

ip netns add $ns
if [ $? -ne 0 ];then
	# was "$gw" -- that variable does not exist in this script.
	echo "SKIP: Could not create net namespace $ns"
	exit $ksft_skip
fi

trap cleanup EXIT

conntrack -V > /dev/null 2>&1
if [ $? -eq 0 ];then
	have_ct_tool=1
fi

ip -net "$ns" link set lo up

# Insert $1 conntrack entries via the packet path: every UDP packet to
# port 12345 is steered into a fresh zone by the numgen-based map, so
# identical tuples still create distinct conntrack entries.
test_zones() {
	local max_zones=$1

	# long timeout so entries do not expire while the test runs.
	ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600
	ip netns exec $ns nft -f /dev/stdin<<EOF
flush ruleset
table inet raw {
	map rndzone {
		typeof numgen inc mod $max_zones : ct zone
	}

	chain output {
		type filter hook output priority -64000; policy accept;
		udp dport 12345 ct zone set numgen inc mod 65536 map @rndzone
	}
}
EOF
	# Populate the 1:1 zone map ("1 : 1, 2 : 2, ..., N : N}") in one batch.
	(
		echo "add element inet raw rndzone {"
	for i in $(seq 1 $max_zones);do
		echo -n "$i : $i"
		if [ $i -lt $max_zones ]; then
			echo ","
		else
			echo "}"
		fi
	done
	) | ip netns exec $ns nft -f /dev/stdin

	local i=0
	local j=0
	local outerstart=$(date +%s%3N)
	local stop=$outerstart

	while [ $i -lt $max_zones ]; do
		local start=$(date +%s%3N)
		i=$((i + 10000))
		j=$((j + 1))
		# 10000 datagrams of 8k each; nc exit status reflects send failure.
		dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null
		if [ $? -ne 0 ] ;then
			ret=1
			break
		fi

		stop=$(date +%s%3N)
		local duration=$((stop-start))
		echo "PASS: added 10000 entries in $duration ms (now $i total, loop $j)"
	done

	if [ $have_ct_tool -eq 1 ]; then
		local count=$(ip netns exec "$ns" conntrack -C)
		local duration=$((stop-outerstart))

		if [ $count -eq $max_zones ]; then
			echo "PASS: inserted $count entries from packet path in $duration ms total"
		else
			ip netns exec $ns conntrack -S 1>&2
			echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries"
			ret=1
		fi
	fi

	if [ $ret -ne 0 ];then
		echo "FAIL: insert $max_zones entries from packet path" 1>&2
	fi
}

# Insert $1 conntrack entries via ctnetlink (conntrack -I), one zone each.
test_conntrack_tool() {
	local max_zones=$1

	ip netns exec $ns conntrack -F >/dev/null 2>/dev/null

	local outerstart=$(date +%s%3N)
	local start=$(date +%s%3N)
	local stop=$start
	local i=0
	while [ $i -lt $max_zones ]; do
		i=$((i + 1))
		ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
			 --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1
		if [ $? -ne 0 ];then
			# re-run without redirection so the error is visible.
			ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
				 --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null
			echo "FAIL: conntrack -I returned an error"
			ret=1
			break
		fi

		if [ $((i%10000)) -eq 0 ];then
			stop=$(date +%s%3N)

			local duration=$((stop-start))
			echo "PASS: added 10000 entries in $duration ms (now $i total)"
			start=$stop
		fi
	done

	local count=$(ip netns exec "$ns" conntrack -C)
	local duration=$((stop-outerstart))

	if [ $count -eq $max_zones ]; then
		echo "PASS: inserted $count entries via ctnetlink in $duration ms"
	else
		ip netns exec $ns conntrack -S 1>&2
		echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)"
		ret=1
	fi
}

test_zones $zones

if [ $have_ct_tool -eq 1 ];then
	test_conntrack_tool $zones
else
	echo "SKIP: Could not run ctnetlink insertion test without conntrack tool"
	if [ $ret -eq 0 ];then
		exit $ksft_skip
	fi
fi

exit $ret