Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'nf-next-25-09-11' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Florian Westphal says:

====================
netfilter: updates for net-next

1) Don't respond to ICMP_UNREACH errors with another ICMP_UNREACH
error.
2) Support fetching the current bridge ethernet address.
This allows a more flexible approach to packet redirection
on bridges without the need to use hardcoded addresses. From
Fernando Fernandez Mancera.
3) Zap a few no-longer-needed conditionals from the ipvs packet path
and convert to READ/WRITE_ONCE to avoid KCSAN warnings.
From Zhang Tengfei.
4) Remove a no-longer-used macro argument in ipset, from Zhen Ni.

* tag 'nf-next-25-09-11' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
netfilter: nf_reject: don't reply to icmp error messages
ipvs: Use READ_ONCE/WRITE_ONCE for ipvs->enable
netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support
netfilter: ipset: Remove unused htable_bits in macro ahash_region
selftest:net: fixed spelling mistakes
====================

Link: https://patch.msgid.link/20250911143819.14753-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+91 -26
+2
include/uapi/linux/netfilter/nf_tables.h
··· 959 959 * @NFT_META_SDIF: slave device interface index 960 960 * @NFT_META_SDIFNAME: slave device interface name 961 961 * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit 962 + * @NFT_META_BRI_IIFHWADDR: packet input bridge interface ethernet address 962 963 */ 963 964 enum nft_meta_keys { 964 965 NFT_META_LEN, ··· 1000 999 NFT_META_SDIFNAME, 1001 1000 NFT_META_BRI_BROUTE, 1002 1001 __NFT_META_IIFTYPE, 1002 + NFT_META_BRI_IIFHWADDR, 1003 1003 }; 1004 1004 1005 1005 /**
+11
net/bridge/netfilter/nft_meta_bridge.c
··· 59 59 nft_reg_store_be16(dest, htons(p_proto)); 60 60 return; 61 61 } 62 + case NFT_META_BRI_IIFHWADDR: 63 + br_dev = nft_meta_get_bridge(in); 64 + if (!br_dev) 65 + goto err; 66 + 67 + memcpy(dest, br_dev->dev_addr, ETH_ALEN); 68 + return; 62 69 default: 63 70 return nft_meta_get_eval(expr, regs, pkt); 64 71 } ··· 92 85 case NFT_META_BRI_IIFPVID: 93 86 case NFT_META_BRI_IIFVPROTO: 94 87 len = sizeof(u16); 88 + break; 89 + case NFT_META_BRI_IIFHWADDR: 90 + len = ETH_ALEN; 95 91 break; 96 92 default: 97 93 return nft_meta_get_init(ctx, expr, tb); ··· 185 175 186 176 switch (priv->key) { 187 177 case NFT_META_BRI_BROUTE: 178 + case NFT_META_BRI_IIFHWADDR: 188 179 hooks = 1 << NF_BR_PRE_ROUTING; 189 180 break; 190 181 default:
+25
net/ipv4/netfilter/nf_reject_ipv4.c
··· 80 80 } 81 81 EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset); 82 82 83 + static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb) 84 + { 85 + const struct iphdr *iph = ip_hdr(skb); 86 + u8 *tp, _type; 87 + int thoff; 88 + 89 + if (iph->protocol != IPPROTO_ICMP) 90 + return false; 91 + 92 + thoff = skb_network_offset(skb) + sizeof(*iph); 93 + 94 + tp = skb_header_pointer(skb, 95 + thoff + offsetof(struct icmphdr, type), 96 + sizeof(_type), &_type); 97 + 98 + if (!tp) 99 + return false; 100 + 101 + return *tp == ICMP_DEST_UNREACH; 102 + } 103 + 83 104 struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, 84 105 struct sk_buff *oldskb, 85 106 const struct net_device *dev, ··· 119 98 120 99 /* IP header checks: fragment. */ 121 100 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) 101 + return NULL; 102 + 103 + /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */ 104 + if (nf_skb_is_icmp_unreach(oldskb)) 122 105 return NULL; 123 106 124 107 /* RFC says return as much as we can without exceeding 576 bytes. */
+30
net/ipv6/netfilter/nf_reject_ipv6.c
··· 104 104 } 105 105 EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); 106 106 107 + static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb) 108 + { 109 + const struct ipv6hdr *ip6h = ipv6_hdr(skb); 110 + u8 proto = ip6h->nexthdr; 111 + u8 _type, *tp; 112 + int thoff; 113 + __be16 fo; 114 + 115 + thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); 116 + 117 + if (thoff < 0 || thoff >= skb->len || fo != 0) 118 + return false; 119 + 120 + if (proto != IPPROTO_ICMPV6) 121 + return false; 122 + 123 + tp = skb_header_pointer(skb, 124 + thoff + offsetof(struct icmp6hdr, icmp6_type), 125 + sizeof(_type), &_type); 126 + 127 + if (!tp) 128 + return false; 129 + 130 + return *tp == ICMPV6_DEST_UNREACH; 131 + } 132 + 107 133 struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, 108 134 struct sk_buff *oldskb, 109 135 const struct net_device *dev, ··· 141 115 unsigned int len; 142 116 143 117 if (!nf_reject_ip6hdr_validate(oldskb)) 118 + return NULL; 119 + 120 + /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */ 121 + if (nf_skb_is_icmp6_unreach(oldskb)) 144 122 return NULL; 145 123 146 124 /* Include "As much of invoking packet as possible without the ICMPv6
+4 -4
net/netfilter/ipset/ip_set_hash_gen.h
··· 63 63 : jhash_size((htable_bits) - HTABLE_REGION_BITS)) 64 64 #define ahash_sizeof_regions(htable_bits) \ 65 65 (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) 66 - #define ahash_region(n, htable_bits) \ 66 + #define ahash_region(n) \ 67 67 ((n) / jhash_size(HTABLE_REGION_BITS)) 68 68 #define ahash_bucket_start(h, htable_bits) \ 69 69 ((htable_bits) < HTABLE_REGION_BITS ? 0 \ ··· 702 702 #endif 703 703 key = HKEY(data, h->initval, htable_bits); 704 704 m = __ipset_dereference(hbucket(t, key)); 705 - nr = ahash_region(key, htable_bits); 705 + nr = ahash_region(key); 706 706 if (!m) { 707 707 m = kzalloc(sizeof(*m) + 708 708 AHASH_INIT_SIZE * dsize, ··· 852 852 rcu_read_lock_bh(); 853 853 t = rcu_dereference_bh(h->table); 854 854 key = HKEY(value, h->initval, t->htable_bits); 855 - r = ahash_region(key, t->htable_bits); 855 + r = ahash_region(key); 856 856 atomic_inc(&t->uref); 857 857 elements = t->hregion[r].elements; 858 858 maxelem = t->maxelem; ··· 1050 1050 rcu_read_lock_bh(); 1051 1051 t = rcu_dereference_bh(h->table); 1052 1052 key = HKEY(value, h->initval, t->htable_bits); 1053 - r = ahash_region(key, t->htable_bits); 1053 + r = ahash_region(key); 1054 1054 atomic_inc(&t->uref); 1055 1055 rcu_read_unlock_bh(); 1056 1056
+2 -2
net/netfilter/ipvs/ip_vs_conn.c
··· 885 885 * conntrack cleanup for the net. 886 886 */ 887 887 smp_rmb(); 888 - if (ipvs->enable) 888 + if (READ_ONCE(ipvs->enable)) 889 889 ip_vs_conn_drop_conntrack(cp); 890 890 } 891 891 ··· 1439 1439 cond_resched_rcu(); 1440 1440 1441 1441 /* netns clean up started, abort delayed work */ 1442 - if (!ipvs->enable) 1442 + if (!READ_ONCE(ipvs->enable)) 1443 1443 break; 1444 1444 } 1445 1445 rcu_read_unlock();
+4 -7
net/netfilter/ipvs/ip_vs_core.c
··· 1353 1353 if (unlikely(!skb_dst(skb))) 1354 1354 return NF_ACCEPT; 1355 1355 1356 - if (!ipvs->enable) 1357 - return NF_ACCEPT; 1358 - 1359 1356 ip_vs_fill_iph_skb(af, skb, false, &iph); 1360 1357 #ifdef CONFIG_IP_VS_IPV6 1361 1358 if (af == AF_INET6) { ··· 1937 1940 return NF_ACCEPT; 1938 1941 } 1939 1942 /* ipvs enabled in this netns ? */ 1940 - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 1943 + if (unlikely(sysctl_backup_only(ipvs))) 1941 1944 return NF_ACCEPT; 1942 1945 1943 1946 ip_vs_fill_iph_skb(af, skb, false, &iph); ··· 2105 2108 int r; 2106 2109 2107 2110 /* ipvs enabled in this netns ? */ 2108 - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) 2111 + if (unlikely(sysctl_backup_only(ipvs))) 2109 2112 return NF_ACCEPT; 2110 2113 2111 2114 if (state->pf == NFPROTO_IPV4) { ··· 2292 2295 return -ENOMEM; 2293 2296 2294 2297 /* Hold the beast until a service is registered */ 2295 - ipvs->enable = 0; 2298 + WRITE_ONCE(ipvs->enable, 0); 2296 2299 ipvs->net = net; 2297 2300 /* Counters used for creating unique names */ 2298 2301 ipvs->gen = atomic_read(&ipvs_netns_cnt); ··· 2364 2367 ipvs = net_ipvs(net); 2365 2368 ip_vs_unregister_hooks(ipvs, AF_INET); 2366 2369 ip_vs_unregister_hooks(ipvs, AF_INET6); 2367 - ipvs->enable = 0; /* Disable packet reception */ 2370 + WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ 2368 2371 smp_wmb(); 2369 2372 ip_vs_sync_net_cleanup(ipvs); 2370 2373 }
+3 -3
net/netfilter/ipvs/ip_vs_ctl.c
··· 256 256 struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; 257 257 258 258 /* netns clean up started, abort delayed work */ 259 - if (!ipvs->enable) 259 + if (!READ_ONCE(ipvs->enable)) 260 260 goto unlock; 261 261 if (!kd) 262 262 continue; ··· 1483 1483 1484 1484 *svc_p = svc; 1485 1485 1486 - if (!ipvs->enable) { 1486 + if (!READ_ONCE(ipvs->enable)) { 1487 1487 /* Now there is a service - full throttle */ 1488 - ipvs->enable = 1; 1488 + WRITE_ONCE(ipvs->enable, 1); 1489 1489 1490 1490 /* Start estimation for first time */ 1491 1491 ip_vs_est_reload_start(ipvs);
+8 -8
net/netfilter/ipvs/ip_vs_est.c
··· 231 231 void ip_vs_est_reload_start(struct netns_ipvs *ipvs) 232 232 { 233 233 /* Ignore reloads before first service is added */ 234 - if (!ipvs->enable) 234 + if (!READ_ONCE(ipvs->enable)) 235 235 return; 236 236 ip_vs_est_stopped_recalc(ipvs); 237 237 /* Bump the kthread configuration genid */ ··· 306 306 int i; 307 307 308 308 if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && 309 - ipvs->enable && ipvs->est_max_threads) 309 + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) 310 310 return -EINVAL; 311 311 312 312 mutex_lock(&ipvs->est_mutex); ··· 343 343 } 344 344 345 345 /* Start kthread tasks only when services are present */ 346 - if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { 346 + if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { 347 347 ret = ip_vs_est_kthread_start(ipvs, kd); 348 348 if (ret < 0) 349 349 goto out; ··· 486 486 struct ip_vs_estimator *est = &stats->est; 487 487 int ret; 488 488 489 - if (!ipvs->est_max_threads && ipvs->enable) 489 + if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) 490 490 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 491 491 492 492 est->ktid = -1; ··· 663 663 /* Wait for cpufreq frequency transition */ 664 664 wait_event_idle_timeout(wq, kthread_should_stop(), 665 665 HZ / 50); 666 - if (!ipvs->enable || kthread_should_stop()) 666 + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) 667 667 goto stop; 668 668 } 669 669 ··· 681 681 rcu_read_unlock(); 682 682 local_bh_enable(); 683 683 684 - if (!ipvs->enable || kthread_should_stop()) 684 + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) 685 685 goto stop; 686 686 cond_resched(); 687 687 ··· 757 757 mutex_lock(&ipvs->est_mutex); 758 758 for (id = 1; id < ipvs->est_kt_count; id++) { 759 759 /* netns clean up started, abort */ 760 - if (!ipvs->enable) 760 + if (!READ_ONCE(ipvs->enable)) 761 761 goto unlock2; 762 762 kd = ipvs->est_kt_arr[id]; 763 763 if (!kd) ··· 787 787 id = ipvs->est_kt_count; 788 788 789 789 next_kt: 
790 - if (!ipvs->enable || kthread_should_stop()) 790 + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) 791 791 goto unlock; 792 792 id--; 793 793 if (id < 0)
+2 -2
tools/testing/selftests/net/netfilter/nft_nat.sh
··· 569 569 ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 570 570 571 571 if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then 572 - echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6" 572 + echo "ERROR: cannot ping $ns1 from $ns2 via ipv6" 573 573 lret=1 574 574 fi 575 575 ··· 859 859 # from router:service bypass connection tracking. 860 860 test_port_shadow_notrack "$family" 861 861 862 - # test nat based mitigation: fowarded packets coming from service port 862 + # test nat based mitigation: forwarded packets coming from service port 863 863 # are masqueraded with random highport. 864 864 test_port_shadow_pat "$family" 865 865