Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

1) Fix bogus compiler warning in nfnetlink_queue, from Florian Westphal.

2) Don't run conntrack on vrf with !dflt qdisc, from Nicolas Dichtel.

3) Fix nft_pipapo bucket load in AVX2 lookup routine for six 8-bit
groups, from Stefano Brivio.

4) Break rule evaluation on malformed TCP options.

5) Use socat instead of nc in selftests/netfilter/nft_zones_many.sh,
also from Florian Westphal.

6) Fix KCSAN data-race in conntrack timeout updates, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
netfilter: conntrack: annotate data-races around ct->timeout
selftests: netfilter: switch zone stress to socat
netfilter: nft_exthdr: break evaluation if setting TCP option fails
selftests: netfilter: Add correctness test for mac,net set type
nft_set_pipapo: Fix bucket load in AVX2 lookup routine for six 8-bit groups
vrf: don't run conntrack on vrf with !dflt qdisc
netfilter: nfnetlink_queue: silence bogus compiler warning
====================

Link: https://lore.kernel.org/r/20211209000847.102598-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+82 -32
+4 -4
drivers/net/vrf.c
··· 770 770 771 771 skb->dev = vrf_dev; 772 772 773 - vrf_nf_set_untracked(skb); 774 - 775 773 err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, 776 774 skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); 777 775 ··· 789 791 /* don't divert link scope packets */ 790 792 if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) 791 793 return skb; 794 + 795 + vrf_nf_set_untracked(skb); 792 796 793 797 if (qdisc_tx_is_default(vrf_dev) || 794 798 IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) ··· 1000 1000 1001 1001 skb->dev = vrf_dev; 1002 1002 1003 - vrf_nf_set_untracked(skb); 1004 - 1005 1003 err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, 1006 1004 skb, NULL, vrf_dev, vrf_ip_out_direct_finish); 1007 1005 ··· 1020 1022 if (ipv4_is_multicast(ip_hdr(skb)->daddr) || 1021 1023 ipv4_is_lbcast(ip_hdr(skb)->daddr)) 1022 1024 return skb; 1025 + 1026 + vrf_nf_set_untracked(skb); 1023 1027 1024 1028 if (qdisc_tx_is_default(vrf_dev) || 1025 1029 IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
+3 -3
include/net/netfilter/nf_conntrack.h
··· 276 276 /* jiffies until ct expires, 0 if already expired */ 277 277 static inline unsigned long nf_ct_expires(const struct nf_conn *ct) 278 278 { 279 - s32 timeout = ct->timeout - nfct_time_stamp; 279 + s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; 280 280 281 281 return timeout > 0 ? timeout : 0; 282 282 } 283 283 284 284 static inline bool nf_ct_is_expired(const struct nf_conn *ct) 285 285 { 286 - return (__s32)(ct->timeout - nfct_time_stamp) <= 0; 286 + return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; 287 287 } 288 288 289 289 /* use after obtaining a reference count */ ··· 302 302 static inline void nf_ct_offload_timeout(struct nf_conn *ct) 303 303 { 304 304 if (nf_ct_expires(ct) < NF_CT_DAY / 2) 305 - ct->timeout = nfct_time_stamp + NF_CT_DAY; 305 + WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY); 306 306 } 307 307 308 308 struct kernel_param;
+3 -3
net/netfilter/nf_conntrack_core.c
··· 684 684 685 685 tstamp = nf_conn_tstamp_find(ct); 686 686 if (tstamp) { 687 - s32 timeout = ct->timeout - nfct_time_stamp; 687 + s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; 688 688 689 689 tstamp->stop = ktime_get_real_ns(); 690 690 if (timeout < 0) ··· 1036 1036 } 1037 1037 1038 1038 /* We want the clashing entry to go away real soon: 1 second timeout. */ 1039 - loser_ct->timeout = nfct_time_stamp + HZ; 1039 + WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ); 1040 1040 1041 1041 /* IPS_NAT_CLASH removes the entry automatically on the first 1042 1042 * reply. Also prevents UDP tracker from moving the entry to ··· 1560 1560 /* save hash for reusing when confirming */ 1561 1561 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1562 1562 ct->status = 0; 1563 - ct->timeout = 0; 1563 + WRITE_ONCE(ct->timeout, 0); 1564 1564 write_pnet(&ct->ct_net, net); 1565 1565 memset(&ct->__nfct_init_offset, 0, 1566 1566 offsetof(struct nf_conn, proto) -
+1 -1
net/netfilter/nf_conntrack_netlink.c
··· 1998 1998 1999 1999 if (timeout > INT_MAX) 2000 2000 timeout = INT_MAX; 2001 - ct->timeout = nfct_time_stamp + (u32)timeout; 2001 + WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); 2002 2002 2003 2003 if (test_bit(IPS_DYING_BIT, &ct->status)) 2004 2004 return -ETIME;
+2 -2
net/netfilter/nf_flow_table_core.c
··· 201 201 if (timeout < 0) 202 202 timeout = 0; 203 203 204 - if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) 205 - ct->timeout = nfct_time_stamp + timeout; 204 + if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) 205 + WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); 206 206 } 207 207 208 208 static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+7 -4
net/netfilter/nft_exthdr.c
··· 236 236 237 237 tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); 238 238 if (!tcph) 239 - return; 239 + goto err; 240 240 241 241 opt = (u8 *)tcph; 242 242 for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { ··· 251 251 continue; 252 252 253 253 if (i + optl > tcphdr_len || priv->len + priv->offset > optl) 254 - return; 254 + goto err; 255 255 256 256 if (skb_ensure_writable(pkt->skb, 257 257 nft_thoff(pkt) + i + priv->len)) 258 - return; 258 + goto err; 259 259 260 260 tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, 261 261 &tcphdr_len); 262 262 if (!tcph) 263 - return; 263 + goto err; 264 264 265 265 offset = i + priv->offset; 266 266 ··· 303 303 304 304 return; 305 305 } 306 + return; 307 + err: 308 + regs->verdict.code = NFT_BREAK; 306 309 } 307 310 308 311 static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
+1 -1
net/netfilter/nft_set_pipapo_avx2.c
··· 886 886 NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize); 887 887 888 888 NFT_PIPAPO_AVX2_AND(5, 0, 1); 889 - NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 6, pkt[5], bsize); 889 + NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize); 890 890 NFT_PIPAPO_AVX2_AND(7, 2, 3); 891 891 892 892 /* Stall */
+26 -4
tools/testing/selftests/netfilter/conntrack_vrf.sh
··· 150 150 # oifname is the vrf device. 151 151 test_masquerade_vrf() 152 152 { 153 + local qdisc=$1 154 + 155 + if [ "$qdisc" != "default" ]; then 156 + tc -net $ns0 qdisc add dev tvrf root $qdisc 157 + fi 158 + 153 159 ip netns exec $ns0 conntrack -F 2>/dev/null 154 160 155 161 ip netns exec $ns0 nft -f - <<EOF 156 162 flush ruleset 157 163 table ip nat { 164 + chain rawout { 165 + type filter hook output priority raw; 166 + 167 + oif tvrf ct state untracked counter 168 + } 169 + chain postrouting2 { 170 + type filter hook postrouting priority mangle; 171 + 172 + oif tvrf ct state untracked counter 173 + } 158 174 chain postrouting { 159 175 type nat hook postrouting priority 0; 160 176 # NB: masquerade should always be combined with 'oif(name) bla', ··· 187 171 fi 188 172 189 173 # must also check that nat table was evaluated on second (lower device) iteration. 190 - ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' 174 + ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' && 175 + ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]' 191 176 if [ $? -eq 0 ]; then 192 - echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device" 177 + echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)" 193 178 else 194 - echo "FAIL: vrf masq rule has unexpected counter value" 179 + echo "FAIL: vrf rules have unexpected counter value" 195 180 ret=1 181 + fi 182 + 183 + if [ "$qdisc" != "default" ]; then 184 + tc -net $ns0 qdisc del dev tvrf root 196 185 fi 197 186 } 198 187 ··· 234 213 } 235 214 236 215 test_ct_zone_in 237 - test_masquerade_vrf 216 + test_masquerade_vrf "default" 217 + test_masquerade_vrf "pfifo" 238 218 test_masquerade_veth 239 219 240 220 exit $ret
+21 -3
tools/testing/selftests/netfilter/nft_concat_range.sh
··· 23 23 24 24 # Set types, defined by TYPE_ variables below 25 25 TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto 26 - net_port_net net_mac net_mac_icmp net6_mac_icmp net6_port_net6_port 27 - net_port_mac_proto_net" 26 + net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp 27 + net6_port_net6_port net_port_mac_proto_net" 28 28 29 29 # Reported bugs, also described by TYPE_ variables below 30 30 BUGS="flush_remove_add" ··· 275 275 perf_src 276 276 perf_entries 1000 277 277 perf_proto ipv4 278 + " 279 + 280 + TYPE_mac_net=" 281 + display mac,net 282 + type_spec ether_addr . ipv4_addr 283 + chain_spec ether saddr . ip saddr 284 + dst 285 + src mac addr4 286 + start 1 287 + count 5 288 + src_delta 2000 289 + tools sendip nc bash 290 + proto udp 291 + 292 + race_repeat 0 293 + 294 + perf_duration 0 278 295 " 279 296 280 297 TYPE_net_mac_icmp=" ··· 1001 984 fi 1002 985 done 1003 986 for f in ${src}; do 1004 - __expr="${__expr} . " 987 + [ "${__expr}" != "{ " ] && __expr="${__expr} . " 988 + 1005 989 __start="$(eval format_"${f}" "${srcstart}")" 1006 990 __end="$(eval format_"${f}" "${srcend}")" 1007 991
+13 -6
tools/testing/selftests/netfilter/nft_zones_many.sh
··· 18 18 ip netns del $ns 19 19 } 20 20 21 - ip netns add $ns 22 - if [ $? -ne 0 ];then 23 - echo "SKIP: Could not create net namespace $gw" 24 - exit $ksft_skip 25 - fi 21 + checktool (){ 22 + if ! $1 > /dev/null 2>&1; then 23 + echo "SKIP: Could not $2" 24 + exit $ksft_skip 25 + fi 26 + } 27 + 28 + checktool "nft --version" "run test without nft tool" 29 + checktool "ip -Version" "run test without ip tool" 30 + checktool "socat -V" "run test without socat tool" 31 + checktool "ip netns add $ns" "create net namespace" 26 32 27 33 trap cleanup EXIT 28 34 ··· 77 71 local start=$(date +%s%3N) 78 72 i=$((i + 10000)) 79 73 j=$((j + 1)) 80 - dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null 74 + # nft rule in output places each packet in a different zone. 75 + dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345 81 76 if [ $? -ne 0 ] ;then 82 77 ret=1 83 78 break