Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'listener_refactor_part_11'

Eric Dumazet says:

====================
inet: tcp listener refactoring, part 11

Before inserting request sockets into general (ehash) table,
we need to prepare netfilter to cope with them, as they are
not full sockets.

I'll later change xt_socket to get full support, including for
request sockets (NEW_SYN_RECV)

Save 8 bytes in inet_request_sock on 64bit arches. We'll soon add
a pointer to the listener socket.

I included two TCP changes in this patch series.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+102 -85
+5 -6
include/net/inet_sock.h
··· 94 94 acked : 1, 95 95 no_srccheck: 1; 96 96 kmemcheck_bitfield_end(flags); 97 + u32 ir_mark; 97 98 union { 98 99 struct ip_options_rcu *opt; 99 100 struct sk_buff *pktopts; 100 101 }; 101 - u32 ir_mark; 102 102 }; 103 103 104 104 static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) ··· 106 106 return (struct inet_request_sock *)sk; 107 107 } 108 108 109 - static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb) 109 + static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) 110 110 { 111 - if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) { 111 + if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) 112 112 return skb->mark; 113 - } else { 114 - return sk->sk_mark; 115 - } 113 + 114 + return sk->sk_mark; 116 115 } 117 116 118 117 struct inet_cork {
+2 -55
include/net/tcp.h
··· 1137 1137 return tcp_win_from_space(sk->sk_rcvbuf); 1138 1138 } 1139 1139 1140 - static inline void tcp_openreq_init(struct request_sock *req, 1141 - struct tcp_options_received *rx_opt, 1142 - struct sk_buff *skb, struct sock *sk) 1143 - { 1144 - struct inet_request_sock *ireq = inet_rsk(req); 1145 - 1146 - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ 1147 - req->cookie_ts = 0; 1148 - tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; 1149 - tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 1150 - tcp_rsk(req)->snt_synack = tcp_time_stamp; 1151 - tcp_rsk(req)->last_oow_ack_time = 0; 1152 - req->mss = rx_opt->mss_clamp; 1153 - req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; 1154 - ireq->tstamp_ok = rx_opt->tstamp_ok; 1155 - ireq->sack_ok = rx_opt->sack_ok; 1156 - ireq->snd_wscale = rx_opt->snd_wscale; 1157 - ireq->wscale_ok = rx_opt->wscale_ok; 1158 - ireq->acked = 0; 1159 - ireq->ecn_ok = 0; 1160 - ireq->ir_rmt_port = tcp_hdr(skb)->source; 1161 - ireq->ir_num = ntohs(tcp_hdr(skb)->dest); 1162 - ireq->ir_mark = inet_request_mark(sk, skb); 1163 - } 1164 - 1165 1140 extern void tcp_openreq_init_rwin(struct request_sock *req, 1166 1141 struct sock *sk, struct dst_entry *dst); 1167 1142 ··· 1216 1241 return true; 1217 1242 } 1218 1243 1219 - /* Return true if we're currently rate-limiting out-of-window ACKs and 1220 - * thus shouldn't send a dupack right now. We rate-limit dupacks in 1221 - * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS 1222 - * attacks that send repeated SYNs or ACKs for the same connection. To 1223 - * do this, we do not send a duplicate SYNACK or ACK if the remote 1224 - * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. 1225 - */ 1226 - static inline bool tcp_oow_rate_limited(struct net *net, 1227 - const struct sk_buff *skb, 1228 - int mib_idx, u32 *last_oow_ack_time) 1229 - { 1230 - /* Data packets without SYNs are not likely part of an ACK loop. */ 1231 - if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && 1232 - !tcp_hdr(skb)->syn) 1233 - goto not_rate_limited; 1234 - 1235 - if (*last_oow_ack_time) { 1236 - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); 1237 - 1238 - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { 1239 - NET_INC_STATS_BH(net, mib_idx); 1240 - return true; /* rate-limited: don't send yet! */ 1241 - } 1242 - } 1243 - 1244 - *last_oow_ack_time = tcp_time_stamp; 1245 - 1246 - not_rate_limited: 1247 - return false; /* not rate-limited: go ahead, send dupack now! */ 1248 - } 1244 + bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, 1245 + int mib_idx, u32 *last_oow_ack_time); 1249 1246 1250 1247 static inline void tcp_mib_init(struct net *net) 1251 1248 {
+55
net/ipv4/tcp_input.c
··· 3321 3321 return flag; 3322 3322 } 3323 3323 3324 + /* Return true if we're currently rate-limiting out-of-window ACKs and 3325 + * thus shouldn't send a dupack right now. We rate-limit dupacks in 3326 + * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS 3327 + * attacks that send repeated SYNs or ACKs for the same connection. To 3328 + * do this, we do not send a duplicate SYNACK or ACK if the remote 3329 + * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. 3330 + */ 3331 + bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, 3332 + int mib_idx, u32 *last_oow_ack_time) 3333 + { 3334 + /* Data packets without SYNs are not likely part of an ACK loop. */ 3335 + if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && 3336 + !tcp_hdr(skb)->syn) 3337 + goto not_rate_limited; 3338 + 3339 + if (*last_oow_ack_time) { 3340 + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); 3341 + 3342 + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { 3343 + NET_INC_STATS_BH(net, mib_idx); 3344 + return true; /* rate-limited: don't send yet! */ 3345 + } 3346 + } 3347 + 3348 + *last_oow_ack_time = tcp_time_stamp; 3349 + 3350 + not_rate_limited: 3351 + return false; /* not rate-limited: go ahead, send dupack now! */ 3352 + } 3353 + 3324 3354 /* RFC 5961 7 [ACK Throttling] */ 3325 3355 static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) 3326 3356 { ··· 5940 5910 5941 5911 if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) 5942 5912 inet_rsk(req)->ecn_ok = 1; 5913 + } 5914 + 5915 + static void tcp_openreq_init(struct request_sock *req, 5916 + const struct tcp_options_received *rx_opt, 5917 + struct sk_buff *skb, const struct sock *sk) 5918 + { 5919 + struct inet_request_sock *ireq = inet_rsk(req); 5920 + 5921 + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ 5922 + req->cookie_ts = 0; 5923 + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; 5924 + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 5925 + tcp_rsk(req)->snt_synack = tcp_time_stamp; 5926 + tcp_rsk(req)->last_oow_ack_time = 0; 5927 + req->mss = rx_opt->mss_clamp; 5928 + req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; 5929 + ireq->tstamp_ok = rx_opt->tstamp_ok; 5930 + ireq->sack_ok = rx_opt->sack_ok; 5931 + ireq->snd_wscale = rx_opt->snd_wscale; 5932 + ireq->wscale_ok = rx_opt->wscale_ok; 5933 + ireq->acked = 0; 5934 + ireq->ecn_ok = 0; 5935 + ireq->ir_rmt_port = tcp_hdr(skb)->source; 5936 + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); 5937 + ireq->ir_mark = inet_request_mark(sk, skb); 5943 5938 } 5944 5939 5945 5940 int tcp_conn_request(struct request_sock_ops *rsk_ops,
+1 -1
net/netfilter/ipvs/ip_vs_xmit.c
··· 209 209 struct sock *sk = skb->sk; 210 210 struct rtable *ort = skb_rtable(skb); 211 211 212 - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) 212 + if (!skb->dev && sk && sk_fullsock(sk)) 213 213 ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); 214 214 } 215 215
+1 -1
net/netfilter/nf_log_common.c
··· 133 133 134 134 void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) 135 135 { 136 - if (!sk || sk->sk_state == TCP_TIME_WAIT) 136 + if (!sk || !sk_fullsock(sk)) 137 137 return; 138 138 139 139 read_lock_bh(&sk->sk_callback_lock);
+2 -2
net/netfilter/nft_meta.c
··· 83 83 *(u16 *)dest->data = out->type; 84 84 break; 85 85 case NFT_META_SKUID: 86 - if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) 86 + if (skb->sk == NULL || !sk_fullsock(skb->sk)) 87 87 goto err; 88 88 89 89 read_lock_bh(&skb->sk->sk_callback_lock); ··· 99 99 read_unlock_bh(&skb->sk->sk_callback_lock); 100 100 break; 101 101 case NFT_META_SKGID: 102 - if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) 102 + if (skb->sk == NULL || !sk_fullsock(skb->sk)) 103 103 goto err; 104 104 105 105 read_lock_bh(&skb->sk->sk_callback_lock);
+12 -6
net/netfilter/xt_TPROXY.c
··· 42 42 43 43 static bool tproxy_sk_is_transparent(struct sock *sk) 44 44 { 45 - if (sk->sk_state != TCP_TIME_WAIT) { 46 - if (inet_sk(sk)->transparent) 47 - return true; 48 - sock_put(sk); 49 - } else { 45 + switch (sk->sk_state) { 46 + case TCP_TIME_WAIT: 50 47 if (inet_twsk(sk)->tw_transparent) 51 48 return true; 52 - inet_twsk_put(inet_twsk(sk)); 49 + break; 50 + case TCP_NEW_SYN_RECV: 51 + if (inet_rsk(inet_reqsk(sk))->no_srccheck) 52 + return true; 53 + break; 54 + default: 55 + if (inet_sk(sk)->transparent) 56 + return true; 53 57 } 58 + 59 + sock_gen_put(sk); 54 60 return false; 55 61 } 56 62
+22 -12
net/netfilter/xt_socket.c
··· 129 129 return NULL; 130 130 } 131 131 132 + static bool xt_socket_sk_is_transparent(struct sock *sk) 133 + { 134 + switch (sk->sk_state) { 135 + case TCP_TIME_WAIT: 136 + return inet_twsk(sk)->tw_transparent; 137 + 138 + case TCP_NEW_SYN_RECV: 139 + return inet_rsk(inet_reqsk(sk))->no_srccheck; 140 + 141 + default: 142 + return inet_sk(sk)->transparent; 143 + } 144 + } 145 + 132 146 static bool 133 147 socket_match(const struct sk_buff *skb, struct xt_action_param *par, 134 148 const struct xt_socket_mtinfo1 *info) ··· 209 195 * unless XT_SOCKET_NOWILDCARD is set 210 196 */ 211 197 wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && 212 - sk->sk_state != TCP_TIME_WAIT && 198 + sk_fullsock(sk) && 213 199 inet_sk(sk)->inet_rcv_saddr == 0); 214 200 215 201 /* Ignore non-transparent sockets, 216 - if XT_SOCKET_TRANSPARENT is used */ 202 + * if XT_SOCKET_TRANSPARENT is used 203 + */ 217 204 if (info->flags & XT_SOCKET_TRANSPARENT) 218 - transparent = ((sk->sk_state != TCP_TIME_WAIT && 219 - inet_sk(sk)->transparent) || 220 - (sk->sk_state == TCP_TIME_WAIT && 221 - inet_twsk(sk)->tw_transparent)); 205 + transparent = xt_socket_sk_is_transparent(sk); 222 206 223 207 if (sk != skb->sk) 224 208 sock_gen_put(sk); ··· 375 363 * unless XT_SOCKET_NOWILDCARD is set 376 364 */ 377 365 wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && 378 - sk->sk_state != TCP_TIME_WAIT && 366 + sk_fullsock(sk) && 379 367 ipv6_addr_any(&sk->sk_v6_rcv_saddr)); 380 368 381 369 /* Ignore non-transparent sockets, 382 - if XT_SOCKET_TRANSPARENT is used */ 370 + * if XT_SOCKET_TRANSPARENT is used 371 + */ 383 372 if (info->flags & XT_SOCKET_TRANSPARENT) 384 - transparent = ((sk->sk_state != TCP_TIME_WAIT && 385 - inet_sk(sk)->transparent) || 386 - (sk->sk_state == TCP_TIME_WAIT && 387 - inet_twsk(sk)->tw_transparent)); 373 + transparent = xt_socket_sk_is_transparent(sk); 388 374 389 375 if (sk != skb->sk) 390 376 sock_gen_put(sk);