Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipv4: tcp: fix TOS value in ACK messages sent from TIME_WAIT

There is a long-standing bug in the Linux TCP stack concerning ACK
messages sent on behalf of TIME_WAIT sockets.

In the IP header of the ACK message, we choose to reflect the TOS field
of the incoming message, and this might break some setups.

Examples of things that were broken:
- Routing using TOS as a selector
- Firewalls
- Traffic classification / shaping

We now remember the inet tos field in the timewait structure and use it
in ACK generation and route lookup.

Notes :
- We still reflect incoming TOS in RST messages.
- We could extend MuraliRaja Muniraju's patch to report the TOS value in
netlink messages for TIME_WAIT sockets.
- A patch is needed for IPv6

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
66b13d99 318cf7aa

+15 -9
+2 -1
include/net/inet_timewait_sock.h
··· 126 126 /* And these are ours. */ 127 127 unsigned int tw_ipv6only : 1, 128 128 tw_transparent : 1, 129 - tw_pad : 14, /* 14 bits hole */ 129 + tw_pad : 6, /* 6 bits hole */ 130 + tw_tos : 8, 130 131 tw_ipv6_offset : 16; 131 132 kmemcheck_bitfield_end(flags); 132 133 unsigned long tw_ttd;
+2 -1
include/net/ip.h
··· 165 165 int csumoffset; /* u16 offset of csum in iov[0].iov_base */ 166 166 /* -1 if not needed */ 167 167 int bound_dev_if; 168 + u8 tos; 168 169 }; 169 170 170 171 #define IP_REPLY_ARG_NOSRCCHECK 1 ··· 176 175 } 177 176 178 177 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 179 - struct ip_reply_arg *arg, unsigned int len); 178 + const struct ip_reply_arg *arg, unsigned int len); 180 179 181 180 struct ipv4_config { 182 181 int log_martians;
+1
net/ipv4/inet_timewait_sock.c
··· 183 183 tw->tw_daddr = inet->inet_daddr; 184 184 tw->tw_rcv_saddr = inet->inet_rcv_saddr; 185 185 tw->tw_bound_dev_if = sk->sk_bound_dev_if; 186 + tw->tw_tos = inet->tos; 186 187 tw->tw_num = inet->inet_num; 187 188 tw->tw_state = TCP_TIME_WAIT; 188 189 tw->tw_substate = state;
+3 -3
net/ipv4/ip_output.c
··· 1466 1466 * structure to pass arguments. 1467 1467 */ 1468 1468 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1469 - struct ip_reply_arg *arg, unsigned int len) 1469 + const struct ip_reply_arg *arg, unsigned int len) 1470 1470 { 1471 1471 struct inet_sock *inet = inet_sk(sk); 1472 1472 struct ip_options_data replyopts; ··· 1489 1489 } 1490 1490 1491 1491 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1492 - RT_TOS(ip_hdr(skb)->tos), 1492 + RT_TOS(arg->tos), 1493 1493 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1494 1494 ip_reply_arg_flowi_flags(arg), 1495 1495 daddr, rt->rt_spec_dst, ··· 1506 1506 with locally disabled BH and that sk cannot be already spinlocked. 1507 1507 */ 1508 1508 bh_lock_sock(sk); 1509 - inet->tos = ip_hdr(skb)->tos; 1509 + inet->tos = arg->tos; 1510 1510 sk->sk_priority = skb->priority; 1511 1511 sk->sk_protocol = ip_hdr(skb)->protocol; 1512 1512 sk->sk_bound_dev_if = arg->bound_dev_if;
+7 -4
net/ipv4/tcp_ipv4.c
··· 652 652 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; 653 653 654 654 net = dev_net(skb_dst(skb)->dev); 655 + arg.tos = ip_hdr(skb)->tos; 655 656 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 656 657 &arg, arg.iov[0].iov_len); 657 658 ··· 667 666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, 668 667 u32 win, u32 ts, int oif, 669 668 struct tcp_md5sig_key *key, 670 - int reply_flags) 669 + int reply_flags, u8 tos) 671 670 { 672 671 const struct tcphdr *th = tcp_hdr(skb); 673 672 struct { ··· 727 726 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 728 727 if (oif) 729 728 arg.bound_dev_if = oif; 730 - 729 + arg.tos = tos; 731 730 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 732 731 &arg, arg.iov[0].iov_len); 733 732 ··· 744 743 tcptw->tw_ts_recent, 745 744 tw->tw_bound_dev_if, 746 745 tcp_twsk_md5_key(tcptw), 747 - tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 746 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 747 + tw->tw_tos 748 748 ); 749 749 750 750 inet_twsk_put(tw); ··· 759 757 req->ts_recent, 760 758 0, 761 759 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), 762 - inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); 760 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 761 + ip_hdr(skb)->tos); 763 762 } 764 763 765 764 /*