Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'SO_PRIORITY'

Eric Dumazet says:

====================
tcp: provide correct skb->priority

SO_PRIORITY socket option requests TCP egress packets
to contain a user provided value.

TCP manages to send most packets with the requested values,
notably for the TCP_ESTABLISHED state, but fails to do so for
a few packets.

These packets are control packets sent on behalf
of SYN_RECV or TIME_WAIT states.

Note that testing this with packetdrill is a bit
of a hassle, since packetdrill cannot verify the priority
of egress packets other than through indirect observations,
for example by using sch_prio on its tunnel device.

The bad skb priorities cause problems for GCP,
as this field is one of the keys used in routing.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+29 -17
+1
include/net/inet_timewait_sock.h
··· 71 71 tw_pad : 2, /* 2 bits hole */ 72 72 tw_tos : 8; 73 73 u32 tw_txhash; 74 + u32 tw_priority; 74 75 struct timer_list tw_timer; 75 76 struct inet_bind_bucket *tw_tb; 76 77 };
+1 -1
include/net/ipv6.h
··· 981 981 * upper-layer output functions 982 982 */ 983 983 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 984 - __u32 mark, struct ipv6_txoptions *opt, int tclass); 984 + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); 985 985 986 986 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); 987 987
+3 -2
net/dccp/ipv6.c
··· 230 230 opt = ireq->ipv6_opt; 231 231 if (!opt) 232 232 opt = rcu_dereference(np->opt); 233 - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); 233 + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, 234 + sk->sk_priority); 234 235 rcu_read_unlock(); 235 236 err = net_xmit_eval(err); 236 237 } ··· 285 284 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); 286 285 if (!IS_ERR(dst)) { 287 286 skb_dst_set(skb, dst); 288 - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); 287 + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); 289 288 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 290 289 DCCP_INC_STATS(DCCP_MIB_OUTRSTS); 291 290 return;
-1
net/ipv4/ip_output.c
··· 1694 1694 1695 1695 inet_sk(sk)->tos = arg->tos; 1696 1696 1697 - sk->sk_priority = skb->priority; 1698 1697 sk->sk_protocol = ip_hdr(skb)->protocol; 1699 1698 sk->sk_bound_dev_if = arg->bound_dev_if; 1700 1699 sk->sk_sndbuf = sysctl_wmem_default;
+4
net/ipv4/tcp_ipv4.c
··· 771 771 if (sk) { 772 772 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 773 773 inet_twsk(sk)->tw_mark : sk->sk_mark; 774 + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 775 + inet_twsk(sk)->tw_priority : sk->sk_priority; 774 776 transmit_time = tcp_transmit_time(sk); 775 777 } 776 778 ip_send_unicast_reply(ctl_sk, ··· 868 866 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 869 867 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 870 868 inet_twsk(sk)->tw_mark : sk->sk_mark; 869 + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 870 + inet_twsk(sk)->tw_priority : sk->sk_priority; 871 871 transmit_time = tcp_transmit_time(sk); 872 872 ip_send_unicast_reply(ctl_sk, 873 873 skb, &TCP_SKB_CB(skb)->header.h4.opt,
+1
net/ipv4/tcp_minisocks.c
··· 266 266 267 267 tw->tw_transparent = inet->transparent; 268 268 tw->tw_mark = sk->sk_mark; 269 + tw->tw_priority = sk->sk_priority; 269 270 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 270 271 tcptw->tw_rcv_nxt = tp->rcv_nxt; 271 272 tcptw->tw_snd_nxt = tp->snd_nxt;
+1 -1
net/ipv6/inet6_connection_sock.c
··· 133 133 fl6.daddr = sk->sk_v6_daddr; 134 134 135 135 res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), 136 - np->tclass); 136 + np->tclass, sk->sk_priority); 137 137 rcu_read_unlock(); 138 138 return res; 139 139 }
+2 -2
net/ipv6/ip6_output.c
··· 193 193 * which are using proper atomic operations or spinlocks. 194 194 */ 195 195 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 196 - __u32 mark, struct ipv6_txoptions *opt, int tclass) 196 + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) 197 197 { 198 198 struct net *net = sock_net(sk); 199 199 const struct ipv6_pinfo *np = inet6_sk(sk); ··· 258 258 hdr->daddr = *first_hop; 259 259 260 260 skb->protocol = htons(ETH_P_IPV6); 261 - skb->priority = sk->sk_priority; 261 + skb->priority = priority; 262 262 skb->mark = mark; 263 263 264 264 mtu = dst_mtu(dst);
+15 -9
net/ipv6/tcp_ipv6.c
··· 512 512 opt = ireq->ipv6_opt; 513 513 if (!opt) 514 514 opt = rcu_dereference(np->opt); 515 - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); 515 + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass, 516 + sk->sk_priority); 516 517 rcu_read_unlock(); 517 518 err = net_xmit_eval(err); 518 519 } ··· 804 803 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, 805 804 u32 ack, u32 win, u32 tsval, u32 tsecr, 806 805 int oif, struct tcp_md5sig_key *key, int rst, 807 - u8 tclass, __be32 label) 806 + u8 tclass, __be32 label, u32 priority) 808 807 { 809 808 const struct tcphdr *th = tcp_hdr(skb); 810 809 struct tcphdr *t1; ··· 908 907 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); 909 908 if (!IS_ERR(dst)) { 910 909 skb_dst_set(buff, dst); 911 - ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); 910 + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, 911 + priority); 912 912 TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 913 913 if (rst) 914 914 TCP_INC_STATS(net, TCP_MIB_OUTRSTS); ··· 932 930 struct sock *sk1 = NULL; 933 931 #endif 934 932 __be32 label = 0; 933 + u32 priority = 0; 935 934 struct net *net; 936 935 int oif = 0; 937 936 ··· 993 990 trace_tcp_send_reset(sk, skb); 994 991 if (np->repflow) 995 992 label = ip6_flowlabel(ipv6h); 993 + priority = sk->sk_priority; 996 994 } 997 - if (sk->sk_state == TCP_TIME_WAIT) 995 + if (sk->sk_state == TCP_TIME_WAIT) { 998 996 label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); 997 + priority = inet_twsk(sk)->tw_priority; 998 + } 999 999 } else { 1000 1000 if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) 1001 1001 label = ip6_flowlabel(ipv6h); 1002 1002 } 1003 1003 1004 1004 tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 1005 - label); 1005 + label, priority); 1006 1006 1007 1007 #ifdef CONFIG_TCP_MD5SIG 1008 1008 out: ··· 1016 1010 static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, 1017 1011 
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, 1018 1012 struct tcp_md5sig_key *key, u8 tclass, 1019 - __be32 label) 1013 + __be32 label, u32 priority) 1020 1014 { 1021 1015 tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, 1022 - tclass, label); 1016 + tclass, label, priority); 1023 1017 } 1024 1018 1025 1019 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) ··· 1031 1025 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1032 1026 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 1033 1027 tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), 1034 - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); 1028 + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority); 1035 1029 1036 1030 inet_twsk_put(tw); 1037 1031 } ··· 1054 1048 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 1055 1049 req->ts_recent, sk->sk_bound_dev_if, 1056 1050 tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), 1057 - 0, 0); 1051 + 0, 0, sk->sk_priority); 1058 1052 } 1059 1053 1060 1054
+1 -1
net/sctp/ipv6.c
··· 215 215 216 216 rcu_read_lock(); 217 217 res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), 218 - tclass); 218 + tclass, sk->sk_priority); 219 219 rcu_read_unlock(); 220 220 return res; 221 221 }