[TCP]: Allow choosing TCP congestion control via sockopt.

Allow using setsockopt to choose the TCP congestion control algorithm
to use on a per-socket basis.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Stephen Hemminger, committed by David S. Miller (5f8ef48d, 51b0bded)
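From userspace, the new option takes the algorithm name as a string rather than an integer. A minimal sketch of a caller (the helper name is illustrative and error handling is omitted; TCP_CONGESTION may need to be defined as 13 by hand if the installed headers predate this patch):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>        /* TCP_CONGESTION, if present in the headers */

/* Pick the congestion control algorithm for one socket (illustrative helper). */
static int set_tcp_cc(int fd, const char *name)
{
        /* The kernel copies at most TCP_CA_NAME_MAX-1 bytes and NUL-terminates. */
        return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
}

Because tcp_init_congestion_control() now bails out when ca_ops no longer points at tcp_init_congestion_ops (see the tcp_cong.c hunk below), a choice made before the connection is established is not overwritten by the system default at SYN time.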

6 files changed, 79 insertions(+), 6 deletions(-)
include/linux/tcp.h (+1)

···
 #define TCP_WINDOW_CLAMP 10  /* Bound advertised window */
 #define TCP_INFO         11  /* Information about this connection. */
 #define TCP_QUICKACK     12  /* Block/reenable quick acks */
+#define TCP_CONGESTION   13  /* Congestion control algorithm */

 #define TCPI_OPT_TIMESTAMPS 1
 #define TCPI_OPT_SACK       2
···
include/net/tcp.h (+2 -1)

···
 extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
 extern int tcp_set_default_congestion_control(const char *name);
 extern void tcp_get_default_congestion_control(char *name);
+extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name);

-extern struct tcp_congestion_ops tcp_reno;
+extern struct tcp_congestion_ops tcp_init_congestion_ops;
 extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
 extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
                                 u32 rtt, u32 in_flight, int flag);
···
net/ipv4/tcp.c (+30 -1)

···
         return tp->af_specific->setsockopt(sk, level, optname,
                                            optval, optlen);

+        /* This is a string value all the others are int's */
+        if (optname == TCP_CONGESTION) {
+                char name[TCP_CA_NAME_MAX];
+
+                if (optlen < 1)
+                        return -EINVAL;
+
+                val = strncpy_from_user(name, optval,
+                                        min(TCP_CA_NAME_MAX-1, optlen));
+                if (val < 0)
+                        return -EFAULT;
+                name[val] = 0;
+
+                lock_sock(sk);
+                err = tcp_set_congestion_control(tp, name);
+                release_sock(sk);
+                return err;
+        }
+
         if (optlen < sizeof(int))
                 return -EINVAL;
···
         case TCP_QUICKACK:
                 val = !tp->ack.pingpong;
                 break;
+
+        case TCP_CONGESTION:
+                if (get_user(len, optlen))
+                        return -EFAULT;
+                len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+                if (put_user(len, optlen))
+                        return -EFAULT;
+                if (copy_to_user(optval, tp->ca_ops->name, len))
+                        return -EFAULT;
+                return 0;
         default:
                 return -ENOPROTOOPT;
         };
···
 extern void __skb_cb_too_small_for_tcp(int, int);
-extern void tcpdiag_init(void);
+extern struct tcp_congestion_ops tcp_reno;

 static __initdata unsigned long thash_entries;
 static int __init set_thash_entries(char *str)
···
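The matching getsockopt() path returns the name of the algorithm currently attached to the socket, truncated to the caller's buffer. A read-back sketch, assuming TCP_CA_NAME_MAX is 16 as in this tree:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Print the congestion control currently bound to the socket (illustrative). */
static void show_tcp_cc(int fd)
{
        char name[16];                  /* assumed TCP_CA_NAME_MAX */
        socklen_t len = sizeof(name);

        if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
                printf("congestion control: %.*s\n", (int)len, name);
}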
net/ipv4/tcp_cong.c (+44 -2)

···
 {
         struct tcp_congestion_ops *e;

-        list_for_each_entry(e, &tcp_cong_list, list) {
+        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                 if (strcmp(e->name, name) == 0)
                         return e;
         }
···
 void tcp_init_congestion_control(struct tcp_sock *tp)
 {
         struct tcp_congestion_ops *ca;
+
+        if (tp->ca_ops != &tcp_init_congestion_ops)
+                return;

         rcu_read_lock();
         list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
···
         rcu_read_unlock();
 }

+/* Change congestion control for socket */
+int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+{
+        struct tcp_congestion_ops *ca;
+        int err = 0;
+
+        rcu_read_lock();
+        ca = tcp_ca_find(name);
+        if (ca == tp->ca_ops)
+                goto out;
+
+        if (!ca)
+                err = -ENOENT;
+
+        else if (!try_module_get(ca->owner))
+                err = -EBUSY;
+
+        else {
+                tcp_cleanup_congestion_control(tp);
+                tp->ca_ops = ca;
+                if (tp->ca_ops->init)
+                        tp->ca_ops->init(tp);
+        }
+ out:
+        rcu_read_unlock();
+        return err;
+}
+
 /*
  * TCP Reno congestion control
  * This is special case used for fallback as well.
···
         .min_cwnd       = tcp_reno_min_cwnd,
 };

-EXPORT_SYMBOL_GPL(tcp_reno);
+/* Initial congestion control used (until SYN)
+ * really reno under another name so we can tell difference
+ * during tcp_set_default_congestion_control
+ */
+struct tcp_congestion_ops tcp_init_congestion_ops = {
+        .name           = "",
+        .owner          = THIS_MODULE,
+        .ssthresh       = tcp_reno_ssthresh,
+        .cong_avoid     = tcp_reno_cong_avoid,
+        .min_cwnd       = tcp_reno_min_cwnd,
+};
+EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
···
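The try_module_get(ca->owner) above is what pins a loadable algorithm's module while a socket is using it; tcp_cleanup_congestion_control() releases that reference when the socket switches away or is destroyed. For context, a module-provided algorithm fills in a tcp_congestion_ops table much like tcp_init_congestion_ops and registers it. The following is only a sketch that reuses the exported Reno helpers and assumes the tcp_register_congestion_control()/tcp_unregister_congestion_control() interface already in tcp_cong.c:

#include <linux/init.h>
#include <linux/module.h>
#include <net/tcp.h>

/* Minimal congestion control module sketch: Reno behaviour under a new name. */
static struct tcp_congestion_ops tcp_example = {
        .name           = "example",
        .owner          = THIS_MODULE,  /* held via try_module_get() while a socket uses it */
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};

static int __init tcp_example_register(void)
{
        return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_unregister(void)
{
        tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_register);
module_exit(tcp_example_unregister);
MODULE_LICENSE("GPL");

Once such a module is loaded, setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "example", 7) selects it for a single socket, while tcp_set_default_congestion_control() continues to govern the system-wide default; an unknown name makes the setsockopt fail with ENOENT rather than loading a module automatically.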
net/ipv4/tcp_ipv4.c (+1 -1)

···
         tp->mss_cache_std = tp->mss_cache = 536;

         tp->reordering = sysctl_tcp_reordering;
-        tp->ca_ops = &tcp_reno;
+        tp->ca_ops = &tcp_init_congestion_ops;

         sk->sk_state = TCP_CLOSE;
···
net/ipv6/tcp_ipv6.c (+1 -1)

···
         sk->sk_state = TCP_CLOSE;

         tp->af_specific = &ipv6_specific;
-        tp->ca_ops = &tcp_reno;
+        tp->ca_ops = &tcp_init_congestion_ops;
         sk->sk_write_space = sk_stream_write_space;
         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
···