Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: Namespace-ify sysctl_tcp_default_congestion_control

Make the default TCP congestion control a per-namespace
value. This changes the default congestion control to a pointer to congestion ops
(rather than implicitly being the first element of the available list).

The congestion control setting of new namespaces is inherited
from the current setting of the root namespace.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Stephen Hemminger and committed by
David S. Miller
6670e152 11bf284f

+64 -54
+1
include/net/netns/ipv4.h
··· 160 160 struct inet_timewait_death_row tcp_death_row; 161 161 int sysctl_max_syn_backlog; 162 162 int sysctl_tcp_fastopen; 163 + const struct tcp_congestion_ops __rcu *tcp_congestion_control; 163 164 struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; 164 165 spinlock_t tcp_fastopen_ctx_lock; 165 166 unsigned int sysctl_tcp_fastopen_blackhole_timeout;
+3 -3
include/net/tcp.h
··· 1002 1002 void tcp_assign_congestion_control(struct sock *sk); 1003 1003 void tcp_init_congestion_control(struct sock *sk); 1004 1004 void tcp_cleanup_congestion_control(struct sock *sk); 1005 - int tcp_set_default_congestion_control(const char *name); 1006 - void tcp_get_default_congestion_control(char *name); 1005 + int tcp_set_default_congestion_control(struct net *net, const char *name); 1006 + void tcp_get_default_congestion_control(struct net *net, char *name); 1007 1007 void tcp_get_available_congestion_control(char *buf, size_t len); 1008 1008 void tcp_get_allowed_congestion_control(char *buf, size_t len); 1009 1009 int tcp_set_allowed_congestion_control(char *allowed); ··· 1017 1017 extern struct tcp_congestion_ops tcp_reno; 1018 1018 1019 1019 struct tcp_congestion_ops *tcp_ca_find_key(u32 key); 1020 - u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); 1020 + u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); 1021 1021 #ifdef CONFIG_INET 1022 1022 char *tcp_ca_get_name_by_key(u32 key, char *buffer); 1023 1023 #else
+2 -2
net/ipv4/fib_semantics.c
··· 710 710 bool ecn_ca = false; 711 711 712 712 nla_strlcpy(tmp, nla, sizeof(tmp)); 713 - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 713 + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); 714 714 } else { 715 715 val = nla_get_u32(nla); 716 716 } ··· 1030 1030 char tmp[TCP_CA_NAME_MAX]; 1031 1031 1032 1032 nla_strlcpy(tmp, nla, sizeof(tmp)); 1033 - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 1033 + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); 1034 1034 if (val == TCP_CA_UNSPEC) 1035 1035 return -EINVAL; 1036 1036 } else {
+11 -8
net/ipv4/sysctl_net_ipv4.c
··· 201 201 static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, 202 202 void __user *buffer, size_t *lenp, loff_t *ppos) 203 203 { 204 + struct net *net = container_of(ctl->data, struct net, 205 + ipv4.tcp_congestion_control); 204 206 char val[TCP_CA_NAME_MAX]; 205 207 struct ctl_table tbl = { 206 208 .data = val, ··· 210 208 }; 211 209 int ret; 212 210 213 - tcp_get_default_congestion_control(val); 211 + tcp_get_default_congestion_control(net, val); 214 212 215 213 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 216 214 if (write && ret == 0) 217 - ret = tcp_set_default_congestion_control(val); 215 + ret = tcp_set_default_congestion_control(net, val); 218 216 return ret; 219 217 } 220 218 ··· 448 446 .maxlen = sizeof(int), 449 447 .mode = 0644, 450 448 .proc_handler = proc_dointvec 451 - }, 452 - { 453 - .procname = "tcp_congestion_control", 454 - .mode = 0644, 455 - .maxlen = TCP_CA_NAME_MAX, 456 - .proc_handler = proc_tcp_congestion_control, 457 449 }, 458 450 #ifdef CONFIG_NETLABEL 459 451 { ··· 759 763 .extra1 = &one 760 764 }, 761 765 #endif 766 + { 767 + .procname = "tcp_congestion_control", 768 + .data = &init_net.ipv4.tcp_congestion_control, 769 + .mode = 0644, 770 + .maxlen = TCP_CA_NAME_MAX, 771 + .proc_handler = proc_tcp_congestion_control, 772 + }, 762 773 { 763 774 .procname = "tcp_keepalive_time", 764 775 .data = &init_net.ipv4.sysctl_tcp_keepalive_time,
+36 -40
net/ipv4/tcp_cong.c
··· 33 33 } 34 34 35 35 /* Must be called with rcu lock held */ 36 - static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) 36 + static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, 37 + const char *name) 37 38 { 38 - const struct tcp_congestion_ops *ca = tcp_ca_find(name); 39 + struct tcp_congestion_ops *ca = tcp_ca_find(name); 40 + 39 41 #ifdef CONFIG_MODULES 40 42 if (!ca && capable(CAP_NET_ADMIN)) { 41 43 rcu_read_unlock(); ··· 117 115 } 118 116 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 119 117 120 - u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) 118 + u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) 121 119 { 122 120 const struct tcp_congestion_ops *ca; 123 121 u32 key = TCP_CA_UNSPEC; ··· 125 123 might_sleep(); 126 124 127 125 rcu_read_lock(); 128 - ca = __tcp_ca_find_autoload(name); 126 + ca = tcp_ca_find_autoload(net, name); 129 127 if (ca) { 130 128 key = ca->key; 131 129 *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; ··· 155 153 /* Assign choice of congestion control. */ 156 154 void tcp_assign_congestion_control(struct sock *sk) 157 155 { 156 + struct net *net = sock_net(sk); 158 157 struct inet_connection_sock *icsk = inet_csk(sk); 159 - struct tcp_congestion_ops *ca; 158 + const struct tcp_congestion_ops *ca; 160 159 161 160 rcu_read_lock(); 162 - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 163 - if (likely(try_module_get(ca->owner))) { 164 - icsk->icsk_ca_ops = ca; 165 - goto out; 166 - } 167 - /* Fallback to next available. The last really 168 - * guaranteed fallback is Reno from this list. 
169 - */ 170 - } 171 - out: 161 + ca = rcu_dereference(net->ipv4.tcp_congestion_control); 162 + if (unlikely(!try_module_get(ca->owner))) 163 + ca = &tcp_reno; 164 + icsk->icsk_ca_ops = ca; 172 165 rcu_read_unlock(); 173 - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); 174 166 167 + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); 175 168 if (ca->flags & TCP_CONG_NEEDS_ECN) 176 169 INET_ECN_xmit(sk); 177 170 else ··· 211 214 } 212 215 213 216 /* Used by sysctl to change default congestion control */ 214 - int tcp_set_default_congestion_control(const char *name) 217 + int tcp_set_default_congestion_control(struct net *net, const char *name) 215 218 { 216 219 struct tcp_congestion_ops *ca; 217 - int ret = -ENOENT; 220 + const struct tcp_congestion_ops *prev; 221 + int ret; 218 222 219 - spin_lock(&tcp_cong_list_lock); 220 - ca = tcp_ca_find(name); 221 - #ifdef CONFIG_MODULES 222 - if (!ca && capable(CAP_NET_ADMIN)) { 223 - spin_unlock(&tcp_cong_list_lock); 223 + rcu_read_lock(); 224 + ca = tcp_ca_find_autoload(net, name); 225 + if (!ca) { 226 + ret = -ENOENT; 227 + } else if (!try_module_get(ca->owner)) { 228 + ret = -EBUSY; 229 + } else { 230 + prev = xchg(&net->ipv4.tcp_congestion_control, ca); 231 + if (prev) 232 + module_put(prev->owner); 224 233 225 - request_module("tcp_%s", name); 226 - spin_lock(&tcp_cong_list_lock); 227 - ca = tcp_ca_find(name); 228 - } 229 - #endif 230 - 231 - if (ca) { 232 - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ 233 - list_move(&ca->list, &tcp_cong_list); 234 + ca->flags |= TCP_CONG_NON_RESTRICTED; 234 235 ret = 0; 235 236 } 236 - spin_unlock(&tcp_cong_list_lock); 237 + rcu_read_unlock(); 237 238 238 239 return ret; 239 240 } ··· 239 244 /* Set default value from kernel configuration at bootup */ 240 245 static int __init tcp_congestion_default(void) 241 246 { 242 - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); 247 + return 
tcp_set_default_congestion_control(&init_net, 248 + CONFIG_DEFAULT_TCP_CONG); 243 249 } 244 250 late_initcall(tcp_congestion_default); 245 251 ··· 260 264 } 261 265 262 266 /* Get current default congestion control */ 263 - void tcp_get_default_congestion_control(char *name) 267 + void tcp_get_default_congestion_control(struct net *net, char *name) 264 268 { 265 - struct tcp_congestion_ops *ca; 266 - /* We will always have reno... */ 267 - BUG_ON(list_empty(&tcp_cong_list)); 269 + const struct tcp_congestion_ops *ca; 268 270 269 271 rcu_read_lock(); 270 - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); 272 + ca = rcu_dereference(net->ipv4.tcp_congestion_control); 271 273 strncpy(name, ca->name, TCP_CA_NAME_MAX); 272 274 rcu_read_unlock(); 273 275 } ··· 345 351 if (!load) 346 352 ca = tcp_ca_find(name); 347 353 else 348 - ca = __tcp_ca_find_autoload(name); 354 + ca = tcp_ca_find_autoload(sock_net(sk), name); 355 + 349 356 /* No change asking for existing value */ 350 357 if (ca == icsk->icsk_ca_ops) { 351 358 icsk->icsk_ca_setsockopt = 1; 352 359 goto out; 353 360 } 361 + 354 362 if (!ca) { 355 363 err = -ENOENT; 356 364 } else if (!load) {
+9
net/ipv4/tcp_ipv4.c
··· 2430 2430 { 2431 2431 int cpu; 2432 2432 2433 + module_put(net->ipv4.tcp_congestion_control->owner); 2434 + 2433 2435 for_each_possible_cpu(cpu) 2434 2436 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2435 2437 free_percpu(net->ipv4.tcp_sk); ··· 2523 2521 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2524 2522 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2525 2523 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2524 + 2525 + /* Reno is always built in */ 2526 + if (!net_eq(net, &init_net) && 2527 + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) 2528 + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2529 + else 2530 + net->ipv4.tcp_congestion_control = &tcp_reno; 2526 2531 2527 2532 return 0; 2528 2533 fail:
+2 -1
net/ipv6/route.c
··· 2378 2378 static int ip6_convert_metrics(struct mx6_config *mxc, 2379 2379 const struct fib6_config *cfg) 2380 2380 { 2381 + struct net *net = cfg->fc_nlinfo.nl_net; 2381 2382 bool ecn_ca = false; 2382 2383 struct nlattr *nla; 2383 2384 int remaining; ··· 2404 2403 char tmp[TCP_CA_NAME_MAX]; 2405 2404 2406 2405 nla_strlcpy(tmp, nla, sizeof(tmp)); 2407 - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 2406 + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); 2408 2407 if (val == TCP_CA_UNSPEC) 2409 2408 goto err; 2410 2409 } else {