at v4.20-rc5 600 lines 14 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2/* -*- linux-c -*- 3 * sysctl_net_core.c: sysctl interface to net core subsystem. 4 * 5 * Begun April 1, 1996, Mike Shaver. 6 * Added /proc/sys/net/core directory entry (empty =) ). [MS] 7 */ 8 9#include <linux/mm.h> 10#include <linux/sysctl.h> 11#include <linux/module.h> 12#include <linux/socket.h> 13#include <linux/netdevice.h> 14#include <linux/ratelimit.h> 15#include <linux/vmalloc.h> 16#include <linux/init.h> 17#include <linux/slab.h> 18 19#include <net/ip.h> 20#include <net/sock.h> 21#include <net/net_ratelimit.h> 22#include <net/busy_poll.h> 23#include <net/pkt_sched.h> 24 25static int zero = 0; 26static int one = 1; 27static int two __maybe_unused = 2; 28static int min_sndbuf = SOCK_MIN_SNDBUF; 29static int min_rcvbuf = SOCK_MIN_RCVBUF; 30static int max_skb_frags = MAX_SKB_FRAGS; 31 32static int net_msg_warn; /* Unused, but still a sysctl */ 33 34int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; 35EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); 36 37#ifdef CONFIG_RPS 38static int rps_sock_flow_sysctl(struct ctl_table *table, int write, 39 void __user *buffer, size_t *lenp, loff_t *ppos) 40{ 41 unsigned int orig_size, size; 42 int ret, i; 43 struct ctl_table tmp = { 44 .data = &size, 45 .maxlen = sizeof(size), 46 .mode = table->mode 47 }; 48 struct rps_sock_flow_table *orig_sock_table, *sock_table; 49 static DEFINE_MUTEX(sock_flow_mutex); 50 51 mutex_lock(&sock_flow_mutex); 52 53 orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, 54 lockdep_is_held(&sock_flow_mutex)); 55 size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; 56 57 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); 58 59 if (write) { 60 if (size) { 61 if (size > 1<<29) { 62 /* Enforce limit to prevent overflow */ 63 mutex_unlock(&sock_flow_mutex); 64 return -EINVAL; 65 } 66 size = roundup_pow_of_two(size); 67 if (size != orig_size) { 68 sock_table = 69 vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); 70 if (!sock_table) { 71 mutex_unlock(&sock_flow_mutex); 72 return -ENOMEM; 73 } 74 rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; 75 sock_table->mask = size - 1; 76 } else 77 sock_table = orig_sock_table; 78 79 for (i = 0; i < size; i++) 80 sock_table->ents[i] = RPS_NO_CPU; 81 } else 82 sock_table = NULL; 83 84 if (sock_table != orig_sock_table) { 85 rcu_assign_pointer(rps_sock_flow_table, sock_table); 86 if (sock_table) { 87 static_key_slow_inc(&rps_needed); 88 static_key_slow_inc(&rfs_needed); 89 } 90 if (orig_sock_table) { 91 static_key_slow_dec(&rps_needed); 92 static_key_slow_dec(&rfs_needed); 93 synchronize_rcu(); 94 vfree(orig_sock_table); 95 } 96 } 97 } 98 99 mutex_unlock(&sock_flow_mutex); 100 101 return ret; 102} 103#endif /* CONFIG_RPS */ 104 105#ifdef CONFIG_NET_FLOW_LIMIT 106static DEFINE_MUTEX(flow_limit_update_mutex); 107 108static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, 109 void __user *buffer, size_t *lenp, 110 loff_t *ppos) 111{ 112 struct sd_flow_limit *cur; 113 struct softnet_data *sd; 114 cpumask_var_t mask; 115 int i, len, ret = 0; 116 117 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 118 return -ENOMEM; 119 120 if (write) { 121 ret = cpumask_parse_user(buffer, *lenp, mask); 122 if (ret) 123 goto done; 124 125 mutex_lock(&flow_limit_update_mutex); 126 len = sizeof(*cur) + netdev_flow_limit_table_len; 127 for_each_possible_cpu(i) { 128 sd = &per_cpu(softnet_data, i); 129 cur = rcu_dereference_protected(sd->flow_limit, 130 lockdep_is_held(&flow_limit_update_mutex)); 131 if (cur && !cpumask_test_cpu(i, mask)) { 132 RCU_INIT_POINTER(sd->flow_limit, NULL); 133 synchronize_rcu(); 134 kfree(cur); 135 } else if (!cur && cpumask_test_cpu(i, mask)) { 136 cur = kzalloc_node(len, GFP_KERNEL, 137 cpu_to_node(i)); 138 if (!cur) { 139 /* not unwinding previous changes */ 140 ret = -ENOMEM; 141 goto write_unlock; 142 } 143 cur->num_buckets = netdev_flow_limit_table_len; 144 rcu_assign_pointer(sd->flow_limit, cur); 145 } 146 } 147write_unlock: 148 mutex_unlock(&flow_limit_update_mutex); 149 } else { 150 char kbuf[128]; 151 152 if (*ppos || !*lenp) { 153 *lenp = 0; 154 goto done; 155 } 156 157 cpumask_clear(mask); 158 rcu_read_lock(); 159 for_each_possible_cpu(i) { 160 sd = &per_cpu(softnet_data, i); 161 if (rcu_dereference(sd->flow_limit)) 162 cpumask_set_cpu(i, mask); 163 } 164 rcu_read_unlock(); 165 166 len = min(sizeof(kbuf) - 1, *lenp); 167 len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); 168 if (!len) { 169 *lenp = 0; 170 goto done; 171 } 172 if (len < *lenp) 173 kbuf[len++] = '\n'; 174 if (copy_to_user(buffer, kbuf, len)) { 175 ret = -EFAULT; 176 goto done; 177 } 178 *lenp = len; 179 *ppos += len; 180 } 181 182done: 183 free_cpumask_var(mask); 184 return ret; 185} 186 187static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, 188 void __user *buffer, size_t *lenp, 189 loff_t *ppos) 190{ 191 unsigned int old, *ptr; 192 int ret; 193 194 mutex_lock(&flow_limit_update_mutex); 195 196 ptr = table->data; 197 old = *ptr; 198 ret = proc_dointvec(table, write, buffer, lenp, ppos); 199 if (!ret && write && !is_power_of_2(*ptr)) { 200 *ptr = old; 201 ret = -EINVAL; 202 } 203 204 mutex_unlock(&flow_limit_update_mutex); 205 return ret; 206} 207#endif /* CONFIG_NET_FLOW_LIMIT */ 208 209#ifdef CONFIG_NET_SCHED 210static int set_default_qdisc(struct ctl_table *table, int write, 211 void __user *buffer, size_t *lenp, loff_t *ppos) 212{ 213 char id[IFNAMSIZ]; 214 struct ctl_table tbl = { 215 .data = id, 216 .maxlen = IFNAMSIZ, 217 }; 218 int ret; 219 220 qdisc_get_default(id, IFNAMSIZ); 221 222 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 223 if (write && ret == 0) 224 ret = qdisc_set_default(id); 225 return ret; 226} 227#endif 228 229static int proc_do_dev_weight(struct ctl_table *table, int write, 230 void __user *buffer, size_t *lenp, loff_t *ppos) 231{ 232 int ret; 233 234 ret = proc_dointvec(table, write, buffer, lenp, ppos); 235 if (ret != 0) 236 return ret; 237 238 dev_rx_weight = weight_p * dev_weight_rx_bias; 239 dev_tx_weight = weight_p * dev_weight_tx_bias; 240 241 return ret; 242} 243 244static int proc_do_rss_key(struct ctl_table *table, int write, 245 void __user *buffer, size_t *lenp, loff_t *ppos) 246{ 247 struct ctl_table fake_table; 248 char buf[NETDEV_RSS_KEY_LEN * 3]; 249 250 snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); 251 fake_table.data = buf; 252 fake_table.maxlen = sizeof(buf); 253 return proc_dostring(&fake_table, write, buffer, lenp, ppos); 254} 255 256#ifdef CONFIG_BPF_JIT 257static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, 258 void __user *buffer, size_t *lenp, 259 loff_t *ppos) 260{ 261 int ret, jit_enable = *(int *)table->data; 262 struct ctl_table tmp = *table; 263 264 if (write && !capable(CAP_SYS_ADMIN)) 265 return -EPERM; 266 267 tmp.data = &jit_enable; 268 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 269 if (write && !ret) { 270 if (jit_enable < 2 || 271 (jit_enable == 2 && bpf_dump_raw_ok())) { 272 *(int *)table->data = jit_enable; 273 if (jit_enable == 2) 274 pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); 275 } else { 276 ret = -EPERM; 277 } 278 } 279 return ret; 280} 281 282static int 283proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, 284 void __user *buffer, size_t *lenp, 285 loff_t *ppos) 286{ 287 if (!capable(CAP_SYS_ADMIN)) 288 return -EPERM; 289 290 return proc_dointvec_minmax(table, write, buffer, lenp, ppos); 291} 292#endif 293 294static struct ctl_table net_core_table[] = { 295#ifdef CONFIG_NET 296 { 297 .procname = "wmem_max", 298 .data = &sysctl_wmem_max, 299 .maxlen = sizeof(int), 300 .mode = 0644, 301 .proc_handler = proc_dointvec_minmax, 302 .extra1 = &min_sndbuf, 303 }, 304 { 305 .procname = "rmem_max", 306 .data = &sysctl_rmem_max, 307 .maxlen = sizeof(int), 308 .mode = 0644, 309 .proc_handler = proc_dointvec_minmax, 310 .extra1 = &min_rcvbuf, 311 }, 312 { 313 .procname = "wmem_default", 314 .data = &sysctl_wmem_default, 315 .maxlen = sizeof(int), 316 .mode = 0644, 317 .proc_handler = proc_dointvec_minmax, 318 .extra1 = &min_sndbuf, 319 }, 320 { 321 .procname = "rmem_default", 322 .data = &sysctl_rmem_default, 323 .maxlen = sizeof(int), 324 .mode = 0644, 325 .proc_handler = proc_dointvec_minmax, 326 .extra1 = &min_rcvbuf, 327 }, 328 { 329 .procname = "dev_weight", 330 .data = &weight_p, 331 .maxlen = sizeof(int), 332 .mode = 0644, 333 .proc_handler = proc_do_dev_weight, 334 }, 335 { 336 .procname = "dev_weight_rx_bias", 337 .data = &dev_weight_rx_bias, 338 .maxlen = sizeof(int), 339 .mode = 0644, 340 .proc_handler = proc_do_dev_weight, 341 }, 342 { 343 .procname = "dev_weight_tx_bias", 344 .data = &dev_weight_tx_bias, 345 .maxlen = sizeof(int), 346 .mode = 0644, 347 .proc_handler = proc_do_dev_weight, 348 }, 349 { 350 .procname = "netdev_max_backlog", 351 .data = &netdev_max_backlog, 352 .maxlen = sizeof(int), 353 .mode = 0644, 354 .proc_handler = proc_dointvec 355 }, 356 { 357 .procname = "netdev_rss_key", 358 .data = &netdev_rss_key, 359 .maxlen = sizeof(int), 360 .mode = 0444, 361 .proc_handler = proc_do_rss_key, 362 }, 363#ifdef CONFIG_BPF_JIT 364 { 365 .procname = "bpf_jit_enable", 366 .data = &bpf_jit_enable, 367 .maxlen = sizeof(int), 368 .mode = 0644, 369 .proc_handler = proc_dointvec_minmax_bpf_enable, 370# ifdef CONFIG_BPF_JIT_ALWAYS_ON 371 .extra1 = &one, 372 .extra2 = &one, 373# else 374 .extra1 = &zero, 375 .extra2 = &two, 376# endif 377 }, 378# ifdef CONFIG_HAVE_EBPF_JIT 379 { 380 .procname = "bpf_jit_harden", 381 .data = &bpf_jit_harden, 382 .maxlen = sizeof(int), 383 .mode = 0600, 384 .proc_handler = proc_dointvec_minmax_bpf_restricted, 385 .extra1 = &zero, 386 .extra2 = &two, 387 }, 388 { 389 .procname = "bpf_jit_kallsyms", 390 .data = &bpf_jit_kallsyms, 391 .maxlen = sizeof(int), 392 .mode = 0600, 393 .proc_handler = proc_dointvec_minmax_bpf_restricted, 394 .extra1 = &zero, 395 .extra2 = &one, 396 }, 397# endif 398 { 399 .procname = "bpf_jit_limit", 400 .data = &bpf_jit_limit, 401 .maxlen = sizeof(int), 402 .mode = 0600, 403 .proc_handler = proc_dointvec_minmax_bpf_restricted, 404 .extra1 = &one, 405 }, 406#endif 407 { 408 .procname = "netdev_tstamp_prequeue", 409 .data = &netdev_tstamp_prequeue, 410 .maxlen = sizeof(int), 411 .mode = 0644, 412 .proc_handler = proc_dointvec 413 }, 414 { 415 .procname = "message_cost", 416 .data = &net_ratelimit_state.interval, 417 .maxlen = sizeof(int), 418 .mode = 0644, 419 .proc_handler = proc_dointvec_jiffies, 420 }, 421 { 422 .procname = "message_burst", 423 .data = &net_ratelimit_state.burst, 424 .maxlen = sizeof(int), 425 .mode = 0644, 426 .proc_handler = proc_dointvec, 427 }, 428 { 429 .procname = "optmem_max", 430 .data = &sysctl_optmem_max, 431 .maxlen = sizeof(int), 432 .mode = 0644, 433 .proc_handler = proc_dointvec 434 }, 435 { 436 .procname = "tstamp_allow_data", 437 .data = &sysctl_tstamp_allow_data, 438 .maxlen = sizeof(int), 439 .mode = 0644, 440 .proc_handler = proc_dointvec_minmax, 441 .extra1 = &zero, 442 .extra2 = &one 443 }, 444#ifdef CONFIG_RPS 445 { 446 .procname = "rps_sock_flow_entries", 447 .maxlen = sizeof(int), 448 .mode = 0644, 449 .proc_handler = rps_sock_flow_sysctl 450 }, 451#endif 452#ifdef CONFIG_NET_FLOW_LIMIT 453 { 454 .procname = "flow_limit_cpu_bitmap", 455 .mode = 0644, 456 .proc_handler = flow_limit_cpu_sysctl 457 }, 458 { 459 .procname = "flow_limit_table_len", 460 .data = &netdev_flow_limit_table_len, 461 .maxlen = sizeof(int), 462 .mode = 0644, 463 .proc_handler = flow_limit_table_len_sysctl 464 }, 465#endif /* CONFIG_NET_FLOW_LIMIT */ 466#ifdef CONFIG_NET_RX_BUSY_POLL 467 { 468 .procname = "busy_poll", 469 .data = &sysctl_net_busy_poll, 470 .maxlen = sizeof(unsigned int), 471 .mode = 0644, 472 .proc_handler = proc_dointvec_minmax, 473 .extra1 = &zero, 474 }, 475 { 476 .procname = "busy_read", 477 .data = &sysctl_net_busy_read, 478 .maxlen = sizeof(unsigned int), 479 .mode = 0644, 480 .proc_handler = proc_dointvec_minmax, 481 .extra1 = &zero, 482 }, 483#endif 484#ifdef CONFIG_NET_SCHED 485 { 486 .procname = "default_qdisc", 487 .mode = 0644, 488 .maxlen = IFNAMSIZ, 489 .proc_handler = set_default_qdisc 490 }, 491#endif 492#endif /* CONFIG_NET */ 493 { 494 .procname = "netdev_budget", 495 .data = &netdev_budget, 496 .maxlen = sizeof(int), 497 .mode = 0644, 498 .proc_handler = proc_dointvec 499 }, 500 { 501 .procname = "warnings", 502 .data = &net_msg_warn, 503 .maxlen = sizeof(int), 504 .mode = 0644, 505 .proc_handler = proc_dointvec 506 }, 507 { 508 .procname = "max_skb_frags", 509 .data = &sysctl_max_skb_frags, 510 .maxlen = sizeof(int), 511 .mode = 0644, 512 .proc_handler = proc_dointvec_minmax, 513 .extra1 = &one, 514 .extra2 = &max_skb_frags, 515 }, 516 { 517 .procname = "netdev_budget_usecs", 518 .data = &netdev_budget_usecs, 519 .maxlen = sizeof(unsigned int), 520 .mode = 0644, 521 .proc_handler = proc_dointvec_minmax, 522 .extra1 = &zero, 523 }, 524 { 525 .procname = "fb_tunnels_only_for_init_net", 526 .data = &sysctl_fb_tunnels_only_for_init_net, 527 .maxlen = sizeof(int), 528 .mode = 0644, 529 .proc_handler = proc_dointvec_minmax, 530 .extra1 = &zero, 531 .extra2 = &one, 532 }, 533 { } 534}; 535 536static struct ctl_table netns_core_table[] = { 537 { 538 .procname = "somaxconn", 539 .data = &init_net.core.sysctl_somaxconn, 540 .maxlen = sizeof(int), 541 .mode = 0644, 542 .extra1 = &zero, 543 .proc_handler = proc_dointvec_minmax 544 }, 545 { } 546}; 547 548static __net_init int sysctl_core_net_init(struct net *net) 549{ 550 struct ctl_table *tbl; 551 552 tbl = netns_core_table; 553 if (!net_eq(net, &init_net)) { 554 tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); 555 if (tbl == NULL) 556 goto err_dup; 557 558 tbl[0].data = &net->core.sysctl_somaxconn; 559 560 /* Don't export any sysctls to unprivileged users */ 561 if (net->user_ns != &init_user_ns) { 562 tbl[0].procname = NULL; 563 } 564 } 565 566 net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); 567 if (net->core.sysctl_hdr == NULL) 568 goto err_reg; 569 570 return 0; 571 572err_reg: 573 if (tbl != netns_core_table) 574 kfree(tbl); 575err_dup: 576 return -ENOMEM; 577} 578 579static __net_exit void sysctl_core_net_exit(struct net *net) 580{ 581 struct ctl_table *tbl; 582 583 tbl = net->core.sysctl_hdr->ctl_table_arg; 584 unregister_net_sysctl_table(net->core.sysctl_hdr); 585 BUG_ON(tbl == netns_core_table); 586 kfree(tbl); 587} 588 589static __net_initdata struct pernet_operations sysctl_core_ops = { 590 .init = sysctl_core_net_init, 591 .exit = sysctl_core_net_exit, 592}; 593 594static __init int sysctl_core_init(void) 595{ 596 register_net_sysctl(&init_net, "net/core", net_core_table); 597 return register_pernet_subsys(&sysctl_core_ops); 598} 599 600fs_initcall(sysctl_core_init);