Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs

Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner
similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept
for datagram sockets. It defaults to enabled for backwards
compatibility: this lets an application specify the output device
with cmsg and IP_PKTINFO while using a socket not bound to the
corresponding VRF. This allows e.g. older ping implementations to be
run by specifying the device, without executing them in the VRF.
If the option is disabled, packets received in a VRF context are only
handled by a raw socket bound to the VRF, and correspondingly packets
in the default VRF are only handled by a socket not bound to any VRF.

Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Mike Manning and committed by
David S. Miller
6897445f 6da5b0f0

+68 -2
+12
Documentation/networking/ip-sysctl.txt
··· 370 370 derived from the listen socket to be bound to the L3 domain in 371 371 which the packets originated. Only valid when the kernel was 372 372 compiled with CONFIG_NET_L3_MASTER_DEV. 373 + Default: 0 (disabled) 373 374 374 375 tcp_low_latency - BOOLEAN 375 376 This is a legacy option, it has no effect anymore. ··· 774 773 being received regardless of the L3 domain in which they 775 774 originated. Only valid when the kernel was compiled with 776 775 CONFIG_NET_L3_MASTER_DEV. 776 + Default: 0 (disabled) 777 777 778 778 udp_mem - vector of 3 INTEGERs: min, pressure, max 779 779 Number of pages allowed for queueing by all UDP sockets. ··· 800 798 Each UDP socket is able to use the size for sending data, even if 801 799 total pages of UDP sockets exceed udp_mem pressure. The unit is byte. 802 800 Default: 4K 801 + 802 + RAW variables: 803 + 804 + raw_l3mdev_accept - BOOLEAN 805 + Enabling this option allows a "global" bound socket to work 806 + across L3 master domains (e.g., VRFs) with packets capable of 807 + being received regardless of the L3 domain in which they 808 + originated. Only valid when the kernel was compiled with 809 + CONFIG_NET_L3_MASTER_DEV. 810 + Default: 1 (enabled) 803 811 804 812 CIPSOv4 Variables: 805 813
+13
Documentation/networking/vrf.txt
··· 111 111 TCP & UDP services running in the default VRF context (ie., not bound 112 112 to any VRF device) can work across all VRF domains by enabling the 113 113 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: 114 + 114 115 sysctl -w net.ipv4.tcp_l3mdev_accept=1 115 116 sysctl -w net.ipv4.udp_l3mdev_accept=1 117 + 118 + These options are disabled by default so that a socket in a VRF is only 119 + selected for packets in that VRF. There is a similar option for RAW 120 + sockets, which is enabled by default for reasons of backwards compatibility. 121 + This is so as to specify the output device with cmsg and IP_PKTINFO, but 122 + using a socket not bound to the corresponding VRF. This allows e.g. older ping 123 + implementations to be run with specifying the device but without executing it 124 + in the VRF. This option can be disabled so that packets received in a VRF 125 + context are only handled by a raw socket bound to the VRF, and packets in the 126 + default VRF are only handled by a socket not bound to any VRF: 127 + 128 + sysctl -w net.ipv4.raw_l3mdev_accept=0 116 129 117 130 netfilter rules on the VRF device can be used to limit access to services 118 131 running in the default VRF context as well.
+3
include/net/netns/ipv4.h
··· 103 103 /* Shall we try to damage output packets if routing dev changes? */ 104 104 int sysctl_ip_dynaddr; 105 105 int sysctl_ip_early_demux; 106 + #ifdef CONFIG_NET_L3_MASTER_DEV 107 + int sysctl_raw_l3mdev_accept; 108 + #endif 106 109 int sysctl_tcp_early_demux; 107 110 int sysctl_udp_early_demux; 108 111
+1
include/net/raw.h
··· 61 61 62 62 int raw_hash_sk(struct sock *sk); 63 63 void raw_unhash_sk(struct sock *sk); 64 + void raw_init(void); 64 65 65 66 struct raw_sock { 66 67 /* inet_sock has to be the first member */
+2
net/ipv4/af_inet.c
··· 1964 1964 /* Add UDP-Lite (RFC 3828) */ 1965 1965 udplite4_register(); 1966 1966 1967 + raw_init(); 1968 + 1967 1969 ping_init(); 1968 1970 1969 1971 /*
+26 -2
net/ipv4/raw.c
··· 805 805 return copied; 806 806 } 807 807 808 - static int raw_init(struct sock *sk) 808 + static int raw_sk_init(struct sock *sk) 809 809 { 810 810 struct raw_sock *rp = raw_sk(sk); 811 811 ··· 970 970 .connect = ip4_datagram_connect, 971 971 .disconnect = __udp_disconnect, 972 972 .ioctl = raw_ioctl, 973 - .init = raw_init, 973 + .init = raw_sk_init, 974 974 .setsockopt = raw_setsockopt, 975 975 .getsockopt = raw_getsockopt, 976 976 .sendmsg = raw_sendmsg, ··· 1132 1132 void __init raw_proc_exit(void) 1133 1133 { 1134 1134 unregister_pernet_subsys(&raw_net_ops); 1135 + } 1136 + 1137 + static void raw_sysctl_init_net(struct net *net) 1138 + { 1139 + #ifdef CONFIG_NET_L3_MASTER_DEV 1140 + net->ipv4.sysctl_raw_l3mdev_accept = 1; 1141 + #endif 1142 + } 1143 + 1144 + static int __net_init raw_sysctl_init(struct net *net) 1145 + { 1146 + raw_sysctl_init_net(net); 1147 + return 0; 1148 + } 1149 + 1150 + static struct pernet_operations __net_initdata raw_sysctl_ops = { 1151 + .init = raw_sysctl_init, 1152 + }; 1153 + 1154 + void __init raw_init(void) 1155 + { 1156 + raw_sysctl_init_net(&init_net); 1157 + if (register_pernet_subsys(&raw_sysctl_ops)) 1158 + panic("RAW: failed to init sysctl parameters.\n"); 1135 1159 } 1136 1160 #endif /* CONFIG_PROC_FS */
+11
net/ipv4/sysctl_net_ipv4.c
··· 602 602 .mode = 0644, 603 603 .proc_handler = ipv4_ping_group_range, 604 604 }, 605 + #ifdef CONFIG_NET_L3_MASTER_DEV 606 + { 607 + .procname = "raw_l3mdev_accept", 608 + .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, 609 + .maxlen = sizeof(int), 610 + .mode = 0644, 611 + .proc_handler = proc_dointvec_minmax, 612 + .extra1 = &zero, 613 + .extra2 = &one, 614 + }, 615 + #endif 605 616 { 606 617 .procname = "tcp_ecn", 607 618 .data = &init_net.ipv4.sysctl_tcp_ecn,