Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'master' of git://1984.lsi.us.es/net-next

+856 -1708
+9
Documentation/ABI/removed/ip_queue
··· 1 + What: ip_queue 2 + Date: finally removed in kernel v3.5.0 3 + Contact: Pablo Neira Ayuso <pablo@netfilter.org> 4 + Description: 5 + ip_queue has been replaced by nfnetlink_queue which provides 6 + more advanced queueing mechanism to user-space. The ip_queue 7 + module was already announced to become obsolete years ago. 8 + 9 + Users:
+11 -2
Documentation/networking/ip-sysctl.txt
··· 1301 1301 bridge-nf-filter-vlan-tagged - BOOLEAN 1302 1302 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. 1303 1303 0 : disable this. 1304 - Default: 1 1304 + Default: 0 1305 1305 1306 1306 bridge-nf-filter-pppoe-tagged - BOOLEAN 1307 1307 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. 1308 1308 0 : disable this. 1309 - Default: 1 1309 + Default: 0 1310 1310 1311 + bridge-nf-pass-vlan-input-dev - BOOLEAN 1312 + 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan 1313 + interface on the bridge and set the netfilter input device to the vlan. 1314 + This allows use of e.g. "iptables -i br0.1" and makes the REDIRECT 1315 + target work with vlan-on-top-of-bridge interfaces. When no matching 1316 + vlan interface is found, or this switch is off, the input device is 1317 + set to the bridge interface. 1318 + 0: disable bridge netfilter vlan interface lookup. 1319 + Default: 0 1311 1320 1312 1321 proc/sys/net/sctp/* Variables: 1313 1322
+5
include/linux/ip_vs.h
··· 89 89 #define IP_VS_CONN_F_TEMPLATE 0x1000 /* template, not connection */ 90 90 #define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */ 91 91 92 + /* Initial bits allowed in backup server */ 92 93 #define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \ 93 94 IP_VS_CONN_F_NOOUTPUT | \ 94 95 IP_VS_CONN_F_INACTIVE | \ ··· 97 96 IP_VS_CONN_F_NO_CPORT | \ 98 97 IP_VS_CONN_F_TEMPLATE \ 99 98 ) 99 + 100 + /* Bits allowed to update in backup server */ 101 + #define IP_VS_CONN_F_BACKUP_UPD_MASK (IP_VS_CONN_F_INACTIVE | \ 102 + IP_VS_CONN_F_SEQ_MASK) 100 103 101 104 /* Flags that are not sent to backup server start from bit 16 */ 102 105 #define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */
+4
include/linux/netfilter/nf_conntrack_common.h
··· 83 83 /* Conntrack is a fake untracked entry */ 84 84 IPS_UNTRACKED_BIT = 12, 85 85 IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), 86 + 87 + /* Conntrack got a helper explicitly attached via CT target. */ 88 + IPS_HELPER_BIT = 13, 89 + IPS_HELPER = (1 << IPS_HELPER_BIT), 86 90 }; 87 91 88 92 /* Connection tracking event types */
-1
include/linux/netfilter_ipv4/Kbuild
··· 1 - header-y += ip_queue.h 2 1 header-y += ip_tables.h 3 2 header-y += ipt_CLUSTERIP.h 4 3 header-y += ipt_ECN.h
-72
include/linux/netfilter_ipv4/ip_queue.h
··· 1 - /* 2 - * This is a module which is used for queueing IPv4 packets and 3 - * communicating with userspace via netlink. 4 - * 5 - * (C) 2000 James Morris, this code is GPL. 6 - */ 7 - #ifndef _IP_QUEUE_H 8 - #define _IP_QUEUE_H 9 - 10 - #ifdef __KERNEL__ 11 - #ifdef DEBUG_IPQ 12 - #define QDEBUG(x...) printk(KERN_DEBUG ## x) 13 - #else 14 - #define QDEBUG(x...) 15 - #endif /* DEBUG_IPQ */ 16 - #else 17 - #include <net/if.h> 18 - #endif /* ! __KERNEL__ */ 19 - 20 - /* Messages sent from kernel */ 21 - typedef struct ipq_packet_msg { 22 - unsigned long packet_id; /* ID of queued packet */ 23 - unsigned long mark; /* Netfilter mark value */ 24 - long timestamp_sec; /* Packet arrival time (seconds) */ 25 - long timestamp_usec; /* Packet arrvial time (+useconds) */ 26 - unsigned int hook; /* Netfilter hook we rode in on */ 27 - char indev_name[IFNAMSIZ]; /* Name of incoming interface */ 28 - char outdev_name[IFNAMSIZ]; /* Name of outgoing interface */ 29 - __be16 hw_protocol; /* Hardware protocol (network order) */ 30 - unsigned short hw_type; /* Hardware type */ 31 - unsigned char hw_addrlen; /* Hardware address length */ 32 - unsigned char hw_addr[8]; /* Hardware address */ 33 - size_t data_len; /* Length of packet data */ 34 - unsigned char payload[0]; /* Optional packet data */ 35 - } ipq_packet_msg_t; 36 - 37 - /* Messages sent from userspace */ 38 - typedef struct ipq_mode_msg { 39 - unsigned char value; /* Requested mode */ 40 - size_t range; /* Optional range of packet requested */ 41 - } ipq_mode_msg_t; 42 - 43 - typedef struct ipq_verdict_msg { 44 - unsigned int value; /* Verdict to hand to netfilter */ 45 - unsigned long id; /* Packet ID for this verdict */ 46 - size_t data_len; /* Length of replacement data */ 47 - unsigned char payload[0]; /* Optional replacement packet */ 48 - } ipq_verdict_msg_t; 49 - 50 - typedef struct ipq_peer_msg { 51 - union { 52 - ipq_verdict_msg_t verdict; 53 - ipq_mode_msg_t mode; 54 - } msg; 55 - } ipq_peer_msg_t; 56 - 57 - 
/* Packet delivery modes */ 58 - enum { 59 - IPQ_COPY_NONE, /* Initial mode, packets are dropped */ 60 - IPQ_COPY_META, /* Copy metadata */ 61 - IPQ_COPY_PACKET /* Copy metadata + packet (range) */ 62 - }; 63 - #define IPQ_COPY_MAX IPQ_COPY_PACKET 64 - 65 - /* Types of messages */ 66 - #define IPQM_BASE 0x10 /* standard netlink messages below this */ 67 - #define IPQM_MODE (IPQM_BASE + 1) /* Mode request from peer */ 68 - #define IPQM_VERDICT (IPQM_BASE + 2) /* Verdict from peer */ 69 - #define IPQM_PACKET (IPQM_BASE + 3) /* Packet from kernel */ 70 - #define IPQM_MAX (IPQM_BASE + 4) 71 - 72 - #endif /*_IP_QUEUE_H*/
+1 -1
include/linux/netlink.h
··· 7 7 #define NETLINK_ROUTE 0 /* Routing/device hook */ 8 8 #define NETLINK_UNUSED 1 /* Unused number */ 9 9 #define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ 10 - #define NETLINK_FIREWALL 3 /* Firewalling hook */ 10 + #define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ 11 11 #define NETLINK_SOCK_DIAG 4 /* socket monitoring */ 12 12 #define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ 13 13 #define NETLINK_XFRM 6 /* ipsec */
+79 -8
include/net/ip_vs.h
··· 504 504 * state transition triggerd 505 505 * synchronization 506 506 */ 507 + unsigned long sync_endtime; /* jiffies + sent_retries */ 507 508 508 509 /* Control members */ 509 510 struct ip_vs_conn *control; /* Master control connection */ ··· 784 783 void (*timeout_change)(struct ip_vs_app *app, int flags); 785 784 }; 786 785 786 + struct ipvs_master_sync_state { 787 + struct list_head sync_queue; 788 + struct ip_vs_sync_buff *sync_buff; 789 + int sync_queue_len; 790 + unsigned int sync_queue_delay; 791 + struct task_struct *master_thread; 792 + struct delayed_work master_wakeup_work; 793 + struct netns_ipvs *ipvs; 794 + }; 795 + 787 796 /* IPVS in network namespace */ 788 797 struct netns_ipvs { 789 798 int gen; /* Generation */ ··· 880 869 #endif 881 870 int sysctl_snat_reroute; 882 871 int sysctl_sync_ver; 872 + int sysctl_sync_ports; 873 + int sysctl_sync_qlen_max; 874 + int sysctl_sync_sock_size; 883 875 int sysctl_cache_bypass; 884 876 int sysctl_expire_nodest_conn; 885 877 int sysctl_expire_quiescent_template; 886 878 int sysctl_sync_threshold[2]; 879 + unsigned int sysctl_sync_refresh_period; 880 + int sysctl_sync_retries; 887 881 int sysctl_nat_icmp_send; 888 882 889 883 /* ip_vs_lblc */ ··· 904 888 spinlock_t est_lock; 905 889 struct timer_list est_timer; /* Estimation timer */ 906 890 /* ip_vs_sync */ 907 - struct list_head sync_queue; 908 891 spinlock_t sync_lock; 909 - struct ip_vs_sync_buff *sync_buff; 892 + struct ipvs_master_sync_state *ms; 910 893 spinlock_t sync_buff_lock; 911 - struct sockaddr_in sync_mcast_addr; 912 - struct task_struct *master_thread; 913 - struct task_struct *backup_thread; 894 + struct task_struct **backup_threads; 895 + int threads_mask; 914 896 int send_mesg_maxlen; 915 897 int recv_mesg_maxlen; 916 898 volatile int sync_state; ··· 925 911 #define DEFAULT_SYNC_THRESHOLD 3 926 912 #define DEFAULT_SYNC_PERIOD 50 927 913 #define DEFAULT_SYNC_VER 1 914 + #define DEFAULT_SYNC_REFRESH_PERIOD (0U * HZ) 915 + #define 
DEFAULT_SYNC_RETRIES 0 916 + #define IPVS_SYNC_WAKEUP_RATE 8 917 + #define IPVS_SYNC_QLEN_MAX (IPVS_SYNC_WAKEUP_RATE * 4) 918 + #define IPVS_SYNC_SEND_DELAY (HZ / 50) 919 + #define IPVS_SYNC_CHECK_PERIOD HZ 920 + #define IPVS_SYNC_FLUSH_TIME (HZ * 2) 921 + #define IPVS_SYNC_PORTS_MAX (1 << 6) 928 922 929 923 #ifdef CONFIG_SYSCTL 930 924 ··· 943 921 944 922 static inline int sysctl_sync_period(struct netns_ipvs *ipvs) 945 923 { 946 - return ipvs->sysctl_sync_threshold[1]; 924 + return ACCESS_ONCE(ipvs->sysctl_sync_threshold[1]); 925 + } 926 + 927 + static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) 928 + { 929 + return ACCESS_ONCE(ipvs->sysctl_sync_refresh_period); 930 + } 931 + 932 + static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) 933 + { 934 + return ipvs->sysctl_sync_retries; 947 935 } 948 936 949 937 static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) 950 938 { 951 939 return ipvs->sysctl_sync_ver; 940 + } 941 + 942 + static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) 943 + { 944 + return ACCESS_ONCE(ipvs->sysctl_sync_ports); 945 + } 946 + 947 + static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs) 948 + { 949 + return ipvs->sysctl_sync_qlen_max; 950 + } 951 + 952 + static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) 953 + { 954 + return ipvs->sysctl_sync_sock_size; 952 955 } 953 956 954 957 #else ··· 988 941 return DEFAULT_SYNC_PERIOD; 989 942 } 990 943 944 + static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) 945 + { 946 + return DEFAULT_SYNC_REFRESH_PERIOD; 947 + } 948 + 949 + static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) 950 + { 951 + return DEFAULT_SYNC_RETRIES & 3; 952 + } 953 + 991 954 static inline int sysctl_sync_ver(struct netns_ipvs *ipvs) 992 955 { 993 956 return DEFAULT_SYNC_VER; 957 + } 958 + 959 + static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) 960 + { 961 + return 1; 962 + } 963 + 964 + static inline int 
sysctl_sync_qlen_max(struct netns_ipvs *ipvs) 965 + { 966 + return IPVS_SYNC_QLEN_MAX; 967 + } 968 + 969 + static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs) 970 + { 971 + return 0; 994 972 } 995 973 996 974 #endif ··· 1257 1185 extern struct ip_vs_stats ip_vs_stats; 1258 1186 extern int sysctl_ip_vs_sync_ver; 1259 1187 1260 - extern void ip_vs_sync_switch_mode(struct net *net, int mode); 1261 1188 extern struct ip_vs_service * 1262 1189 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, 1263 1190 const union nf_inet_addr *vaddr, __be16 vport); ··· 1290 1219 extern int start_sync_thread(struct net *net, int state, char *mcast_ifn, 1291 1220 __u8 syncid); 1292 1221 extern int stop_sync_thread(struct net *net, int state); 1293 - extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp); 1222 + extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); 1294 1223 1295 1224 1296 1225 /*
+2 -8
include/net/netfilter/nf_conntrack.h
··· 321 321 extern unsigned int nf_conntrack_hash_rnd; 322 322 void init_nf_conntrack_hash_rnd(void); 323 323 324 - #define NF_CT_STAT_INC(net, count) \ 325 - __this_cpu_inc((net)->ct.stat->count) 326 - #define NF_CT_STAT_INC_ATOMIC(net, count) \ 327 - do { \ 328 - local_bh_disable(); \ 329 - __this_cpu_inc((net)->ct.stat->count); \ 330 - local_bh_enable(); \ 331 - } while (0) 324 + #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) 325 + #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) 332 326 333 327 #define MODULE_ALIAS_NFCT_HELPER(helper) \ 334 328 MODULE_ALIAS("nfct-helper-" helper)
+2 -2
include/net/netfilter/nf_conntrack_helper.h
··· 60 60 return nf_ct_ext_find(ct, NF_CT_EXT_HELPER); 61 61 } 62 62 63 - extern int nf_conntrack_helper_init(void); 64 - extern void nf_conntrack_helper_fini(void); 63 + extern int nf_conntrack_helper_init(struct net *net); 64 + extern void nf_conntrack_helper_fini(struct net *net); 65 65 66 66 extern int nf_conntrack_broadcast_help(struct sk_buff *skb, 67 67 unsigned int protoff,
+3
include/net/netns/conntrack.h
··· 26 26 int sysctl_tstamp; 27 27 int sysctl_checksum; 28 28 unsigned int sysctl_log_invalid; /* Log invalid packets */ 29 + int sysctl_auto_assign_helper; 30 + bool auto_assign_helper_warned; 29 31 #ifdef CONFIG_SYSCTL 30 32 struct ctl_table_header *sysctl_header; 31 33 struct ctl_table_header *acct_sysctl_header; 32 34 struct ctl_table_header *tstamp_sysctl_header; 33 35 struct ctl_table_header *event_sysctl_header; 36 + struct ctl_table_header *helper_sysctl_header; 34 37 #endif 35 38 char *slabname; 36 39 };
+24 -2
net/bridge/br_netfilter.c
··· 54 54 static int brnf_call_arptables __read_mostly = 1; 55 55 static int brnf_filter_vlan_tagged __read_mostly = 0; 56 56 static int brnf_filter_pppoe_tagged __read_mostly = 0; 57 + static int brnf_pass_vlan_indev __read_mostly = 0; 57 58 #else 58 59 #define brnf_call_iptables 1 59 60 #define brnf_call_ip6tables 1 60 61 #define brnf_call_arptables 1 61 62 #define brnf_filter_vlan_tagged 0 62 63 #define brnf_filter_pppoe_tagged 0 64 + #define brnf_pass_vlan_indev 0 63 65 #endif 64 66 65 67 #define IS_IP(skb) \ ··· 505 503 return 0; 506 504 } 507 505 506 + static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) 507 + { 508 + struct net_device *vlan, *br; 509 + 510 + br = bridge_parent(dev); 511 + if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb)) 512 + return br; 513 + 514 + vlan = __vlan_find_dev_deep(br, vlan_tx_tag_get(skb) & VLAN_VID_MASK); 515 + 516 + return vlan ? vlan : br; 517 + } 518 + 508 519 /* Some common code for IPv4/IPv6 */ 509 520 static struct net_device *setup_pre_routing(struct sk_buff *skb) 510 521 { ··· 530 515 531 516 nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; 532 517 nf_bridge->physindev = skb->dev; 533 - skb->dev = bridge_parent(skb->dev); 518 + skb->dev = brnf_get_logical_dev(skb, skb->dev); 534 519 if (skb->protocol == htons(ETH_P_8021Q)) 535 520 nf_bridge->mask |= BRNF_8021Q; 536 521 else if (skb->protocol == htons(ETH_P_PPP_SES)) ··· 789 774 else 790 775 skb->protocol = htons(ETH_P_IPV6); 791 776 792 - NF_HOOK(pf, NF_INET_FORWARD, skb, bridge_parent(in), parent, 777 + NF_HOOK(pf, NF_INET_FORWARD, skb, brnf_get_logical_dev(skb, in), parent, 793 778 br_nf_forward_finish); 794 779 795 780 return NF_STOLEN; ··· 1013 998 { 1014 999 .procname = "bridge-nf-filter-pppoe-tagged", 1015 1000 .data = &brnf_filter_pppoe_tagged, 1001 + .maxlen = sizeof(int), 1002 + .mode = 0644, 1003 + .proc_handler = brnf_sysctl_call_tables, 1004 + }, 1005 + { 1006 + .procname = "bridge-nf-pass-vlan-input-dev", 
1007 + .data = &brnf_pass_vlan_indev, 1016 1008 .maxlen = sizeof(int), 1017 1009 .mode = 0644, 1018 1010 .proc_handler = brnf_sysctl_call_tables,
+2
net/core/sock.c
··· 259 259 260 260 /* Run time adjustable parameters. */ 261 261 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 262 + EXPORT_SYMBOL(sysctl_wmem_max); 262 263 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 264 + EXPORT_SYMBOL(sysctl_rmem_max); 263 265 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 264 266 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 265 267
-3
net/ipv4/netfilter/Makefile
··· 66 66 67 67 # just filtering instance of ARP tables for now 68 68 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o 69 - 70 - obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o 71 -
-639
net/ipv4/netfilter/ip_queue.c
··· 1 - /* 2 - * This is a module which is used for queueing IPv4 packets and 3 - * communicating with userspace via netlink. 4 - * 5 - * (C) 2000-2002 James Morris <jmorris@intercode.com.au> 6 - * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> 7 - * 8 - * This program is free software; you can redistribute it and/or modify 9 - * it under the terms of the GNU General Public License version 2 as 10 - * published by the Free Software Foundation. 11 - */ 12 - #include <linux/module.h> 13 - #include <linux/skbuff.h> 14 - #include <linux/init.h> 15 - #include <linux/ip.h> 16 - #include <linux/notifier.h> 17 - #include <linux/netdevice.h> 18 - #include <linux/netfilter.h> 19 - #include <linux/netfilter_ipv4/ip_queue.h> 20 - #include <linux/netfilter_ipv4/ip_tables.h> 21 - #include <linux/netlink.h> 22 - #include <linux/spinlock.h> 23 - #include <linux/sysctl.h> 24 - #include <linux/proc_fs.h> 25 - #include <linux/seq_file.h> 26 - #include <linux/security.h> 27 - #include <linux/net.h> 28 - #include <linux/mutex.h> 29 - #include <linux/slab.h> 30 - #include <net/net_namespace.h> 31 - #include <net/sock.h> 32 - #include <net/route.h> 33 - #include <net/netfilter/nf_queue.h> 34 - #include <net/ip.h> 35 - 36 - #define IPQ_QMAX_DEFAULT 1024 37 - #define IPQ_PROC_FS_NAME "ip_queue" 38 - #define NET_IPQ_QMAX 2088 39 - #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" 40 - 41 - typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); 42 - 43 - static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; 44 - static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; 45 - static DEFINE_SPINLOCK(queue_lock); 46 - static int peer_pid __read_mostly; 47 - static unsigned int copy_range __read_mostly; 48 - static unsigned int queue_total; 49 - static unsigned int queue_dropped = 0; 50 - static unsigned int queue_user_dropped = 0; 51 - static struct sock *ipqnl __read_mostly; 52 - static LIST_HEAD(queue_list); 53 - static DEFINE_MUTEX(ipqnl_mutex); 54 - 55 - 
static inline void 56 - __ipq_enqueue_entry(struct nf_queue_entry *entry) 57 - { 58 - list_add_tail(&entry->list, &queue_list); 59 - queue_total++; 60 - } 61 - 62 - static inline int 63 - __ipq_set_mode(unsigned char mode, unsigned int range) 64 - { 65 - int status = 0; 66 - 67 - switch(mode) { 68 - case IPQ_COPY_NONE: 69 - case IPQ_COPY_META: 70 - copy_mode = mode; 71 - copy_range = 0; 72 - break; 73 - 74 - case IPQ_COPY_PACKET: 75 - if (range > 0xFFFF) 76 - range = 0xFFFF; 77 - copy_range = range; 78 - copy_mode = mode; 79 - break; 80 - 81 - default: 82 - status = -EINVAL; 83 - 84 - } 85 - return status; 86 - } 87 - 88 - static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); 89 - 90 - static inline void 91 - __ipq_reset(void) 92 - { 93 - peer_pid = 0; 94 - net_disable_timestamp(); 95 - __ipq_set_mode(IPQ_COPY_NONE, 0); 96 - __ipq_flush(NULL, 0); 97 - } 98 - 99 - static struct nf_queue_entry * 100 - ipq_find_dequeue_entry(unsigned long id) 101 - { 102 - struct nf_queue_entry *entry = NULL, *i; 103 - 104 - spin_lock_bh(&queue_lock); 105 - 106 - list_for_each_entry(i, &queue_list, list) { 107 - if ((unsigned long)i == id) { 108 - entry = i; 109 - break; 110 - } 111 - } 112 - 113 - if (entry) { 114 - list_del(&entry->list); 115 - queue_total--; 116 - } 117 - 118 - spin_unlock_bh(&queue_lock); 119 - return entry; 120 - } 121 - 122 - static void 123 - __ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 124 - { 125 - struct nf_queue_entry *entry, *next; 126 - 127 - list_for_each_entry_safe(entry, next, &queue_list, list) { 128 - if (!cmpfn || cmpfn(entry, data)) { 129 - list_del(&entry->list); 130 - queue_total--; 131 - nf_reinject(entry, NF_DROP); 132 - } 133 - } 134 - } 135 - 136 - static void 137 - ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 138 - { 139 - spin_lock_bh(&queue_lock); 140 - __ipq_flush(cmpfn, data); 141 - spin_unlock_bh(&queue_lock); 142 - } 143 - 144 - static struct sk_buff * 145 - ipq_build_packet_message(struct nf_queue_entry *entry, int 
*errp) 146 - { 147 - sk_buff_data_t old_tail; 148 - size_t size = 0; 149 - size_t data_len = 0; 150 - struct sk_buff *skb; 151 - struct ipq_packet_msg *pmsg; 152 - struct nlmsghdr *nlh; 153 - struct timeval tv; 154 - 155 - switch (ACCESS_ONCE(copy_mode)) { 156 - case IPQ_COPY_META: 157 - case IPQ_COPY_NONE: 158 - size = NLMSG_SPACE(sizeof(*pmsg)); 159 - break; 160 - 161 - case IPQ_COPY_PACKET: 162 - if (entry->skb->ip_summed == CHECKSUM_PARTIAL && 163 - (*errp = skb_checksum_help(entry->skb))) 164 - return NULL; 165 - 166 - data_len = ACCESS_ONCE(copy_range); 167 - if (data_len == 0 || data_len > entry->skb->len) 168 - data_len = entry->skb->len; 169 - 170 - size = NLMSG_SPACE(sizeof(*pmsg) + data_len); 171 - break; 172 - 173 - default: 174 - *errp = -EINVAL; 175 - return NULL; 176 - } 177 - 178 - skb = alloc_skb(size, GFP_ATOMIC); 179 - if (!skb) 180 - goto nlmsg_failure; 181 - 182 - old_tail = skb->tail; 183 - nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); 184 - pmsg = NLMSG_DATA(nlh); 185 - memset(pmsg, 0, sizeof(*pmsg)); 186 - 187 - pmsg->packet_id = (unsigned long )entry; 188 - pmsg->data_len = data_len; 189 - tv = ktime_to_timeval(entry->skb->tstamp); 190 - pmsg->timestamp_sec = tv.tv_sec; 191 - pmsg->timestamp_usec = tv.tv_usec; 192 - pmsg->mark = entry->skb->mark; 193 - pmsg->hook = entry->hook; 194 - pmsg->hw_protocol = entry->skb->protocol; 195 - 196 - if (entry->indev) 197 - strcpy(pmsg->indev_name, entry->indev->name); 198 - else 199 - pmsg->indev_name[0] = '\0'; 200 - 201 - if (entry->outdev) 202 - strcpy(pmsg->outdev_name, entry->outdev->name); 203 - else 204 - pmsg->outdev_name[0] = '\0'; 205 - 206 - if (entry->indev && entry->skb->dev && 207 - entry->skb->mac_header != entry->skb->network_header) { 208 - pmsg->hw_type = entry->skb->dev->type; 209 - pmsg->hw_addrlen = dev_parse_header(entry->skb, 210 - pmsg->hw_addr); 211 - } 212 - 213 - if (data_len) 214 - if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) 215 - BUG(); 216 - 
217 - nlh->nlmsg_len = skb->tail - old_tail; 218 - return skb; 219 - 220 - nlmsg_failure: 221 - kfree_skb(skb); 222 - *errp = -EINVAL; 223 - printk(KERN_ERR "ip_queue: error creating packet message\n"); 224 - return NULL; 225 - } 226 - 227 - static int 228 - ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) 229 - { 230 - int status = -EINVAL; 231 - struct sk_buff *nskb; 232 - 233 - if (copy_mode == IPQ_COPY_NONE) 234 - return -EAGAIN; 235 - 236 - nskb = ipq_build_packet_message(entry, &status); 237 - if (nskb == NULL) 238 - return status; 239 - 240 - spin_lock_bh(&queue_lock); 241 - 242 - if (!peer_pid) 243 - goto err_out_free_nskb; 244 - 245 - if (queue_total >= queue_maxlen) { 246 - queue_dropped++; 247 - status = -ENOSPC; 248 - if (net_ratelimit()) 249 - printk (KERN_WARNING "ip_queue: full at %d entries, " 250 - "dropping packets(s). Dropped: %d\n", queue_total, 251 - queue_dropped); 252 - goto err_out_free_nskb; 253 - } 254 - 255 - /* netlink_unicast will either free the nskb or attach it to a socket */ 256 - status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); 257 - if (status < 0) { 258 - queue_user_dropped++; 259 - goto err_out_unlock; 260 - } 261 - 262 - __ipq_enqueue_entry(entry); 263 - 264 - spin_unlock_bh(&queue_lock); 265 - return status; 266 - 267 - err_out_free_nskb: 268 - kfree_skb(nskb); 269 - 270 - err_out_unlock: 271 - spin_unlock_bh(&queue_lock); 272 - return status; 273 - } 274 - 275 - static int 276 - ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) 277 - { 278 - int diff; 279 - struct iphdr *user_iph = (struct iphdr *)v->payload; 280 - struct sk_buff *nskb; 281 - 282 - if (v->data_len < sizeof(*user_iph)) 283 - return 0; 284 - diff = v->data_len - e->skb->len; 285 - if (diff < 0) { 286 - if (pskb_trim(e->skb, v->data_len)) 287 - return -ENOMEM; 288 - } else if (diff > 0) { 289 - if (v->data_len > 0xFFFF) 290 - return -EINVAL; 291 - if (diff > skb_tailroom(e->skb)) { 292 - nskb = 
skb_copy_expand(e->skb, skb_headroom(e->skb), 293 - diff, GFP_ATOMIC); 294 - if (!nskb) { 295 - printk(KERN_WARNING "ip_queue: error " 296 - "in mangle, dropping packet\n"); 297 - return -ENOMEM; 298 - } 299 - kfree_skb(e->skb); 300 - e->skb = nskb; 301 - } 302 - skb_put(e->skb, diff); 303 - } 304 - if (!skb_make_writable(e->skb, v->data_len)) 305 - return -ENOMEM; 306 - skb_copy_to_linear_data(e->skb, v->payload, v->data_len); 307 - e->skb->ip_summed = CHECKSUM_NONE; 308 - 309 - return 0; 310 - } 311 - 312 - static int 313 - ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) 314 - { 315 - struct nf_queue_entry *entry; 316 - 317 - if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) 318 - return -EINVAL; 319 - 320 - entry = ipq_find_dequeue_entry(vmsg->id); 321 - if (entry == NULL) 322 - return -ENOENT; 323 - else { 324 - int verdict = vmsg->value; 325 - 326 - if (vmsg->data_len && vmsg->data_len == len) 327 - if (ipq_mangle_ipv4(vmsg, entry) < 0) 328 - verdict = NF_DROP; 329 - 330 - nf_reinject(entry, verdict); 331 - return 0; 332 - } 333 - } 334 - 335 - static int 336 - ipq_set_mode(unsigned char mode, unsigned int range) 337 - { 338 - int status; 339 - 340 - spin_lock_bh(&queue_lock); 341 - status = __ipq_set_mode(mode, range); 342 - spin_unlock_bh(&queue_lock); 343 - return status; 344 - } 345 - 346 - static int 347 - ipq_receive_peer(struct ipq_peer_msg *pmsg, 348 - unsigned char type, unsigned int len) 349 - { 350 - int status = 0; 351 - 352 - if (len < sizeof(*pmsg)) 353 - return -EINVAL; 354 - 355 - switch (type) { 356 - case IPQM_MODE: 357 - status = ipq_set_mode(pmsg->msg.mode.value, 358 - pmsg->msg.mode.range); 359 - break; 360 - 361 - case IPQM_VERDICT: 362 - status = ipq_set_verdict(&pmsg->msg.verdict, 363 - len - sizeof(*pmsg)); 364 - break; 365 - default: 366 - status = -EINVAL; 367 - } 368 - return status; 369 - } 370 - 371 - static int 372 - dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) 373 - { 374 - if 
(entry->indev) 375 - if (entry->indev->ifindex == ifindex) 376 - return 1; 377 - if (entry->outdev) 378 - if (entry->outdev->ifindex == ifindex) 379 - return 1; 380 - #ifdef CONFIG_BRIDGE_NETFILTER 381 - if (entry->skb->nf_bridge) { 382 - if (entry->skb->nf_bridge->physindev && 383 - entry->skb->nf_bridge->physindev->ifindex == ifindex) 384 - return 1; 385 - if (entry->skb->nf_bridge->physoutdev && 386 - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) 387 - return 1; 388 - } 389 - #endif 390 - return 0; 391 - } 392 - 393 - static void 394 - ipq_dev_drop(int ifindex) 395 - { 396 - ipq_flush(dev_cmp, ifindex); 397 - } 398 - 399 - #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) 400 - 401 - static inline void 402 - __ipq_rcv_skb(struct sk_buff *skb) 403 - { 404 - int status, type, pid, flags; 405 - unsigned int nlmsglen, skblen; 406 - struct nlmsghdr *nlh; 407 - bool enable_timestamp = false; 408 - 409 - skblen = skb->len; 410 - if (skblen < sizeof(*nlh)) 411 - return; 412 - 413 - nlh = nlmsg_hdr(skb); 414 - nlmsglen = nlh->nlmsg_len; 415 - if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) 416 - return; 417 - 418 - pid = nlh->nlmsg_pid; 419 - flags = nlh->nlmsg_flags; 420 - 421 - if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) 422 - RCV_SKB_FAIL(-EINVAL); 423 - 424 - if (flags & MSG_TRUNC) 425 - RCV_SKB_FAIL(-ECOMM); 426 - 427 - type = nlh->nlmsg_type; 428 - if (type < NLMSG_NOOP || type >= IPQM_MAX) 429 - RCV_SKB_FAIL(-EINVAL); 430 - 431 - if (type <= IPQM_BASE) 432 - return; 433 - 434 - if (!capable(CAP_NET_ADMIN)) 435 - RCV_SKB_FAIL(-EPERM); 436 - 437 - spin_lock_bh(&queue_lock); 438 - 439 - if (peer_pid) { 440 - if (peer_pid != pid) { 441 - spin_unlock_bh(&queue_lock); 442 - RCV_SKB_FAIL(-EBUSY); 443 - } 444 - } else { 445 - enable_timestamp = true; 446 - peer_pid = pid; 447 - } 448 - 449 - spin_unlock_bh(&queue_lock); 450 - if (enable_timestamp) 451 - net_enable_timestamp(); 452 - status = 
ipq_receive_peer(NLMSG_DATA(nlh), type, 453 - nlmsglen - NLMSG_LENGTH(0)); 454 - if (status < 0) 455 - RCV_SKB_FAIL(status); 456 - 457 - if (flags & NLM_F_ACK) 458 - netlink_ack(skb, nlh, 0); 459 - } 460 - 461 - static void 462 - ipq_rcv_skb(struct sk_buff *skb) 463 - { 464 - mutex_lock(&ipqnl_mutex); 465 - __ipq_rcv_skb(skb); 466 - mutex_unlock(&ipqnl_mutex); 467 - } 468 - 469 - static int 470 - ipq_rcv_dev_event(struct notifier_block *this, 471 - unsigned long event, void *ptr) 472 - { 473 - struct net_device *dev = ptr; 474 - 475 - if (!net_eq(dev_net(dev), &init_net)) 476 - return NOTIFY_DONE; 477 - 478 - /* Drop any packets associated with the downed device */ 479 - if (event == NETDEV_DOWN) 480 - ipq_dev_drop(dev->ifindex); 481 - return NOTIFY_DONE; 482 - } 483 - 484 - static struct notifier_block ipq_dev_notifier = { 485 - .notifier_call = ipq_rcv_dev_event, 486 - }; 487 - 488 - static int 489 - ipq_rcv_nl_event(struct notifier_block *this, 490 - unsigned long event, void *ptr) 491 - { 492 - struct netlink_notify *n = ptr; 493 - 494 - if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { 495 - spin_lock_bh(&queue_lock); 496 - if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) 497 - __ipq_reset(); 498 - spin_unlock_bh(&queue_lock); 499 - } 500 - return NOTIFY_DONE; 501 - } 502 - 503 - static struct notifier_block ipq_nl_notifier = { 504 - .notifier_call = ipq_rcv_nl_event, 505 - }; 506 - 507 - #ifdef CONFIG_SYSCTL 508 - static struct ctl_table_header *ipq_sysctl_header; 509 - 510 - static ctl_table ipq_table[] = { 511 - { 512 - .procname = NET_IPQ_QMAX_NAME, 513 - .data = &queue_maxlen, 514 - .maxlen = sizeof(queue_maxlen), 515 - .mode = 0644, 516 - .proc_handler = proc_dointvec 517 - }, 518 - { } 519 - }; 520 - #endif 521 - 522 - #ifdef CONFIG_PROC_FS 523 - static int ip_queue_show(struct seq_file *m, void *v) 524 - { 525 - spin_lock_bh(&queue_lock); 526 - 527 - seq_printf(m, 528 - "Peer PID : %d\n" 529 - "Copy mode : %hu\n" 530 - "Copy 
range : %u\n" 531 - "Queue length : %u\n" 532 - "Queue max. length : %u\n" 533 - "Queue dropped : %u\n" 534 - "Netlink dropped : %u\n", 535 - peer_pid, 536 - copy_mode, 537 - copy_range, 538 - queue_total, 539 - queue_maxlen, 540 - queue_dropped, 541 - queue_user_dropped); 542 - 543 - spin_unlock_bh(&queue_lock); 544 - return 0; 545 - } 546 - 547 - static int ip_queue_open(struct inode *inode, struct file *file) 548 - { 549 - return single_open(file, ip_queue_show, NULL); 550 - } 551 - 552 - static const struct file_operations ip_queue_proc_fops = { 553 - .open = ip_queue_open, 554 - .read = seq_read, 555 - .llseek = seq_lseek, 556 - .release = single_release, 557 - .owner = THIS_MODULE, 558 - }; 559 - #endif 560 - 561 - static const struct nf_queue_handler nfqh = { 562 - .name = "ip_queue", 563 - .outfn = &ipq_enqueue_packet, 564 - }; 565 - 566 - static int __init ip_queue_init(void) 567 - { 568 - int status = -ENOMEM; 569 - struct proc_dir_entry *proc __maybe_unused; 570 - 571 - netlink_register_notifier(&ipq_nl_notifier); 572 - ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, 573 - ipq_rcv_skb, NULL, THIS_MODULE); 574 - if (ipqnl == NULL) { 575 - printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); 576 - goto cleanup_netlink_notifier; 577 - } 578 - 579 - #ifdef CONFIG_PROC_FS 580 - proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, 581 - &ip_queue_proc_fops); 582 - if (!proc) { 583 - printk(KERN_ERR "ip_queue: failed to create proc entry\n"); 584 - goto cleanup_ipqnl; 585 - } 586 - #endif 587 - register_netdevice_notifier(&ipq_dev_notifier); 588 - #ifdef CONFIG_SYSCTL 589 - ipq_sysctl_header = register_net_sysctl(&init_net, "net/ipv4", ipq_table); 590 - #endif 591 - status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh); 592 - if (status < 0) { 593 - printk(KERN_ERR "ip_queue: failed to register queue handler\n"); 594 - goto cleanup_sysctl; 595 - } 596 - return status; 597 - 598 - cleanup_sysctl: 599 - #ifdef CONFIG_SYSCTL 600 - 
unregister_net_sysctl_table(ipq_sysctl_header); 601 - #endif 602 - unregister_netdevice_notifier(&ipq_dev_notifier); 603 - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); 604 - cleanup_ipqnl: __maybe_unused 605 - netlink_kernel_release(ipqnl); 606 - mutex_lock(&ipqnl_mutex); 607 - mutex_unlock(&ipqnl_mutex); 608 - 609 - cleanup_netlink_notifier: 610 - netlink_unregister_notifier(&ipq_nl_notifier); 611 - return status; 612 - } 613 - 614 - static void __exit ip_queue_fini(void) 615 - { 616 - nf_unregister_queue_handlers(&nfqh); 617 - 618 - ipq_flush(NULL, 0); 619 - 620 - #ifdef CONFIG_SYSCTL 621 - unregister_net_sysctl_table(ipq_sysctl_header); 622 - #endif 623 - unregister_netdevice_notifier(&ipq_dev_notifier); 624 - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); 625 - 626 - netlink_kernel_release(ipqnl); 627 - mutex_lock(&ipqnl_mutex); 628 - mutex_unlock(&ipqnl_mutex); 629 - 630 - netlink_unregister_notifier(&ipq_nl_notifier); 631 - } 632 - 633 - MODULE_DESCRIPTION("IPv4 packet queue handler"); 634 - MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 635 - MODULE_LICENSE("GPL"); 636 - MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL); 637 - 638 - module_init(ip_queue_init); 639 - module_exit(ip_queue_fini);
-22
net/ipv6/netfilter/Kconfig
··· 25 25 26 26 To compile it as a module, choose M here. If unsure, say N. 27 27 28 - config IP6_NF_QUEUE 29 - tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" 30 - depends on INET && IPV6 && NETFILTER 31 - depends on NETFILTER_ADVANCED 32 - ---help--- 33 - 34 - This option adds a queue handler to the kernel for IPv6 35 - packets which enables users to receive the filtered packets 36 - with QUEUE target using libipq. 37 - 38 - This option enables the old IPv6-only "ip6_queue" implementation 39 - which has been obsoleted by the new "nfnetlink_queue" code (see 40 - CONFIG_NETFILTER_NETLINK_QUEUE). 41 - 42 - (C) Fernando Anton 2001 43 - IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. 44 - Universidad Carlos III de Madrid 45 - Universidad Politecnica de Alcala de Henares 46 - email: <fanton@it.uc3m.es>. 47 - 48 - To compile it as a module, choose M here. If unsure, say N. 49 - 50 28 config IP6_NF_IPTABLES 51 29 tristate "IP6 tables support (required for filtering)" 52 30 depends on INET && IPV6
-1
net/ipv6/netfilter/Makefile
··· 6 6 obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o 7 7 obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o 8 8 obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o 9 - obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o 10 9 obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o 11 10 obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o 12 11
-641
net/ipv6/netfilter/ip6_queue.c
··· 1 - /* 2 - * This is a module which is used for queueing IPv6 packets and 3 - * communicating with userspace via netlink. 4 - * 5 - * (C) 2001 Fernando Anton, this code is GPL. 6 - * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. 7 - * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain 8 - * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain 9 - * email: fanton@it.uc3m.es 10 - * 11 - * This program is free software; you can redistribute it and/or modify 12 - * it under the terms of the GNU General Public License version 2 as 13 - * published by the Free Software Foundation. 14 - */ 15 - #include <linux/module.h> 16 - #include <linux/skbuff.h> 17 - #include <linux/init.h> 18 - #include <linux/ipv6.h> 19 - #include <linux/notifier.h> 20 - #include <linux/netdevice.h> 21 - #include <linux/netfilter.h> 22 - #include <linux/netlink.h> 23 - #include <linux/spinlock.h> 24 - #include <linux/sysctl.h> 25 - #include <linux/proc_fs.h> 26 - #include <linux/seq_file.h> 27 - #include <linux/mutex.h> 28 - #include <linux/slab.h> 29 - #include <net/net_namespace.h> 30 - #include <net/sock.h> 31 - #include <net/ipv6.h> 32 - #include <net/ip6_route.h> 33 - #include <net/netfilter/nf_queue.h> 34 - #include <linux/netfilter_ipv4/ip_queue.h> 35 - #include <linux/netfilter_ipv4/ip_tables.h> 36 - #include <linux/netfilter_ipv6/ip6_tables.h> 37 - 38 - #define IPQ_QMAX_DEFAULT 1024 39 - #define IPQ_PROC_FS_NAME "ip6_queue" 40 - #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" 41 - 42 - typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); 43 - 44 - static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; 45 - static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; 46 - static DEFINE_SPINLOCK(queue_lock); 47 - static int peer_pid __read_mostly; 48 - static unsigned int copy_range __read_mostly; 49 - static unsigned int queue_total; 50 - static unsigned int queue_dropped = 0; 51 - static unsigned int 
queue_user_dropped = 0; 52 - static struct sock *ipqnl __read_mostly; 53 - static LIST_HEAD(queue_list); 54 - static DEFINE_MUTEX(ipqnl_mutex); 55 - 56 - static inline void 57 - __ipq_enqueue_entry(struct nf_queue_entry *entry) 58 - { 59 - list_add_tail(&entry->list, &queue_list); 60 - queue_total++; 61 - } 62 - 63 - static inline int 64 - __ipq_set_mode(unsigned char mode, unsigned int range) 65 - { 66 - int status = 0; 67 - 68 - switch(mode) { 69 - case IPQ_COPY_NONE: 70 - case IPQ_COPY_META: 71 - copy_mode = mode; 72 - copy_range = 0; 73 - break; 74 - 75 - case IPQ_COPY_PACKET: 76 - if (range > 0xFFFF) 77 - range = 0xFFFF; 78 - copy_range = range; 79 - copy_mode = mode; 80 - break; 81 - 82 - default: 83 - status = -EINVAL; 84 - 85 - } 86 - return status; 87 - } 88 - 89 - static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); 90 - 91 - static inline void 92 - __ipq_reset(void) 93 - { 94 - peer_pid = 0; 95 - net_disable_timestamp(); 96 - __ipq_set_mode(IPQ_COPY_NONE, 0); 97 - __ipq_flush(NULL, 0); 98 - } 99 - 100 - static struct nf_queue_entry * 101 - ipq_find_dequeue_entry(unsigned long id) 102 - { 103 - struct nf_queue_entry *entry = NULL, *i; 104 - 105 - spin_lock_bh(&queue_lock); 106 - 107 - list_for_each_entry(i, &queue_list, list) { 108 - if ((unsigned long)i == id) { 109 - entry = i; 110 - break; 111 - } 112 - } 113 - 114 - if (entry) { 115 - list_del(&entry->list); 116 - queue_total--; 117 - } 118 - 119 - spin_unlock_bh(&queue_lock); 120 - return entry; 121 - } 122 - 123 - static void 124 - __ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 125 - { 126 - struct nf_queue_entry *entry, *next; 127 - 128 - list_for_each_entry_safe(entry, next, &queue_list, list) { 129 - if (!cmpfn || cmpfn(entry, data)) { 130 - list_del(&entry->list); 131 - queue_total--; 132 - nf_reinject(entry, NF_DROP); 133 - } 134 - } 135 - } 136 - 137 - static void 138 - ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 139 - { 140 - spin_lock_bh(&queue_lock); 141 - __ipq_flush(cmpfn, 
data); 142 - spin_unlock_bh(&queue_lock); 143 - } 144 - 145 - static struct sk_buff * 146 - ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) 147 - { 148 - sk_buff_data_t old_tail; 149 - size_t size = 0; 150 - size_t data_len = 0; 151 - struct sk_buff *skb; 152 - struct ipq_packet_msg *pmsg; 153 - struct nlmsghdr *nlh; 154 - struct timeval tv; 155 - 156 - switch (ACCESS_ONCE(copy_mode)) { 157 - case IPQ_COPY_META: 158 - case IPQ_COPY_NONE: 159 - size = NLMSG_SPACE(sizeof(*pmsg)); 160 - break; 161 - 162 - case IPQ_COPY_PACKET: 163 - if (entry->skb->ip_summed == CHECKSUM_PARTIAL && 164 - (*errp = skb_checksum_help(entry->skb))) 165 - return NULL; 166 - 167 - data_len = ACCESS_ONCE(copy_range); 168 - if (data_len == 0 || data_len > entry->skb->len) 169 - data_len = entry->skb->len; 170 - 171 - size = NLMSG_SPACE(sizeof(*pmsg) + data_len); 172 - break; 173 - 174 - default: 175 - *errp = -EINVAL; 176 - return NULL; 177 - } 178 - 179 - skb = alloc_skb(size, GFP_ATOMIC); 180 - if (!skb) 181 - goto nlmsg_failure; 182 - 183 - old_tail = skb->tail; 184 - nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); 185 - pmsg = NLMSG_DATA(nlh); 186 - memset(pmsg, 0, sizeof(*pmsg)); 187 - 188 - pmsg->packet_id = (unsigned long )entry; 189 - pmsg->data_len = data_len; 190 - tv = ktime_to_timeval(entry->skb->tstamp); 191 - pmsg->timestamp_sec = tv.tv_sec; 192 - pmsg->timestamp_usec = tv.tv_usec; 193 - pmsg->mark = entry->skb->mark; 194 - pmsg->hook = entry->hook; 195 - pmsg->hw_protocol = entry->skb->protocol; 196 - 197 - if (entry->indev) 198 - strcpy(pmsg->indev_name, entry->indev->name); 199 - else 200 - pmsg->indev_name[0] = '\0'; 201 - 202 - if (entry->outdev) 203 - strcpy(pmsg->outdev_name, entry->outdev->name); 204 - else 205 - pmsg->outdev_name[0] = '\0'; 206 - 207 - if (entry->indev && entry->skb->dev && 208 - entry->skb->mac_header != entry->skb->network_header) { 209 - pmsg->hw_type = entry->skb->dev->type; 210 - pmsg->hw_addrlen = 
dev_parse_header(entry->skb, pmsg->hw_addr); 211 - } 212 - 213 - if (data_len) 214 - if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) 215 - BUG(); 216 - 217 - nlh->nlmsg_len = skb->tail - old_tail; 218 - return skb; 219 - 220 - nlmsg_failure: 221 - kfree_skb(skb); 222 - *errp = -EINVAL; 223 - printk(KERN_ERR "ip6_queue: error creating packet message\n"); 224 - return NULL; 225 - } 226 - 227 - static int 228 - ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) 229 - { 230 - int status = -EINVAL; 231 - struct sk_buff *nskb; 232 - 233 - if (copy_mode == IPQ_COPY_NONE) 234 - return -EAGAIN; 235 - 236 - nskb = ipq_build_packet_message(entry, &status); 237 - if (nskb == NULL) 238 - return status; 239 - 240 - spin_lock_bh(&queue_lock); 241 - 242 - if (!peer_pid) 243 - goto err_out_free_nskb; 244 - 245 - if (queue_total >= queue_maxlen) { 246 - queue_dropped++; 247 - status = -ENOSPC; 248 - if (net_ratelimit()) 249 - printk (KERN_WARNING "ip6_queue: fill at %d entries, " 250 - "dropping packet(s). 
Dropped: %d\n", queue_total, 251 - queue_dropped); 252 - goto err_out_free_nskb; 253 - } 254 - 255 - /* netlink_unicast will either free the nskb or attach it to a socket */ 256 - status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); 257 - if (status < 0) { 258 - queue_user_dropped++; 259 - goto err_out_unlock; 260 - } 261 - 262 - __ipq_enqueue_entry(entry); 263 - 264 - spin_unlock_bh(&queue_lock); 265 - return status; 266 - 267 - err_out_free_nskb: 268 - kfree_skb(nskb); 269 - 270 - err_out_unlock: 271 - spin_unlock_bh(&queue_lock); 272 - return status; 273 - } 274 - 275 - static int 276 - ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) 277 - { 278 - int diff; 279 - struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; 280 - struct sk_buff *nskb; 281 - 282 - if (v->data_len < sizeof(*user_iph)) 283 - return 0; 284 - diff = v->data_len - e->skb->len; 285 - if (diff < 0) { 286 - if (pskb_trim(e->skb, v->data_len)) 287 - return -ENOMEM; 288 - } else if (diff > 0) { 289 - if (v->data_len > 0xFFFF) 290 - return -EINVAL; 291 - if (diff > skb_tailroom(e->skb)) { 292 - nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), 293 - diff, GFP_ATOMIC); 294 - if (!nskb) { 295 - printk(KERN_WARNING "ip6_queue: OOM " 296 - "in mangle, dropping packet\n"); 297 - return -ENOMEM; 298 - } 299 - kfree_skb(e->skb); 300 - e->skb = nskb; 301 - } 302 - skb_put(e->skb, diff); 303 - } 304 - if (!skb_make_writable(e->skb, v->data_len)) 305 - return -ENOMEM; 306 - skb_copy_to_linear_data(e->skb, v->payload, v->data_len); 307 - e->skb->ip_summed = CHECKSUM_NONE; 308 - 309 - return 0; 310 - } 311 - 312 - static int 313 - ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) 314 - { 315 - struct nf_queue_entry *entry; 316 - 317 - if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) 318 - return -EINVAL; 319 - 320 - entry = ipq_find_dequeue_entry(vmsg->id); 321 - if (entry == NULL) 322 - return -ENOENT; 323 - else { 324 - int verdict = vmsg->value; 325 
- 326 - if (vmsg->data_len && vmsg->data_len == len) 327 - if (ipq_mangle_ipv6(vmsg, entry) < 0) 328 - verdict = NF_DROP; 329 - 330 - nf_reinject(entry, verdict); 331 - return 0; 332 - } 333 - } 334 - 335 - static int 336 - ipq_set_mode(unsigned char mode, unsigned int range) 337 - { 338 - int status; 339 - 340 - spin_lock_bh(&queue_lock); 341 - status = __ipq_set_mode(mode, range); 342 - spin_unlock_bh(&queue_lock); 343 - return status; 344 - } 345 - 346 - static int 347 - ipq_receive_peer(struct ipq_peer_msg *pmsg, 348 - unsigned char type, unsigned int len) 349 - { 350 - int status = 0; 351 - 352 - if (len < sizeof(*pmsg)) 353 - return -EINVAL; 354 - 355 - switch (type) { 356 - case IPQM_MODE: 357 - status = ipq_set_mode(pmsg->msg.mode.value, 358 - pmsg->msg.mode.range); 359 - break; 360 - 361 - case IPQM_VERDICT: 362 - status = ipq_set_verdict(&pmsg->msg.verdict, 363 - len - sizeof(*pmsg)); 364 - break; 365 - default: 366 - status = -EINVAL; 367 - } 368 - return status; 369 - } 370 - 371 - static int 372 - dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) 373 - { 374 - if (entry->indev) 375 - if (entry->indev->ifindex == ifindex) 376 - return 1; 377 - 378 - if (entry->outdev) 379 - if (entry->outdev->ifindex == ifindex) 380 - return 1; 381 - #ifdef CONFIG_BRIDGE_NETFILTER 382 - if (entry->skb->nf_bridge) { 383 - if (entry->skb->nf_bridge->physindev && 384 - entry->skb->nf_bridge->physindev->ifindex == ifindex) 385 - return 1; 386 - if (entry->skb->nf_bridge->physoutdev && 387 - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) 388 - return 1; 389 - } 390 - #endif 391 - return 0; 392 - } 393 - 394 - static void 395 - ipq_dev_drop(int ifindex) 396 - { 397 - ipq_flush(dev_cmp, ifindex); 398 - } 399 - 400 - #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) 401 - 402 - static inline void 403 - __ipq_rcv_skb(struct sk_buff *skb) 404 - { 405 - int status, type, pid, flags; 406 - unsigned int nlmsglen, skblen; 407 - struct 
nlmsghdr *nlh; 408 - bool enable_timestamp = false; 409 - 410 - skblen = skb->len; 411 - if (skblen < sizeof(*nlh)) 412 - return; 413 - 414 - nlh = nlmsg_hdr(skb); 415 - nlmsglen = nlh->nlmsg_len; 416 - if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) 417 - return; 418 - 419 - pid = nlh->nlmsg_pid; 420 - flags = nlh->nlmsg_flags; 421 - 422 - if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) 423 - RCV_SKB_FAIL(-EINVAL); 424 - 425 - if (flags & MSG_TRUNC) 426 - RCV_SKB_FAIL(-ECOMM); 427 - 428 - type = nlh->nlmsg_type; 429 - if (type < NLMSG_NOOP || type >= IPQM_MAX) 430 - RCV_SKB_FAIL(-EINVAL); 431 - 432 - if (type <= IPQM_BASE) 433 - return; 434 - 435 - if (!capable(CAP_NET_ADMIN)) 436 - RCV_SKB_FAIL(-EPERM); 437 - 438 - spin_lock_bh(&queue_lock); 439 - 440 - if (peer_pid) { 441 - if (peer_pid != pid) { 442 - spin_unlock_bh(&queue_lock); 443 - RCV_SKB_FAIL(-EBUSY); 444 - } 445 - } else { 446 - enable_timestamp = true; 447 - peer_pid = pid; 448 - } 449 - 450 - spin_unlock_bh(&queue_lock); 451 - if (enable_timestamp) 452 - net_enable_timestamp(); 453 - 454 - status = ipq_receive_peer(NLMSG_DATA(nlh), type, 455 - nlmsglen - NLMSG_LENGTH(0)); 456 - if (status < 0) 457 - RCV_SKB_FAIL(status); 458 - 459 - if (flags & NLM_F_ACK) 460 - netlink_ack(skb, nlh, 0); 461 - } 462 - 463 - static void 464 - ipq_rcv_skb(struct sk_buff *skb) 465 - { 466 - mutex_lock(&ipqnl_mutex); 467 - __ipq_rcv_skb(skb); 468 - mutex_unlock(&ipqnl_mutex); 469 - } 470 - 471 - static int 472 - ipq_rcv_dev_event(struct notifier_block *this, 473 - unsigned long event, void *ptr) 474 - { 475 - struct net_device *dev = ptr; 476 - 477 - if (!net_eq(dev_net(dev), &init_net)) 478 - return NOTIFY_DONE; 479 - 480 - /* Drop any packets associated with the downed device */ 481 - if (event == NETDEV_DOWN) 482 - ipq_dev_drop(dev->ifindex); 483 - return NOTIFY_DONE; 484 - } 485 - 486 - static struct notifier_block ipq_dev_notifier = { 487 - .notifier_call = ipq_rcv_dev_event, 488 - }; 489 - 490 - 
static int 491 - ipq_rcv_nl_event(struct notifier_block *this, 492 - unsigned long event, void *ptr) 493 - { 494 - struct netlink_notify *n = ptr; 495 - 496 - if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) { 497 - spin_lock_bh(&queue_lock); 498 - if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) 499 - __ipq_reset(); 500 - spin_unlock_bh(&queue_lock); 501 - } 502 - return NOTIFY_DONE; 503 - } 504 - 505 - static struct notifier_block ipq_nl_notifier = { 506 - .notifier_call = ipq_rcv_nl_event, 507 - }; 508 - 509 - #ifdef CONFIG_SYSCTL 510 - static struct ctl_table_header *ipq_sysctl_header; 511 - 512 - static ctl_table ipq_table[] = { 513 - { 514 - .procname = NET_IPQ_QMAX_NAME, 515 - .data = &queue_maxlen, 516 - .maxlen = sizeof(queue_maxlen), 517 - .mode = 0644, 518 - .proc_handler = proc_dointvec 519 - }, 520 - { } 521 - }; 522 - #endif 523 - 524 - #ifdef CONFIG_PROC_FS 525 - static int ip6_queue_show(struct seq_file *m, void *v) 526 - { 527 - spin_lock_bh(&queue_lock); 528 - 529 - seq_printf(m, 530 - "Peer PID : %d\n" 531 - "Copy mode : %hu\n" 532 - "Copy range : %u\n" 533 - "Queue length : %u\n" 534 - "Queue max. 
length : %u\n" 535 - "Queue dropped : %u\n" 536 - "Netfilter dropped : %u\n", 537 - peer_pid, 538 - copy_mode, 539 - copy_range, 540 - queue_total, 541 - queue_maxlen, 542 - queue_dropped, 543 - queue_user_dropped); 544 - 545 - spin_unlock_bh(&queue_lock); 546 - return 0; 547 - } 548 - 549 - static int ip6_queue_open(struct inode *inode, struct file *file) 550 - { 551 - return single_open(file, ip6_queue_show, NULL); 552 - } 553 - 554 - static const struct file_operations ip6_queue_proc_fops = { 555 - .open = ip6_queue_open, 556 - .read = seq_read, 557 - .llseek = seq_lseek, 558 - .release = single_release, 559 - .owner = THIS_MODULE, 560 - }; 561 - #endif 562 - 563 - static const struct nf_queue_handler nfqh = { 564 - .name = "ip6_queue", 565 - .outfn = &ipq_enqueue_packet, 566 - }; 567 - 568 - static int __init ip6_queue_init(void) 569 - { 570 - int status = -ENOMEM; 571 - struct proc_dir_entry *proc __maybe_unused; 572 - 573 - netlink_register_notifier(&ipq_nl_notifier); 574 - ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, 575 - ipq_rcv_skb, NULL, THIS_MODULE); 576 - if (ipqnl == NULL) { 577 - printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); 578 - goto cleanup_netlink_notifier; 579 - } 580 - 581 - #ifdef CONFIG_PROC_FS 582 - proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, 583 - &ip6_queue_proc_fops); 584 - if (!proc) { 585 - printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); 586 - goto cleanup_ipqnl; 587 - } 588 - #endif 589 - register_netdevice_notifier(&ipq_dev_notifier); 590 - #ifdef CONFIG_SYSCTL 591 - ipq_sysctl_header = register_net_sysctl(&init_net, "net/ipv6", ipq_table); 592 - #endif 593 - status = nf_register_queue_handler(NFPROTO_IPV6, &nfqh); 594 - if (status < 0) { 595 - printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); 596 - goto cleanup_sysctl; 597 - } 598 - return status; 599 - 600 - cleanup_sysctl: 601 - #ifdef CONFIG_SYSCTL 602 - unregister_net_sysctl_table(ipq_sysctl_header); 
603 - #endif 604 - unregister_netdevice_notifier(&ipq_dev_notifier); 605 - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); 606 - 607 - cleanup_ipqnl: __maybe_unused 608 - netlink_kernel_release(ipqnl); 609 - mutex_lock(&ipqnl_mutex); 610 - mutex_unlock(&ipqnl_mutex); 611 - 612 - cleanup_netlink_notifier: 613 - netlink_unregister_notifier(&ipq_nl_notifier); 614 - return status; 615 - } 616 - 617 - static void __exit ip6_queue_fini(void) 618 - { 619 - nf_unregister_queue_handlers(&nfqh); 620 - 621 - ipq_flush(NULL, 0); 622 - 623 - #ifdef CONFIG_SYSCTL 624 - unregister_net_sysctl_table(ipq_sysctl_header); 625 - #endif 626 - unregister_netdevice_notifier(&ipq_dev_notifier); 627 - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); 628 - 629 - netlink_kernel_release(ipqnl); 630 - mutex_lock(&ipqnl_mutex); 631 - mutex_unlock(&ipqnl_mutex); 632 - 633 - netlink_unregister_notifier(&ipq_nl_notifier); 634 - } 635 - 636 - MODULE_DESCRIPTION("IPv6 packet queue handler"); 637 - MODULE_LICENSE("GPL"); 638 - MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW); 639 - 640 - module_init(ip6_queue_init); 641 - module_exit(ip6_queue_fini);
+52 -18
net/netfilter/ipvs/ip_vs_conn.c
··· 548 548 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 549 549 { 550 550 unsigned int conn_flags; 551 + __u32 flags; 551 552 552 553 /* if dest is NULL, then return directly */ 553 554 if (!dest) ··· 560 559 conn_flags = atomic_read(&dest->conn_flags); 561 560 if (cp->protocol != IPPROTO_UDP) 562 561 conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; 562 + flags = cp->flags; 563 563 /* Bind with the destination and its corresponding transmitter */ 564 - if (cp->flags & IP_VS_CONN_F_SYNC) { 564 + if (flags & IP_VS_CONN_F_SYNC) { 565 565 /* if the connection is not template and is created 566 566 * by sync, preserve the activity flag. 567 567 */ 568 - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) 568 + if (!(flags & IP_VS_CONN_F_TEMPLATE)) 569 569 conn_flags &= ~IP_VS_CONN_F_INACTIVE; 570 570 /* connections inherit forwarding method from dest */ 571 - cp->flags &= ~IP_VS_CONN_F_FWD_MASK; 571 + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); 572 572 } 573 - cp->flags |= conn_flags; 573 + flags |= conn_flags; 574 + cp->flags = flags; 574 575 cp->dest = dest; 575 576 576 577 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " ··· 587 584 atomic_read(&dest->refcnt)); 588 585 589 586 /* Update the connection counters */ 590 - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 591 - /* It is a normal connection, so increase the inactive 592 - connection counter because it is in TCP SYNRECV 593 - state (inactive) or other protocol inacive state */ 594 - if ((cp->flags & IP_VS_CONN_F_SYNC) && 595 - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) 587 + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 588 + /* It is a normal connection, so modify the counters 589 + * according to the flags, later the protocol can 590 + * update them on state change 591 + */ 592 + if (!(flags & IP_VS_CONN_F_INACTIVE)) 596 593 atomic_inc(&dest->activeconns); 597 594 else 598 595 atomic_inc(&dest->inactconns); ··· 616 613 { 617 614 struct ip_vs_dest *dest; 618 615 619 - if ((cp) && (!cp->dest)) { 620 - 
dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, 621 - cp->dport, &cp->vaddr, cp->vport, 622 - cp->protocol, cp->fwmark, cp->flags); 616 + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, 617 + cp->dport, &cp->vaddr, cp->vport, 618 + cp->protocol, cp->fwmark, cp->flags); 619 + if (dest) { 620 + struct ip_vs_proto_data *pd; 621 + 622 + spin_lock(&cp->lock); 623 + if (cp->dest) { 624 + spin_unlock(&cp->lock); 625 + return dest; 626 + } 627 + 628 + /* Applications work depending on the forwarding method 629 + * but better to reassign them always when binding dest */ 630 + if (cp->app) 631 + ip_vs_unbind_app(cp); 632 + 623 633 ip_vs_bind_dest(cp, dest); 624 - return dest; 625 - } else 626 - return NULL; 634 + spin_unlock(&cp->lock); 635 + 636 + /* Update its packet transmitter */ 637 + cp->packet_xmit = NULL; 638 + #ifdef CONFIG_IP_VS_IPV6 639 + if (cp->af == AF_INET6) 640 + ip_vs_bind_xmit_v6(cp); 641 + else 642 + #endif 643 + ip_vs_bind_xmit(cp); 644 + 645 + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); 646 + if (pd && atomic_read(&pd->appcnt)) 647 + ip_vs_bind_app(cp, pd->pp); 648 + } 649 + return dest; 627 650 } 628 651 629 652 ··· 772 743 static void ip_vs_conn_expire(unsigned long data) 773 744 { 774 745 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 775 - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); 746 + struct net *net = ip_vs_conn_net(cp); 747 + struct netns_ipvs *ipvs = net_ipvs(net); 776 748 777 749 cp->timeout = 60*HZ; 778 750 ··· 837 807 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", 838 808 atomic_read(&cp->refcnt)-1, 839 809 atomic_read(&cp->n_control)); 810 + 811 + if (ipvs->sync_state & IP_VS_STATE_MASTER) 812 + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); 840 813 841 814 ip_vs_conn_put(cp); 842 815 } ··· 914 881 /* Set its state and timeout */ 915 882 cp->state = 0; 916 883 cp->timeout = 3*HZ; 884 + cp->sync_endtime = jiffies & ~3UL; 917 885 918 886 /* Bind its 
packet transmitter */ 919 887 #ifdef CONFIG_IP_VS_IPV6
+2 -28
net/netfilter/ipvs/ip_vs_core.c
··· 1613 1613 else 1614 1614 pkts = atomic_add_return(1, &cp->in_pkts); 1615 1615 1616 - if ((ipvs->sync_state & IP_VS_STATE_MASTER) && 1617 - cp->protocol == IPPROTO_SCTP) { 1618 - if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && 1619 - (pkts % sysctl_sync_period(ipvs) 1620 - == sysctl_sync_threshold(ipvs))) || 1621 - (cp->old_state != cp->state && 1622 - ((cp->state == IP_VS_SCTP_S_CLOSED) || 1623 - (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || 1624 - (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { 1625 - ip_vs_sync_conn(net, cp); 1626 - goto out; 1627 - } 1628 - } 1629 - 1630 - /* Keep this block last: TCP and others with pp->num_states <= 1 */ 1631 - else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && 1632 - (((cp->protocol != IPPROTO_TCP || 1633 - cp->state == IP_VS_TCP_S_ESTABLISHED) && 1634 - (pkts % sysctl_sync_period(ipvs) 1635 - == sysctl_sync_threshold(ipvs))) || 1636 - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && 1637 - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || 1638 - (cp->state == IP_VS_TCP_S_CLOSE) || 1639 - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || 1640 - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) 1641 - ip_vs_sync_conn(net, cp); 1642 - out: 1643 - cp->old_state = cp->state; 1616 + if (ipvs->sync_state & IP_VS_STATE_MASTER) 1617 + ip_vs_sync_conn(net, cp, pkts); 1644 1618 1645 1619 ip_vs_conn_put(cp); 1646 1620 return ret;
+66 -4
net/netfilter/ipvs/ip_vs_ctl.c
··· 1599 1599 } 1600 1600 1601 1601 #ifdef CONFIG_SYSCTL 1602 + 1603 + static int zero; 1604 + static int three = 3; 1605 + 1602 1606 static int 1603 1607 proc_do_defense_mode(ctl_table *table, int write, 1604 1608 void __user *buffer, size_t *lenp, loff_t *ppos) ··· 1636 1632 memcpy(val, valp, sizeof(val)); 1637 1633 1638 1634 rc = proc_dointvec(table, write, buffer, lenp, ppos); 1639 - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { 1635 + if (write && (valp[0] < 0 || valp[1] < 0 || 1636 + (valp[0] >= valp[1] && valp[1]))) { 1640 1637 /* Restore the correct value */ 1641 1638 memcpy(valp, val, sizeof(val)); 1642 1639 } ··· 1657 1652 if ((*valp < 0) || (*valp > 1)) { 1658 1653 /* Restore the correct value */ 1659 1654 *valp = val; 1660 - } else { 1661 - struct net *net = current->nsproxy->net_ns; 1662 - ip_vs_sync_switch_mode(net, val); 1655 + } 1656 + } 1657 + return rc; 1658 + } 1659 + 1660 + static int 1661 + proc_do_sync_ports(ctl_table *table, int write, 1662 + void __user *buffer, size_t *lenp, loff_t *ppos) 1663 + { 1664 + int *valp = table->data; 1665 + int val = *valp; 1666 + int rc; 1667 + 1668 + rc = proc_dointvec(table, write, buffer, lenp, ppos); 1669 + if (write && (*valp != val)) { 1670 + if (*valp < 1 || !is_power_of_2(*valp)) { 1671 + /* Restore the correct value */ 1672 + *valp = val; 1663 1673 } 1664 1674 } 1665 1675 return rc; ··· 1738 1718 .proc_handler = &proc_do_sync_mode, 1739 1719 }, 1740 1720 { 1721 + .procname = "sync_ports", 1722 + .maxlen = sizeof(int), 1723 + .mode = 0644, 1724 + .proc_handler = &proc_do_sync_ports, 1725 + }, 1726 + { 1727 + .procname = "sync_qlen_max", 1728 + .maxlen = sizeof(int), 1729 + .mode = 0644, 1730 + .proc_handler = proc_dointvec, 1731 + }, 1732 + { 1733 + .procname = "sync_sock_size", 1734 + .maxlen = sizeof(int), 1735 + .mode = 0644, 1736 + .proc_handler = proc_dointvec, 1737 + }, 1738 + { 1741 1739 .procname = "cache_bypass", 1742 1740 .maxlen = sizeof(int), 1743 1741 .mode = 0644, ··· 
1779 1741 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), 1780 1742 .mode = 0644, 1781 1743 .proc_handler = proc_do_sync_threshold, 1744 + }, 1745 + { 1746 + .procname = "sync_refresh_period", 1747 + .maxlen = sizeof(int), 1748 + .mode = 0644, 1749 + .proc_handler = proc_dointvec_jiffies, 1750 + }, 1751 + { 1752 + .procname = "sync_retries", 1753 + .maxlen = sizeof(int), 1754 + .mode = 0644, 1755 + .proc_handler = proc_dointvec_minmax, 1756 + .extra1 = &zero, 1757 + .extra2 = &three, 1782 1758 }, 1783 1759 { 1784 1760 .procname = "nat_icmp_send", ··· 3707 3655 tbl[idx++].data = &ipvs->sysctl_snat_reroute; 3708 3656 ipvs->sysctl_sync_ver = 1; 3709 3657 tbl[idx++].data = &ipvs->sysctl_sync_ver; 3658 + ipvs->sysctl_sync_ports = 1; 3659 + tbl[idx++].data = &ipvs->sysctl_sync_ports; 3660 + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; 3661 + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; 3662 + ipvs->sysctl_sync_sock_size = 0; 3663 + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; 3710 3664 tbl[idx++].data = &ipvs->sysctl_cache_bypass; 3711 3665 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; 3712 3666 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ··· 3720 3662 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; 3721 3663 tbl[idx].data = &ipvs->sysctl_sync_threshold; 3722 3664 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); 3665 + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; 3666 + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; 3667 + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); 3668 + tbl[idx++].data = &ipvs->sysctl_sync_retries; 3723 3669 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; 3724 3670 3725 3671
+1 -1
net/netfilter/ipvs/ip_vs_dh.c
··· 149 149 150 150 /* allocate the DH table for this service */ 151 151 tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, 152 - GFP_ATOMIC); 152 + GFP_KERNEL); 153 153 if (tbl == NULL) 154 154 return -ENOMEM; 155 155
+1 -1
net/netfilter/ipvs/ip_vs_ftp.c
··· 485 485 .exit = __ip_vs_ftp_exit, 486 486 }; 487 487 488 - int __init ip_vs_ftp_init(void) 488 + static int __init ip_vs_ftp_init(void) 489 489 { 490 490 int rv; 491 491
+1 -1
net/netfilter/ipvs/ip_vs_lblc.c
··· 342 342 /* 343 343 * Allocate the ip_vs_lblc_table for this service 344 344 */ 345 - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); 345 + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); 346 346 if (tbl == NULL) 347 347 return -ENOMEM; 348 348
+1 -1
net/netfilter/ipvs/ip_vs_lblcr.c
··· 511 511 /* 512 512 * Allocate the ip_vs_lblcr_table for this service 513 513 */ 514 - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); 514 + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); 515 515 if (tbl == NULL) 516 516 return -ENOMEM; 517 517
+3 -3
net/netfilter/ipvs/ip_vs_proto.c
··· 68 68 struct netns_ipvs *ipvs = net_ipvs(net); 69 69 unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); 70 70 struct ip_vs_proto_data *pd = 71 - kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); 71 + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); 72 72 73 73 if (!pd) 74 74 return -ENOMEM; ··· 156 156 /* 157 157 * get ip_vs_protocol object data by netns and proto 158 158 */ 159 - struct ip_vs_proto_data * 159 + static struct ip_vs_proto_data * 160 160 __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) 161 161 { 162 162 struct ip_vs_proto_data *pd; ··· 199 199 int * 200 200 ip_vs_create_timeout_table(int *table, int size) 201 201 { 202 - return kmemdup(table, size, GFP_ATOMIC); 202 + return kmemdup(table, size, GFP_KERNEL); 203 203 } 204 204 205 205
+1 -1
net/netfilter/ipvs/ip_vs_sh.c
··· 162 162 163 163 /* allocate the SH table for this service */ 164 164 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, 165 - GFP_ATOMIC); 165 + GFP_KERNEL); 166 166 if (tbl == NULL) 167 167 return -ENOMEM; 168 168
+455 -207
net/netfilter/ipvs/ip_vs_sync.c
··· 196 196 struct net *net; 197 197 struct socket *sock; 198 198 char *buf; 199 + int id; 199 200 }; 200 201 201 202 /* Version 0 definition of packet sizes */ ··· 272 271 unsigned char *end; 273 272 }; 274 273 275 - /* multicast addr */ 276 - static struct sockaddr_in mcast_addr = { 277 - .sin_family = AF_INET, 278 - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), 279 - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 280 - }; 281 - 282 274 /* 283 275 * Copy of struct ip_vs_seq 284 276 * From unaligned network order to aligned host order ··· 294 300 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 295 301 } 296 302 297 - static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) 303 + static inline struct ip_vs_sync_buff * 304 + sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 298 305 { 299 306 struct ip_vs_sync_buff *sb; 300 307 301 308 spin_lock_bh(&ipvs->sync_lock); 302 - if (list_empty(&ipvs->sync_queue)) { 309 + if (list_empty(&ms->sync_queue)) { 303 310 sb = NULL; 311 + __set_current_state(TASK_INTERRUPTIBLE); 304 312 } else { 305 - sb = list_entry(ipvs->sync_queue.next, 306 - struct ip_vs_sync_buff, 313 + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 307 314 list); 308 315 list_del(&sb->list); 316 + ms->sync_queue_len--; 317 + if (!ms->sync_queue_len) 318 + ms->sync_queue_delay = 0; 309 319 } 310 320 spin_unlock_bh(&ipvs->sync_lock); 311 321 ··· 332 334 kfree(sb); 333 335 return NULL; 334 336 } 335 - sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ 337 + sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 336 338 sb->mesg->version = SYNC_PROTO_VER; 337 339 sb->mesg->syncid = ipvs->master_syncid; 338 340 sb->mesg->size = sizeof(struct ip_vs_sync_mesg); ··· 351 353 kfree(sb); 352 354 } 353 355 354 - static inline void sb_queue_tail(struct netns_ipvs *ipvs) 356 + static inline void sb_queue_tail(struct netns_ipvs *ipvs, 357 + struct ipvs_master_sync_state *ms) 355 358 { 356 - struct ip_vs_sync_buff *sb = ipvs->sync_buff; 359 + struct ip_vs_sync_buff *sb = ms->sync_buff; 357 360 358 361 spin_lock(&ipvs->sync_lock); 359 - if (ipvs->sync_state & IP_VS_STATE_MASTER) 360 - list_add_tail(&sb->list, &ipvs->sync_queue); 361 - else 362 + if (ipvs->sync_state & IP_VS_STATE_MASTER && 363 + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 364 + if (!ms->sync_queue_len) 365 + schedule_delayed_work(&ms->master_wakeup_work, 366 + max(IPVS_SYNC_SEND_DELAY, 1)); 367 + ms->sync_queue_len++; 368 + list_add_tail(&sb->list, &ms->sync_queue); 369 + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) 370 + wake_up_process(ms->master_thread); 371 + } else 362 372 ip_vs_sync_buff_release(sb); 363 373 spin_unlock(&ipvs->sync_lock); 364 374 } ··· 376 370 * than the specified time or the specified time is zero. 
377 371 */ 378 372 static inline struct ip_vs_sync_buff * 379 - get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) 373 + get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 374 + unsigned long time) 380 375 { 381 376 struct ip_vs_sync_buff *sb; 382 377 383 378 spin_lock_bh(&ipvs->sync_buff_lock); 384 - if (ipvs->sync_buff && 385 - time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { 386 - sb = ipvs->sync_buff; 387 - ipvs->sync_buff = NULL; 379 + sb = ms->sync_buff; 380 + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 381 + ms->sync_buff = NULL; 382 + __set_current_state(TASK_RUNNING); 388 383 } else 389 384 sb = NULL; 390 385 spin_unlock_bh(&ipvs->sync_buff_lock); 391 386 return sb; 392 387 } 393 388 394 - /* 395 - * Switch mode from sending version 0 or 1 396 - * - must handle sync_buf 397 - */ 398 - void ip_vs_sync_switch_mode(struct net *net, int mode) 389 + static inline int 390 + select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 399 391 { 400 - struct netns_ipvs *ipvs = net_ipvs(net); 401 - 402 - if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) 403 - return; 404 - if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) 405 - return; 406 - 407 - spin_lock_bh(&ipvs->sync_buff_lock); 408 - /* Buffer empty ? then let buf_create do the job */ 409 - if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { 410 - kfree(ipvs->sync_buff); 411 - ipvs->sync_buff = NULL; 412 - } else { 413 - spin_lock_bh(&ipvs->sync_lock); 414 - if (ipvs->sync_state & IP_VS_STATE_MASTER) 415 - list_add_tail(&ipvs->sync_buff->list, 416 - &ipvs->sync_queue); 417 - else 418 - ip_vs_sync_buff_release(ipvs->sync_buff); 419 - spin_unlock_bh(&ipvs->sync_lock); 420 - } 421 - spin_unlock_bh(&ipvs->sync_buff_lock); 392 + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 422 393 } 423 394 424 395 /* ··· 425 442 return sb; 426 443 } 427 444 445 + /* Check if conn should be synced. 
446 + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 447 + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry 448 + * sync_retries times with period of sync_refresh_period/8 449 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only 450 + * for state changes or only once when pkts matches sync_threshold 451 + * - (3) templates: rate can be reduced only with sync_refresh_period or 452 + * with (2) 453 + */ 454 + static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, 455 + struct ip_vs_conn *cp, int pkts) 456 + { 457 + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); 458 + unsigned long now = jiffies; 459 + unsigned long n = (now + cp->timeout) & ~3UL; 460 + unsigned int sync_refresh_period; 461 + int sync_period; 462 + int force; 463 + 464 + /* Check if we sync in current state */ 465 + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) 466 + force = 0; 467 + else if (likely(cp->protocol == IPPROTO_TCP)) { 468 + if (!((1 << cp->state) & 469 + ((1 << IP_VS_TCP_S_ESTABLISHED) | 470 + (1 << IP_VS_TCP_S_FIN_WAIT) | 471 + (1 << IP_VS_TCP_S_CLOSE) | 472 + (1 << IP_VS_TCP_S_CLOSE_WAIT) | 473 + (1 << IP_VS_TCP_S_TIME_WAIT)))) 474 + return 0; 475 + force = cp->state != cp->old_state; 476 + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) 477 + goto set; 478 + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { 479 + if (!((1 << cp->state) & 480 + ((1 << IP_VS_SCTP_S_ESTABLISHED) | 481 + (1 << IP_VS_SCTP_S_CLOSED) | 482 + (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) | 483 + (1 << IP_VS_SCTP_S_SHUT_ACK_SER)))) 484 + return 0; 485 + force = cp->state != cp->old_state; 486 + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) 487 + goto set; 488 + } else { 489 + /* UDP or another protocol with single state */ 490 + force = 0; 491 + } 492 + 493 + sync_refresh_period = sysctl_sync_refresh_period(ipvs); 494 + if (sync_refresh_period > 0) { 495 + long diff = n - orig; 496 + long min_diff = max(cp->timeout >> 1, 10UL * HZ); 497 + 
498 + /* Avoid sync if difference is below sync_refresh_period 499 + * and below the half timeout. 500 + */ 501 + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { 502 + int retries = orig & 3; 503 + 504 + if (retries >= sysctl_sync_retries(ipvs)) 505 + return 0; 506 + if (time_before(now, orig - cp->timeout + 507 + (sync_refresh_period >> 3))) 508 + return 0; 509 + n |= retries + 1; 510 + } 511 + } 512 + sync_period = sysctl_sync_period(ipvs); 513 + if (sync_period > 0) { 514 + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && 515 + pkts % sync_period != sysctl_sync_threshold(ipvs)) 516 + return 0; 517 + } else if (sync_refresh_period <= 0 && 518 + pkts != sysctl_sync_threshold(ipvs)) 519 + return 0; 520 + 521 + set: 522 + cp->old_state = cp->state; 523 + n = cmpxchg(&cp->sync_endtime, orig, n); 524 + return n == orig || force; 525 + } 526 + 428 527 /* 429 528 * Version 0 , could be switched in by sys_ctl. 430 529 * Add an ip_vs_conn information into the current sync_buff. 431 530 */ 432 - void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) 531 + static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, 532 + int pkts) 433 533 { 434 534 struct netns_ipvs *ipvs = net_ipvs(net); 435 535 struct ip_vs_sync_mesg_v0 *m; 436 536 struct ip_vs_sync_conn_v0 *s; 537 + struct ip_vs_sync_buff *buff; 538 + struct ipvs_master_sync_state *ms; 539 + int id; 437 540 int len; 438 541 439 542 if (unlikely(cp->af != AF_INET)) ··· 528 459 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 529 460 return; 530 461 462 + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 463 + return; 464 + 531 465 spin_lock(&ipvs->sync_buff_lock); 532 - if (!ipvs->sync_buff) { 533 - ipvs->sync_buff = 534 - ip_vs_sync_buff_create_v0(ipvs); 535 - if (!ipvs->sync_buff) { 466 + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 467 + spin_unlock(&ipvs->sync_buff_lock); 468 + return; 469 + } 470 + 471 + id = select_master_thread_id(ipvs, cp); 472 + ms = &ipvs->ms[id]; 473 + buff = ms->sync_buff; 
474 + if (buff) { 475 + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 476 + /* Send buffer if it is for v1 */ 477 + if (!m->nr_conns) { 478 + sb_queue_tail(ipvs, ms); 479 + ms->sync_buff = NULL; 480 + buff = NULL; 481 + } 482 + } 483 + if (!buff) { 484 + buff = ip_vs_sync_buff_create_v0(ipvs); 485 + if (!buff) { 536 486 spin_unlock(&ipvs->sync_buff_lock); 537 487 pr_err("ip_vs_sync_buff_create failed.\n"); 538 488 return; 539 489 } 490 + ms->sync_buff = buff; 540 491 } 541 492 542 493 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 543 494 SIMPLE_CONN_SIZE; 544 - m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; 545 - s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; 495 + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 496 + s = (struct ip_vs_sync_conn_v0 *) buff->head; 546 497 547 498 /* copy members */ 548 499 s->reserved = 0; ··· 583 494 584 495 m->nr_conns++; 585 496 m->size += len; 586 - ipvs->sync_buff->head += len; 497 + buff->head += len; 587 498 588 499 /* check if there is a space for next one */ 589 - if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { 590 - sb_queue_tail(ipvs); 591 - ipvs->sync_buff = NULL; 500 + if (buff->head + FULL_CONN_SIZE > buff->end) { 501 + sb_queue_tail(ipvs, ms); 502 + ms->sync_buff = NULL; 592 503 } 593 504 spin_unlock(&ipvs->sync_buff_lock); 594 505 595 506 /* synchronize its controller if it has */ 596 - if (cp->control) 597 - ip_vs_sync_conn(net, cp->control); 507 + cp = cp->control; 508 + if (cp) { 509 + if (cp->flags & IP_VS_CONN_F_TEMPLATE) 510 + pkts = atomic_add_return(1, &cp->in_pkts); 511 + else 512 + pkts = sysctl_sync_threshold(ipvs); 513 + ip_vs_sync_conn(net, cp, pkts); 514 + } 598 515 } 599 516 600 517 /* ··· 608 513 * Called by ip_vs_in. 
609 514 * Sending Version 1 messages 610 515 */ 611 - void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) 516 + void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) 612 517 { 613 518 struct netns_ipvs *ipvs = net_ipvs(net); 614 519 struct ip_vs_sync_mesg *m; 615 520 union ip_vs_sync_conn *s; 521 + struct ip_vs_sync_buff *buff; 522 + struct ipvs_master_sync_state *ms; 523 + int id; 616 524 __u8 *p; 617 525 unsigned int len, pe_name_len, pad; 618 526 619 527 /* Handle old version of the protocol */ 620 528 if (sysctl_sync_ver(ipvs) == 0) { 621 - ip_vs_sync_conn_v0(net, cp); 529 + ip_vs_sync_conn_v0(net, cp, pkts); 622 530 return; 623 531 } 624 532 /* Do not sync ONE PACKET */ 625 533 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 626 534 goto control; 627 535 sloop: 536 + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 537 + goto control; 538 + 628 539 /* Sanity checks */ 629 540 pe_name_len = 0; 630 541 if (cp->pe_data_len) { ··· 642 541 } 643 542 644 543 spin_lock(&ipvs->sync_buff_lock); 544 + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 545 + spin_unlock(&ipvs->sync_buff_lock); 546 + return; 547 + } 548 + 549 + id = select_master_thread_id(ipvs, cp); 550 + ms = &ipvs->ms[id]; 645 551 646 552 #ifdef CONFIG_IP_VS_IPV6 647 553 if (cp->af == AF_INET6) ··· 667 559 668 560 /* check if there is a space for this one */ 669 561 pad = 0; 670 - if (ipvs->sync_buff) { 671 - pad = (4 - (size_t)ipvs->sync_buff->head) & 3; 672 - if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { 673 - sb_queue_tail(ipvs); 674 - ipvs->sync_buff = NULL; 562 + buff = ms->sync_buff; 563 + if (buff) { 564 + m = buff->mesg; 565 + pad = (4 - (size_t) buff->head) & 3; 566 + /* Send buffer if it is for v0 */ 567 + if (buff->head + len + pad > buff->end || m->reserved) { 568 + sb_queue_tail(ipvs, ms); 569 + ms->sync_buff = NULL; 570 + buff = NULL; 675 571 pad = 0; 676 572 } 677 573 } 678 574 679 - if (!ipvs->sync_buff) { 680 - ipvs->sync_buff = 
ip_vs_sync_buff_create(ipvs); 681 - if (!ipvs->sync_buff) { 575 + if (!buff) { 576 + buff = ip_vs_sync_buff_create(ipvs); 577 + if (!buff) { 682 578 spin_unlock(&ipvs->sync_buff_lock); 683 579 pr_err("ip_vs_sync_buff_create failed.\n"); 684 580 return; 685 581 } 582 + ms->sync_buff = buff; 583 + m = buff->mesg; 686 584 } 687 585 688 - m = ipvs->sync_buff->mesg; 689 - p = ipvs->sync_buff->head; 690 - ipvs->sync_buff->head += pad + len; 586 + p = buff->head; 587 + buff->head += pad + len; 691 588 m->size += pad + len; 692 589 /* Add ev. padding from prev. sync_conn */ 693 590 while (pad--) ··· 757 644 cp = cp->control; 758 645 if (!cp) 759 646 return; 760 - /* 761 - * Reduce sync rate for templates 762 - * i.e only increment in_pkts for Templates. 763 - */ 764 - if (cp->flags & IP_VS_CONN_F_TEMPLATE) { 765 - int pkts = atomic_add_return(1, &cp->in_pkts); 766 - 767 - if (pkts % sysctl_sync_period(ipvs) != 1) 768 - return; 769 - } 647 + if (cp->flags & IP_VS_CONN_F_TEMPLATE) 648 + pkts = atomic_add_return(1, &cp->in_pkts); 649 + else 650 + pkts = sysctl_sync_threshold(ipvs); 770 651 goto sloop; 771 652 } 772 653 ··· 838 731 else 839 732 cp = ip_vs_ct_in_get(param); 840 733 841 - if (cp && param->pe_data) /* Free pe_data */ 734 + if (cp) { 735 + /* Free pe_data */ 842 736 kfree(param->pe_data); 843 - if (!cp) { 737 + 738 + dest = cp->dest; 739 + spin_lock(&cp->lock); 740 + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 741 + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 742 + if (flags & IP_VS_CONN_F_INACTIVE) { 743 + atomic_dec(&dest->activeconns); 744 + atomic_inc(&dest->inactconns); 745 + } else { 746 + atomic_inc(&dest->activeconns); 747 + atomic_dec(&dest->inactconns); 748 + } 749 + } 750 + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 751 + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 752 + cp->flags = flags; 753 + spin_unlock(&cp->lock); 754 + if (!dest) { 755 + dest = ip_vs_try_bind_dest(cp); 756 + if (dest) 757 + atomic_dec(&dest->refcnt); 758 + } 759 + } 
else { 844 760 /* 845 761 * Find the appropriate destination for the connection. 846 762 * If it is not found the connection will remain unbound ··· 872 742 dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, 873 743 param->vport, protocol, fwmark, flags); 874 744 875 - /* Set the approprite ativity flag */ 876 - if (protocol == IPPROTO_TCP) { 877 - if (state != IP_VS_TCP_S_ESTABLISHED) 878 - flags |= IP_VS_CONN_F_INACTIVE; 879 - else 880 - flags &= ~IP_VS_CONN_F_INACTIVE; 881 - } else if (protocol == IPPROTO_SCTP) { 882 - if (state != IP_VS_SCTP_S_ESTABLISHED) 883 - flags |= IP_VS_CONN_F_INACTIVE; 884 - else 885 - flags &= ~IP_VS_CONN_F_INACTIVE; 886 - } 887 745 cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); 888 746 if (dest) 889 747 atomic_dec(&dest->refcnt); ··· 880 762 kfree(param->pe_data); 881 763 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 882 764 return; 883 - } 884 - } else if (!cp->dest) { 885 - dest = ip_vs_try_bind_dest(cp); 886 - if (dest) 887 - atomic_dec(&dest->refcnt); 888 - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && 889 - (cp->state != state)) { 890 - /* update active/inactive flag for the connection */ 891 - dest = cp->dest; 892 - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 893 - (state != IP_VS_TCP_S_ESTABLISHED)) { 894 - atomic_dec(&dest->activeconns); 895 - atomic_inc(&dest->inactconns); 896 - cp->flags |= IP_VS_CONN_F_INACTIVE; 897 - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && 898 - (state == IP_VS_TCP_S_ESTABLISHED)) { 899 - atomic_inc(&dest->activeconns); 900 - atomic_dec(&dest->inactconns); 901 - cp->flags &= ~IP_VS_CONN_F_INACTIVE; 902 - } 903 - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && 904 - (cp->state != state)) { 905 - dest = cp->dest; 906 - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 907 - (state != IP_VS_SCTP_S_ESTABLISHED)) { 908 - atomic_dec(&dest->activeconns); 909 - atomic_inc(&dest->inactconns); 910 - cp->flags &= ~IP_VS_CONN_F_INACTIVE; 911 765 } 912 766 } 913 
767 ··· 1239 1149 1240 1150 1241 1151 /* 1152 + * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1153 + */ 1154 + static void set_sock_size(struct sock *sk, int mode, int val) 1155 + { 1156 + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1157 + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1158 + lock_sock(sk); 1159 + if (mode) { 1160 + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1161 + sysctl_wmem_max); 1162 + sk->sk_sndbuf = val * 2; 1163 + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1164 + } else { 1165 + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1166 + sysctl_rmem_max); 1167 + sk->sk_rcvbuf = val * 2; 1168 + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1169 + } 1170 + release_sock(sk); 1171 + } 1172 + 1173 + /* 1242 1174 * Setup loopback of outgoing multicasts on a sending socket 1243 1175 */ 1244 1176 static void set_mcast_loop(struct sock *sk, u_char loop) ··· 1410 1298 /* 1411 1299 * Set up sending multicast socket over UDP 1412 1300 */ 1413 - static struct socket *make_send_sock(struct net *net) 1301 + static struct socket *make_send_sock(struct net *net, int id) 1414 1302 { 1415 1303 struct netns_ipvs *ipvs = net_ipvs(net); 1304 + /* multicast addr */ 1305 + struct sockaddr_in mcast_addr = { 1306 + .sin_family = AF_INET, 1307 + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), 1308 + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 1309 + }; 1416 1310 struct socket *sock; 1417 1311 int result; 1418 1312 ··· 1442 1324 1443 1325 set_mcast_loop(sock->sk, 0); 1444 1326 set_mcast_ttl(sock->sk, 1); 1327 + result = sysctl_sync_sock_size(ipvs); 1328 + if (result > 0) 1329 + set_sock_size(sock->sk, 1, result); 1445 1330 1446 1331 result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); 1447 1332 if (result < 0) { ··· 1470 1349 /* 1471 1350 * Set up receiving multicast socket over UDP 1472 1351 */ 1473 - static struct socket *make_receive_sock(struct net *net) 1352 + static struct socket *make_receive_sock(struct net *net, int id) 
1474 1353 { 1475 1354 struct netns_ipvs *ipvs = net_ipvs(net); 1355 + /* multicast addr */ 1356 + struct sockaddr_in mcast_addr = { 1357 + .sin_family = AF_INET, 1358 + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), 1359 + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 1360 + }; 1476 1361 struct socket *sock; 1477 1362 int result; 1478 1363 ··· 1496 1369 sk_change_net(sock->sk, net); 1497 1370 /* it is equivalent to the REUSEADDR option in user-space */ 1498 1371 sock->sk->sk_reuse = SK_CAN_REUSE; 1372 + result = sysctl_sync_sock_size(ipvs); 1373 + if (result > 0) 1374 + set_sock_size(sock->sk, 0, result); 1499 1375 1500 1376 result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, 1501 1377 sizeof(struct sockaddr)); ··· 1541 1411 return len; 1542 1412 } 1543 1413 1544 - static void 1414 + static int 1545 1415 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1546 1416 { 1547 1417 int msize; 1418 + int ret; 1548 1419 1549 1420 msize = msg->size; 1550 1421 1551 1422 /* Put size in network byte order */ 1552 1423 msg->size = htons(msg->size); 1553 1424 1554 - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) 1555 - pr_err("ip_vs_send_async error\n"); 1425 + ret = ip_vs_send_async(sock, (char *)msg, msize); 1426 + if (ret >= 0 || ret == -EAGAIN) 1427 + return ret; 1428 + pr_err("ip_vs_send_async error %d\n", ret); 1429 + return 0; 1556 1430 } 1557 1431 1558 1432 static int ··· 1572 1438 iov.iov_base = buffer; 1573 1439 iov.iov_len = (size_t)buflen; 1574 1440 1575 - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); 1441 + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); 1576 1442 1577 1443 if (len < 0) 1578 - return -1; 1444 + return len; 1579 1445 1580 1446 LeaveFunction(7); 1581 1447 return len; 1582 1448 } 1583 1449 1450 + /* Wakeup the master thread for sending */ 1451 + static void master_wakeup_work_handler(struct work_struct *work) 1452 + { 1453 + struct ipvs_master_sync_state *ms = 1454 + 
container_of(work, struct ipvs_master_sync_state, 1455 + master_wakeup_work.work); 1456 + struct netns_ipvs *ipvs = ms->ipvs; 1457 + 1458 + spin_lock_bh(&ipvs->sync_lock); 1459 + if (ms->sync_queue_len && 1460 + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1461 + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1462 + wake_up_process(ms->master_thread); 1463 + } 1464 + spin_unlock_bh(&ipvs->sync_lock); 1465 + } 1466 + 1467 + /* Get next buffer to send */ 1468 + static inline struct ip_vs_sync_buff * 1469 + next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1470 + { 1471 + struct ip_vs_sync_buff *sb; 1472 + 1473 + sb = sb_dequeue(ipvs, ms); 1474 + if (sb) 1475 + return sb; 1476 + /* Do not delay entries in buffer for more than 2 seconds */ 1477 + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1478 + } 1584 1479 1585 1480 static int sync_thread_master(void *data) 1586 1481 { 1587 1482 struct ip_vs_sync_thread_data *tinfo = data; 1588 1483 struct netns_ipvs *ipvs = net_ipvs(tinfo->net); 1484 + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1485 + struct sock *sk = tinfo->sock->sk; 1589 1486 struct ip_vs_sync_buff *sb; 1590 1487 1591 1488 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1592 - "syncid = %d\n", 1593 - ipvs->master_mcast_ifn, ipvs->master_syncid); 1489 + "syncid = %d, id = %d\n", 1490 + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); 1594 1491 1595 - while (!kthread_should_stop()) { 1596 - while ((sb = sb_dequeue(ipvs))) { 1597 - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); 1598 - ip_vs_sync_buff_release(sb); 1492 + for (;;) { 1493 + sb = next_sync_buff(ipvs, ms); 1494 + if (unlikely(kthread_should_stop())) 1495 + break; 1496 + if (!sb) { 1497 + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1498 + continue; 1599 1499 } 1500 + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1501 + int ret = 0; 1600 1502 1601 - /* check if entries stay in ipvs->sync_buff for 2 seconds */ 1602 - sb = 
get_curr_sync_buff(ipvs, 2 * HZ); 1603 - if (sb) { 1604 - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); 1605 - ip_vs_sync_buff_release(sb); 1503 + __wait_event_interruptible(*sk_sleep(sk), 1504 + sock_writeable(sk) || 1505 + kthread_should_stop(), 1506 + ret); 1507 + if (unlikely(kthread_should_stop())) 1508 + goto done; 1606 1509 } 1607 - 1608 - schedule_timeout_interruptible(HZ); 1510 + ip_vs_sync_buff_release(sb); 1609 1511 } 1610 1512 1611 - /* clean up the sync_buff queue */ 1612 - while ((sb = sb_dequeue(ipvs))) 1513 + done: 1514 + __set_current_state(TASK_RUNNING); 1515 + if (sb) 1613 1516 ip_vs_sync_buff_release(sb); 1614 1517 1518 + /* clean up the sync_buff queue */ 1519 + while ((sb = sb_dequeue(ipvs, ms))) 1520 + ip_vs_sync_buff_release(sb); 1521 + __set_current_state(TASK_RUNNING); 1522 + 1615 1523 /* clean up the current sync_buff */ 1616 - sb = get_curr_sync_buff(ipvs, 0); 1524 + sb = get_curr_sync_buff(ipvs, ms, 0); 1617 1525 if (sb) 1618 1526 ip_vs_sync_buff_release(sb); 1619 1527 ··· 1674 1498 int len; 1675 1499 1676 1500 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1677 - "syncid = %d\n", 1678 - ipvs->backup_mcast_ifn, ipvs->backup_syncid); 1501 + "syncid = %d, id = %d\n", 1502 + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); 1679 1503 1680 1504 while (!kthread_should_stop()) { 1681 1505 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), ··· 1687 1511 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1688 1512 ipvs->recv_mesg_maxlen); 1689 1513 if (len <= 0) { 1690 - pr_err("receiving message error\n"); 1514 + if (len != -EAGAIN) 1515 + pr_err("receiving message error\n"); 1691 1516 break; 1692 1517 } 1693 1518 ··· 1712 1535 int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) 1713 1536 { 1714 1537 struct ip_vs_sync_thread_data *tinfo; 1715 - struct task_struct **realtask, *task; 1538 + struct task_struct **array = NULL, *task; 1716 1539 struct socket *sock; 1717 1540 struct netns_ipvs *ipvs = 
net_ipvs(net); 1718 - char *name, *buf = NULL; 1541 + char *name; 1719 1542 int (*threadfn)(void *data); 1543 + int id, count; 1720 1544 int result = -ENOMEM; 1721 1545 1722 1546 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1723 1547 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", 1724 1548 sizeof(struct ip_vs_sync_conn_v0)); 1725 1549 1550 + if (!ipvs->sync_state) { 1551 + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1552 + ipvs->threads_mask = count - 1; 1553 + } else 1554 + count = ipvs->threads_mask + 1; 1726 1555 1727 1556 if (state == IP_VS_STATE_MASTER) { 1728 - if (ipvs->master_thread) 1557 + if (ipvs->ms) 1729 1558 return -EEXIST; 1730 1559 1731 1560 strlcpy(ipvs->master_mcast_ifn, mcast_ifn, 1732 1561 sizeof(ipvs->master_mcast_ifn)); 1733 1562 ipvs->master_syncid = syncid; 1734 - realtask = &ipvs->master_thread; 1735 - name = "ipvs_master:%d"; 1563 + name = "ipvs-m:%d:%d"; 1736 1564 threadfn = sync_thread_master; 1737 - sock = make_send_sock(net); 1738 1565 } else if (state == IP_VS_STATE_BACKUP) { 1739 - if (ipvs->backup_thread) 1566 + if (ipvs->backup_threads) 1740 1567 return -EEXIST; 1741 1568 1742 1569 strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, 1743 1570 sizeof(ipvs->backup_mcast_ifn)); 1744 1571 ipvs->backup_syncid = syncid; 1745 - realtask = &ipvs->backup_thread; 1746 - name = "ipvs_backup:%d"; 1572 + name = "ipvs-b:%d:%d"; 1747 1573 threadfn = sync_thread_backup; 1748 - sock = make_receive_sock(net); 1749 1574 } else { 1750 1575 return -EINVAL; 1751 1576 } 1752 1577 1753 - if (IS_ERR(sock)) { 1754 - result = PTR_ERR(sock); 1755 - goto out; 1756 - } 1578 + if (state == IP_VS_STATE_MASTER) { 1579 + struct ipvs_master_sync_state *ms; 1757 1580 1581 + ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); 1582 + if (!ipvs->ms) 1583 + goto out; 1584 + ms = ipvs->ms; 1585 + for (id = 0; id < count; id++, ms++) { 1586 + INIT_LIST_HEAD(&ms->sync_queue); 1587 + ms->sync_queue_len = 0; 1588 + 
ms->sync_queue_delay = 0; 1589 + INIT_DELAYED_WORK(&ms->master_wakeup_work, 1590 + master_wakeup_work_handler); 1591 + ms->ipvs = ipvs; 1592 + } 1593 + } else { 1594 + array = kzalloc(count * sizeof(struct task_struct *), 1595 + GFP_KERNEL); 1596 + if (!array) 1597 + goto out; 1598 + } 1758 1599 set_sync_mesg_maxlen(net, state); 1759 - if (state == IP_VS_STATE_BACKUP) { 1760 - buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); 1761 - if (!buf) 1600 + 1601 + tinfo = NULL; 1602 + for (id = 0; id < count; id++) { 1603 + if (state == IP_VS_STATE_MASTER) 1604 + sock = make_send_sock(net, id); 1605 + else 1606 + sock = make_receive_sock(net, id); 1607 + if (IS_ERR(sock)) { 1608 + result = PTR_ERR(sock); 1609 + goto outtinfo; 1610 + } 1611 + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 1612 + if (!tinfo) 1762 1613 goto outsocket; 1763 - } 1614 + tinfo->net = net; 1615 + tinfo->sock = sock; 1616 + if (state == IP_VS_STATE_BACKUP) { 1617 + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, 1618 + GFP_KERNEL); 1619 + if (!tinfo->buf) 1620 + goto outtinfo; 1621 + } 1622 + tinfo->id = id; 1764 1623 1765 - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 1766 - if (!tinfo) 1767 - goto outbuf; 1768 - 1769 - tinfo->net = net; 1770 - tinfo->sock = sock; 1771 - tinfo->buf = buf; 1772 - 1773 - task = kthread_run(threadfn, tinfo, name, ipvs->gen); 1774 - if (IS_ERR(task)) { 1775 - result = PTR_ERR(task); 1776 - goto outtinfo; 1624 + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1625 + if (IS_ERR(task)) { 1626 + result = PTR_ERR(task); 1627 + goto outtinfo; 1628 + } 1629 + tinfo = NULL; 1630 + if (state == IP_VS_STATE_MASTER) 1631 + ipvs->ms[id].master_thread = task; 1632 + else 1633 + array[id] = task; 1777 1634 } 1778 1635 1779 1636 /* mark as active */ 1780 - *realtask = task; 1637 + 1638 + if (state == IP_VS_STATE_BACKUP) 1639 + ipvs->backup_threads = array; 1640 + spin_lock_bh(&ipvs->sync_buff_lock); 1781 1641 ipvs->sync_state |= state; 1642 + 
spin_unlock_bh(&ipvs->sync_buff_lock); 1782 1643 1783 1644 /* increase the module use count */ 1784 1645 ip_vs_use_count_inc(); 1785 1646 1786 1647 return 0; 1787 1648 1788 - outtinfo: 1789 - kfree(tinfo); 1790 - outbuf: 1791 - kfree(buf); 1792 1649 outsocket: 1793 1650 sk_release_kernel(sock->sk); 1651 + 1652 + outtinfo: 1653 + if (tinfo) { 1654 + sk_release_kernel(tinfo->sock->sk); 1655 + kfree(tinfo->buf); 1656 + kfree(tinfo); 1657 + } 1658 + count = id; 1659 + while (count-- > 0) { 1660 + if (state == IP_VS_STATE_MASTER) 1661 + kthread_stop(ipvs->ms[count].master_thread); 1662 + else 1663 + kthread_stop(array[count]); 1664 + } 1665 + kfree(array); 1666 + 1794 1667 out: 1668 + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1669 + kfree(ipvs->ms); 1670 + ipvs->ms = NULL; 1671 + } 1795 1672 return result; 1796 1673 } 1797 1674 ··· 1853 1622 int stop_sync_thread(struct net *net, int state) 1854 1623 { 1855 1624 struct netns_ipvs *ipvs = net_ipvs(net); 1625 + struct task_struct **array; 1626 + int id; 1856 1627 int retc = -EINVAL; 1857 1628 1858 1629 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1859 1630 1860 1631 if (state == IP_VS_STATE_MASTER) { 1861 - if (!ipvs->master_thread) 1632 + if (!ipvs->ms) 1862 1633 return -ESRCH; 1863 - 1864 - pr_info("stopping master sync thread %d ...\n", 1865 - task_pid_nr(ipvs->master_thread)); 1866 1634 1867 1635 /* 1868 1636 * The lock synchronizes with sb_queue_tail(), so that we don't ··· 1869 1639 * progress of stopping the master sync daemon. 
1870 1640 */ 1871 1641 1872 - spin_lock_bh(&ipvs->sync_lock); 1642 + spin_lock_bh(&ipvs->sync_buff_lock); 1643 + spin_lock(&ipvs->sync_lock); 1873 1644 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1874 - spin_unlock_bh(&ipvs->sync_lock); 1875 - retc = kthread_stop(ipvs->master_thread); 1876 - ipvs->master_thread = NULL; 1645 + spin_unlock(&ipvs->sync_lock); 1646 + spin_unlock_bh(&ipvs->sync_buff_lock); 1647 + 1648 + retc = 0; 1649 + for (id = ipvs->threads_mask; id >= 0; id--) { 1650 + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1651 + int ret; 1652 + 1653 + pr_info("stopping master sync thread %d ...\n", 1654 + task_pid_nr(ms->master_thread)); 1655 + cancel_delayed_work_sync(&ms->master_wakeup_work); 1656 + ret = kthread_stop(ms->master_thread); 1657 + if (retc >= 0) 1658 + retc = ret; 1659 + } 1660 + kfree(ipvs->ms); 1661 + ipvs->ms = NULL; 1877 1662 } else if (state == IP_VS_STATE_BACKUP) { 1878 - if (!ipvs->backup_thread) 1663 + if (!ipvs->backup_threads) 1879 1664 return -ESRCH; 1880 1665 1881 - pr_info("stopping backup sync thread %d ...\n", 1882 - task_pid_nr(ipvs->backup_thread)); 1883 - 1884 1666 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1885 - retc = kthread_stop(ipvs->backup_thread); 1886 - ipvs->backup_thread = NULL; 1667 + array = ipvs->backup_threads; 1668 + retc = 0; 1669 + for (id = ipvs->threads_mask; id >= 0; id--) { 1670 + int ret; 1671 + 1672 + pr_info("stopping backup sync thread %d ...\n", 1673 + task_pid_nr(array[id])); 1674 + ret = kthread_stop(array[id]); 1675 + if (retc >= 0) 1676 + retc = ret; 1677 + } 1678 + kfree(array); 1679 + ipvs->backup_threads = NULL; 1887 1680 } 1888 1681 1889 1682 /* decrease the module use count */ ··· 1923 1670 struct netns_ipvs *ipvs = net_ipvs(net); 1924 1671 1925 1672 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 1926 - INIT_LIST_HEAD(&ipvs->sync_queue); 1927 1673 spin_lock_init(&ipvs->sync_lock); 1928 1674 spin_lock_init(&ipvs->sync_buff_lock); 1929 - 1930 - 
ipvs->sync_mcast_addr.sin_family = AF_INET; 1931 - ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); 1932 - ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); 1933 1675 return 0; 1934 1676 } 1935 1677
+1 -1
net/netfilter/ipvs/ip_vs_wrr.c
··· 84 84 /* 85 85 * Allocate the mark variable for WRR scheduling 86 86 */ 87 - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); 87 + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); 88 88 if (mark == NULL) 89 89 return -ENOMEM; 90 90
+7 -8
net/netfilter/nf_conntrack_core.c
··· 1336 1336 while (untrack_refs() > 0) 1337 1337 schedule(); 1338 1338 1339 - nf_conntrack_helper_fini(); 1340 1339 nf_conntrack_proto_fini(); 1341 1340 #ifdef CONFIG_NF_CONNTRACK_ZONES 1342 1341 nf_ct_extend_unregister(&nf_ct_zone_extend); ··· 1353 1354 } 1354 1355 1355 1356 nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); 1357 + nf_conntrack_helper_fini(net); 1356 1358 nf_conntrack_timeout_fini(net); 1357 1359 nf_conntrack_ecache_fini(net); 1358 1360 nf_conntrack_tstamp_fini(net); ··· 1504 1504 if (ret < 0) 1505 1505 goto err_proto; 1506 1506 1507 - ret = nf_conntrack_helper_init(); 1508 - if (ret < 0) 1509 - goto err_helper; 1510 - 1511 1507 #ifdef CONFIG_NF_CONNTRACK_ZONES 1512 1508 ret = nf_ct_extend_register(&nf_ct_zone_extend); 1513 1509 if (ret < 0) ··· 1521 1525 1522 1526 #ifdef CONFIG_NF_CONNTRACK_ZONES 1523 1527 err_extend: 1524 - nf_conntrack_helper_fini(); 1525 - #endif 1526 - err_helper: 1527 1528 nf_conntrack_proto_fini(); 1529 + #endif 1528 1530 err_proto: 1529 1531 return ret; 1530 1532 } ··· 1583 1589 ret = nf_conntrack_timeout_init(net); 1584 1590 if (ret < 0) 1585 1591 goto err_timeout; 1592 + ret = nf_conntrack_helper_init(net); 1593 + if (ret < 0) 1594 + goto err_helper; 1586 1595 1587 1596 return 0; 1588 1597 1598 + err_helper: 1599 + nf_conntrack_timeout_fini(net); 1589 1600 err_timeout: 1590 1601 nf_conntrack_ecache_fini(net); 1591 1602 err_ecache:
+4 -6
net/netfilter/nf_conntrack_ecache.c
··· 84 84 int nf_conntrack_register_notifier(struct net *net, 85 85 struct nf_ct_event_notifier *new) 86 86 { 87 - int ret = 0; 87 + int ret; 88 88 struct nf_ct_event_notifier *notify; 89 89 90 90 mutex_lock(&nf_ct_ecache_mutex); ··· 95 95 goto out_unlock; 96 96 } 97 97 rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); 98 - mutex_unlock(&nf_ct_ecache_mutex); 99 - return ret; 98 + ret = 0; 100 99 101 100 out_unlock: 102 101 mutex_unlock(&nf_ct_ecache_mutex); ··· 120 121 int nf_ct_expect_register_notifier(struct net *net, 121 122 struct nf_exp_event_notifier *new) 122 123 { 123 - int ret = 0; 124 + int ret; 124 125 struct nf_exp_event_notifier *notify; 125 126 126 127 mutex_lock(&nf_ct_ecache_mutex); ··· 131 132 goto out_unlock; 132 133 } 133 134 rcu_assign_pointer(net->ct.nf_expect_event_cb, new); 134 - mutex_unlock(&nf_ct_ecache_mutex); 135 - return ret; 135 + ret = 0; 136 136 137 137 out_unlock: 138 138 mutex_unlock(&nf_ct_ecache_mutex);
+110 -12
net/netfilter/nf_conntrack_helper.c
··· 34 34 static unsigned int nf_ct_helper_hsize __read_mostly; 35 35 static unsigned int nf_ct_helper_count __read_mostly; 36 36 37 + static bool nf_ct_auto_assign_helper __read_mostly = true; 38 + module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); 39 + MODULE_PARM_DESC(nf_conntrack_helper, 40 + "Enable automatic conntrack helper assignment (default 1)"); 41 + 42 + #ifdef CONFIG_SYSCTL 43 + static struct ctl_table helper_sysctl_table[] = { 44 + { 45 + .procname = "nf_conntrack_helper", 46 + .data = &init_net.ct.sysctl_auto_assign_helper, 47 + .maxlen = sizeof(unsigned int), 48 + .mode = 0644, 49 + .proc_handler = proc_dointvec, 50 + }, 51 + {} 52 + }; 53 + 54 + static int nf_conntrack_helper_init_sysctl(struct net *net) 55 + { 56 + struct ctl_table *table; 57 + 58 + table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table), 59 + GFP_KERNEL); 60 + if (!table) 61 + goto out; 62 + 63 + table[0].data = &net->ct.sysctl_auto_assign_helper; 64 + 65 + net->ct.helper_sysctl_header = 66 + register_net_sysctl(net, "net/netfilter", table); 67 + 68 + if (!net->ct.helper_sysctl_header) { 69 + pr_err("nf_conntrack_helper: can't register to sysctl.\n"); 70 + goto out_register; 71 + } 72 + return 0; 73 + 74 + out_register: 75 + kfree(table); 76 + out: 77 + return -ENOMEM; 78 + } 79 + 80 + static void nf_conntrack_helper_fini_sysctl(struct net *net) 81 + { 82 + struct ctl_table *table; 83 + 84 + table = net->ct.helper_sysctl_header->ctl_table_arg; 85 + unregister_net_sysctl_table(net->ct.helper_sysctl_header); 86 + kfree(table); 87 + } 88 + #else 89 + static int nf_conntrack_helper_init_sysctl(struct net *net) 90 + { 91 + return 0; 92 + } 93 + 94 + static void nf_conntrack_helper_fini_sysctl(struct net *net) 95 + { 96 + } 97 + #endif /* CONFIG_SYSCTL */ 37 98 38 99 /* Stupid hash, but collision free for the default registrations of the 39 100 * helpers currently in the kernel. 
*/ ··· 179 118 { 180 119 struct nf_conntrack_helper *helper = NULL; 181 120 struct nf_conn_help *help; 121 + struct net *net = nf_ct_net(ct); 182 122 int ret = 0; 123 + 124 + /* We already got a helper explicitly attached. The function 125 + * nf_conntrack_alter_reply - in case NAT is in use - asks for looking 126 + * the helper up again. Since now the user is in full control of 127 + * making consistent helper configurations, skip this automatic 128 + * re-lookup, otherwise we'll lose the helper. 129 + */ 130 + if (test_bit(IPS_HELPER_BIT, &ct->status)) 131 + return 0; 183 132 184 133 if (tmpl != NULL) { 185 134 help = nfct_help(tmpl); 186 - if (help != NULL) 135 + if (help != NULL) { 187 136 helper = help->helper; 137 + set_bit(IPS_HELPER_BIT, &ct->status); 138 + } 188 139 } 189 140 190 141 help = nfct_help(ct); 191 - if (helper == NULL) 142 + if (net->ct.sysctl_auto_assign_helper && helper == NULL) { 192 143 helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 144 + if (unlikely(!net->ct.auto_assign_helper_warned && helper)) { 145 + pr_info("nf_conntrack: automatic helper " 146 + "assignment is deprecated and it will " 147 + "be removed soon. 
Use the iptables CT target " 148 + "to attach helpers instead.\n"); 149 + net->ct.auto_assign_helper_warned = true; 150 + } 151 + } 152 + 193 153 if (helper == NULL) { 194 154 if (help) 195 155 RCU_INIT_POINTER(help->helper, NULL); ··· 397 315 .id = NF_CT_EXT_HELPER, 398 316 }; 399 317 400 - int nf_conntrack_helper_init(void) 318 + int nf_conntrack_helper_init(struct net *net) 401 319 { 402 320 int err; 403 321 404 - nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ 405 - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); 406 - if (!nf_ct_helper_hash) 407 - return -ENOMEM; 322 + net->ct.auto_assign_helper_warned = false; 323 + net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; 408 324 409 - err = nf_ct_extend_register(&helper_extend); 325 + if (net_eq(net, &init_net)) { 326 + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ 327 + nf_ct_helper_hash = 328 + nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); 329 + if (!nf_ct_helper_hash) 330 + return -ENOMEM; 331 + 332 + err = nf_ct_extend_register(&helper_extend); 333 + if (err < 0) 334 + goto err1; 335 + } 336 + 337 + err = nf_conntrack_helper_init_sysctl(net); 410 338 if (err < 0) 411 - goto err1; 339 + goto out_sysctl; 412 340 413 341 return 0; 414 342 343 + out_sysctl: 344 + if (net_eq(net, &init_net)) 345 + nf_ct_extend_unregister(&helper_extend); 415 346 err1: 416 347 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); 417 348 return err; 418 349 } 419 350 420 - void nf_conntrack_helper_fini(void) 351 + void nf_conntrack_helper_fini(struct net *net) 421 352 { 422 - nf_ct_extend_unregister(&helper_extend); 423 - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); 353 + nf_conntrack_helper_fini_sysctl(net); 354 + if (net_eq(net, &init_net)) { 355 + nf_ct_extend_unregister(&helper_extend); 356 + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); 357 + } 424 358 }
+9 -1
net/netfilter/nf_conntrack_netlink.c
··· 2080 2080 ctnetlink_change_expect(struct nf_conntrack_expect *x, 2081 2081 const struct nlattr * const cda[]) 2082 2082 { 2083 - return -EOPNOTSUPP; 2083 + if (cda[CTA_EXPECT_TIMEOUT]) { 2084 + if (!del_timer(&x->timeout)) 2085 + return -ETIME; 2086 + 2087 + x->timeout.expires = jiffies + 2088 + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; 2089 + add_timer(&x->timeout); 2090 + } 2091 + return 0; 2084 2092 } 2085 2093 2086 2094 static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
-13
security/selinux/nlmsgtab.c
··· 14 14 #include <linux/netlink.h> 15 15 #include <linux/rtnetlink.h> 16 16 #include <linux/if.h> 17 - #include <linux/netfilter_ipv4/ip_queue.h> 18 17 #include <linux/inet_diag.h> 19 18 #include <linux/xfrm.h> 20 19 #include <linux/audit.h> ··· 67 68 { RTM_GETADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_READ }, 68 69 { RTM_GETDCB, NETLINK_ROUTE_SOCKET__NLMSG_READ }, 69 70 { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, 70 - }; 71 - 72 - static struct nlmsg_perm nlmsg_firewall_perms[] = 73 - { 74 - { IPQM_MODE, NETLINK_FIREWALL_SOCKET__NLMSG_WRITE }, 75 - { IPQM_VERDICT, NETLINK_FIREWALL_SOCKET__NLMSG_WRITE }, 76 71 }; 77 72 78 73 static struct nlmsg_perm nlmsg_tcpdiag_perms[] = ··· 136 143 case SECCLASS_NETLINK_ROUTE_SOCKET: 137 144 err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, 138 145 sizeof(nlmsg_route_perms)); 139 - break; 140 - 141 - case SECCLASS_NETLINK_FIREWALL_SOCKET: 142 - case SECCLASS_NETLINK_IP6FW_SOCKET: 143 - err = nlmsg_perm(nlmsg_type, perm, nlmsg_firewall_perms, 144 - sizeof(nlmsg_firewall_perms)); 145 146 break; 146 147 147 148 case SECCLASS_NETLINK_TCPDIAG_SOCKET: