Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bonding: modify the old and add new xmit hash policies

This patch adds two new hash policy modes which use skb_flow_dissect:
3 - Encapsulated layer 2+3
4 - Encapsulated layer 3+4
These modes should yield a significant improvement in hash distribution for tunnel users.
It also changes the old hash functions to:
hash ^= (__force u32)flow.dst ^ (__force u32)flow.src;
hash ^= (hash >> 16);
hash ^= (hash >> 8);

Here, hash is initialized either to the L2 hash, i.e. SRCMAC[5] XOR
DSTMAC[5], or to flow->ports, which is extracted from the upper layer.
The flow's dst and src are likewise extracted according to the xmit
policy, either directly from the buffer or by using skb_flow_dissect;
in both cases, if the protocol is IPv6 then dst and src are obtained by
applying ipv6_addr_hash() to the real addresses. In the case of a
non-dissectable packet, the algorithms fall back to L2 hashing.
The bond_set_mode_ops() function is now obsolete and thus deleted,
because it was used only to set the proper hash policy. We also remove a
function pointer from struct bonding, since we no longer need to store
the hash function there: there is now a single hash function,
bond_xmit_hash, which operates based on bond->params.xmit_policy.

The hash function and skb_flow_dissect were suggested by Eric Dumazet.
The layer names were suggested by Andy Gospodarek, because I suck at
semantics.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Nikolay Aleksandrov and committed by
David S. Miller
32819dc1 357afe9c

+69 -131
+1 -1
drivers/net/bonding/bond_3ad.c
··· 2403 2403 goto out; 2404 2404 } 2405 2405 2406 - slave_agg_no = bond->xmit_hash_policy(skb, slaves_in_agg); 2406 + slave_agg_no = bond_xmit_hash(bond, skb, slaves_in_agg); 2407 2407 first_ok_slave = NULL; 2408 2408 2409 2409 bond_for_each_slave(bond, slave, iter) {
+65 -126
drivers/net/bonding/bond_main.c
··· 78 78 #include <net/netns/generic.h> 79 79 #include <net/pkt_sched.h> 80 80 #include <linux/rculist.h> 81 + #include <net/flow_keys.h> 81 82 #include "bonding.h" 82 83 #include "bond_3ad.h" 83 84 #include "bond_alb.h" ··· 160 159 module_param(xmit_hash_policy, charp, 0); 161 160 MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; " 162 161 "0 for layer 2 (default), 1 for layer 3+4, " 163 - "2 for layer 2+3"); 162 + "2 for layer 2+3, 3 for encap layer 2+3, " 163 + "4 for encap layer 3+4"); 164 164 module_param(arp_interval, int, 0); 165 165 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds"); 166 166 module_param_array(arp_ip_target, charp, NULL, 0); ··· 219 217 { "layer2", BOND_XMIT_POLICY_LAYER2}, 220 218 { "layer3+4", BOND_XMIT_POLICY_LAYER34}, 221 219 { "layer2+3", BOND_XMIT_POLICY_LAYER23}, 220 + { "encap2+3", BOND_XMIT_POLICY_ENCAP23}, 221 + { "encap3+4", BOND_XMIT_POLICY_ENCAP34}, 222 222 { NULL, -1}, 223 223 }; 224 224 ··· 3039 3035 3040 3036 /*---------------------------- Hashing Policies -----------------------------*/ 3041 3037 3042 - /* 3043 - * Hash for the output device based upon layer 2 data 3044 - */ 3045 - static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count) 3038 + /* L2 hash helper */ 3039 + static inline u32 bond_eth_hash(struct sk_buff *skb) 3046 3040 { 3047 3041 struct ethhdr *data = (struct ethhdr *)skb->data; 3048 3042 3049 3043 if (skb_headlen(skb) >= offsetof(struct ethhdr, h_proto)) 3050 - return (data->h_dest[5] ^ data->h_source[5]) % count; 3044 + return data->h_dest[5] ^ data->h_source[5]; 3051 3045 3052 3046 return 0; 3053 3047 } 3054 3048 3055 - /* 3056 - * Hash for the output device based upon layer 2 and layer 3 data. 
If 3057 - * the packet is not IP, fall back on bond_xmit_hash_policy_l2() 3058 - */ 3059 - static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count) 3049 + /* Extract the appropriate headers based on bond's xmit policy */ 3050 + static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, 3051 + struct flow_keys *fk) 3060 3052 { 3061 - const struct ethhdr *data; 3053 + const struct ipv6hdr *iph6; 3062 3054 const struct iphdr *iph; 3063 - const struct ipv6hdr *ipv6h; 3064 - u32 v6hash; 3065 - const __be32 *s, *d; 3055 + int noff, proto = -1; 3066 3056 3067 - if (skb->protocol == htons(ETH_P_IP) && 3068 - pskb_network_may_pull(skb, sizeof(*iph))) { 3057 + if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) 3058 + return skb_flow_dissect(skb, fk); 3059 + 3060 + fk->ports = 0; 3061 + noff = skb_network_offset(skb); 3062 + if (skb->protocol == htons(ETH_P_IP)) { 3063 + if (!pskb_may_pull(skb, noff + sizeof(*iph))) 3064 + return false; 3069 3065 iph = ip_hdr(skb); 3070 - data = (struct ethhdr *)skb->data; 3071 - return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^ 3072 - (data->h_dest[5] ^ data->h_source[5])) % count; 3073 - } else if (skb->protocol == htons(ETH_P_IPV6) && 3074 - pskb_network_may_pull(skb, sizeof(*ipv6h))) { 3075 - ipv6h = ipv6_hdr(skb); 3076 - data = (struct ethhdr *)skb->data; 3077 - s = &ipv6h->saddr.s6_addr32[0]; 3078 - d = &ipv6h->daddr.s6_addr32[0]; 3079 - v6hash = (s[1] ^ d[1]) ^ (s[2] ^ d[2]) ^ (s[3] ^ d[3]); 3080 - v6hash ^= (v6hash >> 24) ^ (v6hash >> 16) ^ (v6hash >> 8); 3081 - return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count; 3066 + fk->src = iph->saddr; 3067 + fk->dst = iph->daddr; 3068 + noff += iph->ihl << 2; 3069 + if (!ip_is_fragment(iph)) 3070 + proto = iph->protocol; 3071 + } else if (skb->protocol == htons(ETH_P_IPV6)) { 3072 + if (!pskb_may_pull(skb, noff + sizeof(*iph6))) 3073 + return false; 3074 + iph6 = ipv6_hdr(skb); 3075 + fk->src = (__force __be32)ipv6_addr_hash(&iph6->saddr); 3076 + 
fk->dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); 3077 + noff += sizeof(*iph6); 3078 + proto = iph6->nexthdr; 3079 + } else { 3080 + return false; 3082 3081 } 3082 + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) 3083 + fk->ports = skb_flow_get_ports(skb, noff, proto); 3083 3084 3084 - return bond_xmit_hash_policy_l2(skb, count); 3085 + return true; 3085 3086 } 3086 3087 3087 - /* 3088 - * Hash for the output device based upon layer 3 and layer 4 data. If 3089 - * the packet is a frag or not TCP or UDP, just use layer 3 data. If it is 3090 - * altogether not IP, fall back on bond_xmit_hash_policy_l2() 3088 + /** 3089 + * bond_xmit_hash - generate a hash value based on the xmit policy 3090 + * @bond: bonding device 3091 + * @skb: buffer to use for headers 3092 + * @count: modulo value 3093 + * 3094 + * This function will extract the necessary headers from the skb buffer and use 3095 + * them to generate a hash based on the xmit_policy set in the bonding device 3096 + * which will be reduced modulo count before returning. 
3091 3097 */ 3092 - static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count) 3098 + int bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, int count) 3093 3099 { 3094 - u32 layer4_xor = 0; 3095 - const struct iphdr *iph; 3096 - const struct ipv6hdr *ipv6h; 3097 - const __be32 *s, *d; 3098 - const __be16 *l4 = NULL; 3099 - __be16 _l4[2]; 3100 - int noff = skb_network_offset(skb); 3101 - int poff; 3100 + struct flow_keys flow; 3101 + u32 hash; 3102 3102 3103 - if (skb->protocol == htons(ETH_P_IP) && 3104 - pskb_may_pull(skb, noff + sizeof(*iph))) { 3105 - iph = ip_hdr(skb); 3106 - poff = proto_ports_offset(iph->protocol); 3103 + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 || 3104 + !bond_flow_dissect(bond, skb, &flow)) 3105 + return bond_eth_hash(skb) % count; 3107 3106 3108 - if (!ip_is_fragment(iph) && poff >= 0) { 3109 - l4 = skb_header_pointer(skb, noff + (iph->ihl << 2) + poff, 3110 - sizeof(_l4), &_l4); 3111 - if (l4) 3112 - layer4_xor = ntohs(l4[0] ^ l4[1]); 3113 - } 3114 - return (layer4_xor ^ 3115 - ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count; 3116 - } else if (skb->protocol == htons(ETH_P_IPV6) && 3117 - pskb_may_pull(skb, noff + sizeof(*ipv6h))) { 3118 - ipv6h = ipv6_hdr(skb); 3119 - poff = proto_ports_offset(ipv6h->nexthdr); 3120 - if (poff >= 0) { 3121 - l4 = skb_header_pointer(skb, noff + sizeof(*ipv6h) + poff, 3122 - sizeof(_l4), &_l4); 3123 - if (l4) 3124 - layer4_xor = ntohs(l4[0] ^ l4[1]); 3125 - } 3126 - s = &ipv6h->saddr.s6_addr32[0]; 3127 - d = &ipv6h->daddr.s6_addr32[0]; 3128 - layer4_xor ^= (s[1] ^ d[1]) ^ (s[2] ^ d[2]) ^ (s[3] ^ d[3]); 3129 - layer4_xor ^= (layer4_xor >> 24) ^ (layer4_xor >> 16) ^ 3130 - (layer4_xor >> 8); 3131 - return layer4_xor % count; 3132 - } 3107 + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || 3108 + bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) 3109 + hash = bond_eth_hash(skb); 3110 + else 3111 + hash = (__force u32)flow.ports; 3112 + hash ^= (__force 
u32)flow.dst ^ (__force u32)flow.src; 3113 + hash ^= (hash >> 16); 3114 + hash ^= (hash >> 8); 3133 3115 3134 - return bond_xmit_hash_policy_l2(skb, count); 3116 + return hash % count; 3135 3117 } 3136 3118 3137 3119 /*-------------------------- Device entry points ----------------------------*/ ··· 3711 3721 return NETDEV_TX_OK; 3712 3722 } 3713 3723 3714 - /* 3715 - * In bond_xmit_xor() , we determine the output device by using a pre- 3724 + /* In bond_xmit_xor() , we determine the output device by using a pre- 3716 3725 * determined xmit_hash_policy(), If the selected device is not enabled, 3717 3726 * find the next active slave. 3718 3727 */ ··· 3719 3730 { 3720 3731 struct bonding *bond = netdev_priv(bond_dev); 3721 3732 3722 - bond_xmit_slave_id(bond, skb, 3723 - bond->xmit_hash_policy(skb, bond->slave_cnt)); 3733 + bond_xmit_slave_id(bond, skb, bond_xmit_hash(bond, skb, bond->slave_cnt)); 3724 3734 3725 3735 return NETDEV_TX_OK; 3726 3736 } ··· 3755 3767 } 3756 3768 3757 3769 /*------------------------- Device initialization ---------------------------*/ 3758 - 3759 - static void bond_set_xmit_hash_policy(struct bonding *bond) 3760 - { 3761 - switch (bond->params.xmit_policy) { 3762 - case BOND_XMIT_POLICY_LAYER23: 3763 - bond->xmit_hash_policy = bond_xmit_hash_policy_l23; 3764 - break; 3765 - case BOND_XMIT_POLICY_LAYER34: 3766 - bond->xmit_hash_policy = bond_xmit_hash_policy_l34; 3767 - break; 3768 - case BOND_XMIT_POLICY_LAYER2: 3769 - default: 3770 - bond->xmit_hash_policy = bond_xmit_hash_policy_l2; 3771 - break; 3772 - } 3773 - } 3774 3770 3775 3771 /* 3776 3772 * Lookup the slave that corresponds to a qid ··· 3866 3894 return ret; 3867 3895 } 3868 3896 3869 - /* 3870 - * set bond mode specific net device operations 3871 - */ 3872 - void bond_set_mode_ops(struct bonding *bond, int mode) 3873 - { 3874 - struct net_device *bond_dev = bond->dev; 3875 - 3876 - switch (mode) { 3877 - case BOND_MODE_ROUNDROBIN: 3878 - break; 3879 - case 
BOND_MODE_ACTIVEBACKUP: 3880 - break; 3881 - case BOND_MODE_XOR: 3882 - bond_set_xmit_hash_policy(bond); 3883 - break; 3884 - case BOND_MODE_BROADCAST: 3885 - break; 3886 - case BOND_MODE_8023AD: 3887 - bond_set_xmit_hash_policy(bond); 3888 - break; 3889 - case BOND_MODE_ALB: 3890 - /* FALLTHRU */ 3891 - case BOND_MODE_TLB: 3892 - break; 3893 - default: 3894 - /* Should never happen, mode already checked */ 3895 - pr_err("%s: Error: Unknown bonding mode %d\n", 3896 - bond_dev->name, mode); 3897 - break; 3898 - } 3899 - } 3900 - 3901 3897 static int bond_ethtool_get_settings(struct net_device *bond_dev, 3902 3898 struct ethtool_cmd *ecmd) 3903 3899 { ··· 3967 4027 ether_setup(bond_dev); 3968 4028 bond_dev->netdev_ops = &bond_netdev_ops; 3969 4029 bond_dev->ethtool_ops = &bond_ethtool_ops; 3970 - bond_set_mode_ops(bond, bond->params.mode); 3971 4030 3972 4031 bond_dev->destructor = bond_destructor; 3973 4032
-2
drivers/net/bonding/bond_sysfs.c
··· 318 318 /* don't cache arp_validate between modes */ 319 319 bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; 320 320 bond->params.mode = new_value; 321 - bond_set_mode_ops(bond, bond->params.mode); 322 321 pr_info("%s: setting mode to %s (%d).\n", 323 322 bond->dev->name, bond_mode_tbl[new_value].modename, 324 323 new_value); ··· 357 358 ret = -EINVAL; 358 359 } else { 359 360 bond->params.xmit_policy = new_value; 360 - bond_set_mode_ops(bond, bond->params.mode); 361 361 pr_info("%s: setting xmit hash policy to %s (%d).\n", 362 362 bond->dev->name, 363 363 xmit_hashtype_tbl[new_value].modename, new_value);
+1 -2
drivers/net/bonding/bonding.h
··· 217 217 char proc_file_name[IFNAMSIZ]; 218 218 #endif /* CONFIG_PROC_FS */ 219 219 struct list_head bond_list; 220 - int (*xmit_hash_policy)(struct sk_buff *, int); 221 220 u16 rr_tx_counter; 222 221 struct ad_bond_info ad_info; 223 222 struct alb_bond_info alb_info; ··· 408 409 void bond_mii_monitor(struct work_struct *); 409 410 void bond_loadbalance_arp_mon(struct work_struct *); 410 411 void bond_activebackup_arp_mon(struct work_struct *); 411 - void bond_set_mode_ops(struct bonding *bond, int mode); 412 + int bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, int count); 412 413 int bond_parse_parm(const char *mode_arg, const struct bond_parm_tbl *tbl); 413 414 void bond_select_active_slave(struct bonding *bond); 414 415 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
+2
include/uapi/linux/if_bonding.h
··· 91 91 #define BOND_XMIT_POLICY_LAYER2 0 /* layer 2 (MAC only), default */ 92 92 #define BOND_XMIT_POLICY_LAYER34 1 /* layer 3+4 (IP ^ (TCP || UDP)) */ 93 93 #define BOND_XMIT_POLICY_LAYER23 2 /* layer 2+3 (IP ^ MAC) */ 94 + #define BOND_XMIT_POLICY_ENCAP23 3 /* encapsulated layer 2+3 */ 95 + #define BOND_XMIT_POLICY_ENCAP34 4 /* encapsulated layer 3+4 */ 94 96 95 97 typedef struct ifbond { 96 98 __s32 bond_mode;