Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tipc: improve link resiliency when rps is activated

Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.

To reduce the risk of this happening, we exempt probe and probe reply
packets from the convergence to one core per source node. Instead, we do
the opposite: we try to spread those packets across as many cores as
possible, by randomizing the flow selector key.

To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.

It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jon Maloy and committed by
David S. Miller
8d6e79d3 141f575f

+108 -32
+6 -6
include/net/flow_dissector.h
··· 84 84 }; 85 85 86 86 /** 87 - * struct flow_dissector_key_tipc_addrs: 88 - * @srcnode: source node address 87 + * struct flow_dissector_key_tipc: 88 + * @key: source node address combined with selector 89 89 */ 90 - struct flow_dissector_key_tipc_addrs { 91 - __be32 srcnode; 90 + struct flow_dissector_key_tipc { 91 + __be32 key; 92 92 }; 93 93 94 94 /** ··· 100 100 union { 101 101 struct flow_dissector_key_ipv4_addrs v4addrs; 102 102 struct flow_dissector_key_ipv6_addrs v6addrs; 103 - struct flow_dissector_key_tipc_addrs tipcaddrs; 103 + struct flow_dissector_key_tipc tipckey; 104 104 }; 105 105 }; 106 106 ··· 192 192 FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ 193 193 FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ 194 194 FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ 195 - FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ 195 + FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ 196 196 FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ 197 197 FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ 198 198 FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
+62
include/net/tipc.h
··· 1 + /* 2 + * include/net/tipc.h: Include file for TIPC message header routines 3 + * 4 + * Copyright (c) 2017 Ericsson AB 5 + * All rights reserved. 6 + * 7 + * Redistribution and use in source and binary forms, with or without 8 + * modification, are permitted provided that the following conditions are met: 9 + * 10 + * 1. Redistributions of source code must retain the above copyright 11 + * notice, this list of conditions and the following disclaimer. 12 + * 2. Redistributions in binary form must reproduce the above copyright 13 + * notice, this list of conditions and the following disclaimer in the 14 + * documentation and/or other materials provided with the distribution. 15 + * 3. Neither the names of the copyright holders nor the names of its 16 + * contributors may be used to endorse or promote products derived from 17 + * this software without specific prior written permission. 18 + * 19 + * Alternatively, this software may be distributed under the terms of the 20 + * GNU General Public License ("GPL") version 2 as published by the Free 21 + * Software Foundation. 22 + * 23 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 27 + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 + * POSSIBILITY OF SUCH DAMAGE. 
34 + */ 35 + 36 + #ifndef _TIPC_HDR_H 37 + #define _TIPC_HDR_H 38 + 39 + #include <linux/random.h> 40 + 41 + #define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ 42 + 43 + struct tipc_basic_hdr { 44 + __be32 w[4]; 45 + }; 46 + 47 + static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) 48 + { 49 + u32 w0 = ntohl(hdr->w[0]); 50 + bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; 51 + int key; 52 + 53 + /* Return source node identity as key */ 54 + if (likely(!keepalive_msg)) 55 + return hdr->w[3]; 56 + 57 + /* Spread PROBE/PROBE_REPLY messages across the cores */ 58 + get_random_bytes(&key, sizeof(key)); 59 + return key; 60 + } 61 + 62 + #endif
+15 -15
net/core/flow_dissector.c
··· 10 10 #include <net/ipv6.h> 11 11 #include <net/gre.h> 12 12 #include <net/pptp.h> 13 + #include <net/tipc.h> 13 14 #include <linux/igmp.h> 14 15 #include <linux/icmp.h> 15 16 #include <linux/sctp.h> ··· 773 772 break; 774 773 } 775 774 case htons(ETH_P_TIPC): { 776 - struct { 777 - __be32 pre[3]; 778 - __be32 srcnode; 779 - } *hdr, _hdr; 780 - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); 775 + struct tipc_basic_hdr *hdr, _hdr; 776 + 777 + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), 778 + data, hlen, &_hdr); 781 779 if (!hdr) { 782 780 fdret = FLOW_DISSECT_RET_OUT_BAD; 783 781 break; 784 782 } 785 783 786 784 if (dissector_uses_key(flow_dissector, 787 - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { 785 + FLOW_DISSECTOR_KEY_TIPC)) { 788 786 key_addrs = skb_flow_dissector_target(flow_dissector, 789 - FLOW_DISSECTOR_KEY_TIPC_ADDRS, 787 + FLOW_DISSECTOR_KEY_TIPC, 790 788 target_container); 791 - key_addrs->tipcaddrs.srcnode = hdr->srcnode; 792 - key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; 789 + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); 790 + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; 793 791 } 794 792 fdret = FLOW_DISSECT_RET_OUT_GOOD; 795 793 break; ··· 1024 1024 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 1025 1025 diff -= sizeof(flow->addrs.v6addrs); 1026 1026 break; 1027 - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: 1028 - diff -= sizeof(flow->addrs.tipcaddrs); 1027 + case FLOW_DISSECTOR_KEY_TIPC: 1028 + diff -= sizeof(flow->addrs.tipckey); 1029 1029 break; 1030 1030 } 1031 1031 return (sizeof(*flow) - diff) / sizeof(u32); ··· 1039 1039 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 1040 1040 return (__force __be32)ipv6_addr_hash( 1041 1041 &flow->addrs.v6addrs.src); 1042 - case FLOW_DISSECTOR_KEY_TIPC_ADDRS: 1043 - return flow->addrs.tipcaddrs.srcnode; 1042 + case FLOW_DISSECTOR_KEY_TIPC: 1043 + return flow->addrs.tipckey.key; 1044 1044 default: 1045 1045 return 0; 1046 1046 } ··· 1321 1321 .offset = offsetof(struct flow_keys, 
addrs.v6addrs), 1322 1322 }, 1323 1323 { 1324 - .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, 1325 - .offset = offsetof(struct flow_keys, addrs.tipcaddrs), 1324 + .key_id = FLOW_DISSECTOR_KEY_TIPC, 1325 + .offset = offsetof(struct flow_keys, addrs.tipckey), 1326 1326 }, 1327 1327 { 1328 1328 .key_id = FLOW_DISSECTOR_KEY_PORTS,
+15 -11
net/tipc/link.c
··· 239 239 static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, 240 240 struct sk_buff_head *xmitq); 241 241 static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, 242 - u16 rcvgap, int tolerance, int priority, 242 + bool probe_reply, u16 rcvgap, 243 + int tolerance, int priority, 243 244 struct sk_buff_head *xmitq); 244 245 static void link_print(struct tipc_link *l, const char *str); 245 246 static int tipc_link_build_nack_msg(struct tipc_link *l, ··· 774 773 } 775 774 776 775 if (state || probe || setup) 777 - tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, xmitq); 776 + tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, 0, xmitq); 778 777 779 778 return rc; 780 779 } ··· 1175 1174 /* Unicast ACK */ 1176 1175 l->rcv_unacked = 0; 1177 1176 l->stats.sent_acks++; 1178 - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); 1177 + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); 1179 1178 return 0; 1180 1179 } 1181 1180 ··· 1189 1188 if (l->state == LINK_ESTABLISHING) 1190 1189 mtyp = ACTIVATE_MSG; 1191 1190 1192 - tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); 1191 + tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, 0, xmitq); 1193 1192 1194 1193 /* Inform peer that this endpoint is going down if applicable */ 1195 1194 skb = skb_peek_tail(xmitq); ··· 1216 1215 } 1217 1216 1218 1217 if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) 1219 - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); 1218 + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); 1220 1219 return 0; 1221 1220 } 1222 1221 ··· 1290 1289 } 1291 1290 1292 1291 static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, 1293 - u16 rcvgap, int tolerance, int priority, 1292 + bool probe_reply, u16 rcvgap, 1293 + int tolerance, int priority, 1294 1294 struct sk_buff_head *xmitq) 1295 1295 { 1296 1296 struct tipc_link *bcl = l->bc_rcvlink; ··· 1339 1337 msg_set_seq_gap(hdr, 
rcvgap); 1340 1338 msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); 1341 1339 msg_set_probe(hdr, probe); 1340 + msg_set_is_keepalive(hdr, probe || probe_reply); 1342 1341 tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); 1343 1342 msg_set_size(hdr, INT_H_SIZE + dlen); 1344 1343 skb_trim(skb, INT_H_SIZE + dlen); ··· 1445 1442 u16 rcv_nxt = l->rcv_nxt; 1446 1443 u16 dlen = msg_data_sz(hdr); 1447 1444 int mtyp = msg_type(hdr); 1445 + bool reply = msg_probe(hdr); 1448 1446 void *data; 1449 1447 char *if_name; 1450 1448 int rc = 0; ··· 1532 1528 /* Send NACK if peer has sent pkts we haven't received yet */ 1533 1529 if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) 1534 1530 rcvgap = peers_snd_nxt - l->rcv_nxt; 1535 - if (rcvgap || (msg_probe(hdr))) 1536 - tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, 1537 - 0, 0, xmitq); 1531 + if (rcvgap || reply) 1532 + tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, 1533 + rcvgap, 0, 0, xmitq); 1538 1534 tipc_link_release_pkts(l, ack); 1539 1535 1540 1536 /* If NACK, retransmit will now start at right position */ ··· 2126 2122 struct sk_buff_head *xmitq) 2127 2123 { 2128 2124 l->tolerance = tol; 2129 - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); 2125 + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); 2130 2126 } 2131 2127 2132 2128 void tipc_link_set_prio(struct tipc_link *l, u32 prio, 2133 2129 struct sk_buff_head *xmitq) 2134 2130 { 2135 2131 l->priority = prio; 2136 - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); 2132 + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, prio, xmitq); 2137 2133 } 2138 2134 2139 2135 void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit)
+10
net/tipc/msg.h
··· 226 226 msg_set_bits(m, 0, 19, 1, d); 227 227 } 228 228 229 + static inline int msg_is_keepalive(struct tipc_msg *m) 230 + { 231 + return msg_bits(m, 0, 19, 1); 232 + } 233 + 234 + static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) 235 + { 236 + msg_set_bits(m, 0, 19, 1, d); 237 + } 238 + 229 239 static inline int msg_src_droppable(struct tipc_msg *m) 230 240 { 231 241 return msg_bits(m, 0, 18, 1);