Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'rds-ha-failover-fixes'

Sowmini Varadhan says:

====================
RDS: TCP: HA/Failover fixes

This series contains a set of fixes for bugs exposed when
we ran the following in a loop between a test machine pair:

while (1); do
# modprobe rds-tcp on test nodes
# run rds-stress in bi-dir mode between test machine pair
# modprobe -r rds-tcp on test nodes
done

rds-stress in bi-dir mode will cause both nodes to initiate
RDS-TCP connections at almost the same instant, exposing the
bugs fixed in this series.

Without the fixes, rds-stress reports sporadic packet drops,
and packets arriving out of sequence. After the fixes, we have
been able to run the test overnight, without any issues.

Each patch has a detailed description of the root-cause fixed
by the patch.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+86 -21
+4
net/rds/af_rds.c
··· 605 605 } 606 606 module_exit(rds_exit); 607 607 608 + u32 rds_gen_num; 609 + 608 610 static int rds_init(void) 609 611 { 610 612 int ret; 613 + 614 + net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); 611 615 612 616 ret = rds_bind_lock_init(); 613 617 if (ret)
+3
net/rds/connection.c
··· 269 269 kmem_cache_free(rds_conn_slab, conn); 270 270 conn = found; 271 271 } else { 272 + conn->c_my_gen_num = rds_gen_num; 273 + conn->c_peer_gen_num = 0; 272 274 hlist_add_head_rcu(&conn->c_hash_node, head); 273 275 rds_cong_add_conn(conn); 274 276 rds_conn_count++; ··· 683 681 !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) 684 682 queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); 685 683 } 684 + EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); 686 685 687 686 void rds_conn_connect_if_down(struct rds_connection *conn) 688 687 {
+1
net/rds/message.c
··· 42 42 [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), 43 43 [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), 44 44 [RDS_EXTHDR_NPATHS] = sizeof(u16), 45 + [RDS_EXTHDR_GEN_NUM] = sizeof(u32), 45 46 }; 46 47 47 48
+7 -1
net/rds/rds.h
··· 151 151 152 152 struct rds_conn_path c_path[RDS_MPATH_WORKERS]; 153 153 wait_queue_head_t c_hs_waitq; /* handshake waitq */ 154 + 155 + u32 c_my_gen_num; 156 + u32 c_peer_gen_num; 154 157 }; 155 158 156 159 static inline ··· 246 243 /* Extension header announcing number of paths. 247 244 * Implicit length = 2 bytes. 248 245 */ 249 - #define RDS_EXTHDR_NPATHS 4 246 + #define RDS_EXTHDR_NPATHS 5 247 + #define RDS_EXTHDR_GEN_NUM 6 250 248 251 249 #define __RDS_EXTHDR_MAX 16 /* for now */ 252 250 ··· 342 338 #define RDS_MSG_RETRANSMITTED 5 343 339 #define RDS_MSG_MAPPED 6 344 340 #define RDS_MSG_PAGEVEC 7 341 + #define RDS_MSG_FLUSH 8 345 342 346 343 struct rds_message { 347 344 atomic_t m_refcount; ··· 669 664 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 670 665 671 666 /* conn.c */ 667 + extern u32 rds_gen_num; 672 668 int rds_conn_init(void); 673 669 void rds_conn_exit(void); 674 670 struct rds_connection *rds_conn_create(struct net *net,
+36
net/rds/recv.c
··· 120 120 /* do nothing if no change in cong state */ 121 121 } 122 122 123 + static void rds_conn_peer_gen_update(struct rds_connection *conn, 124 + u32 peer_gen_num) 125 + { 126 + int i; 127 + struct rds_message *rm, *tmp; 128 + unsigned long flags; 129 + 130 + WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); 131 + if (peer_gen_num != 0) { 132 + if (conn->c_peer_gen_num != 0 && 133 + peer_gen_num != conn->c_peer_gen_num) { 134 + for (i = 0; i < RDS_MPATH_WORKERS; i++) { 135 + struct rds_conn_path *cp; 136 + 137 + cp = &conn->c_path[i]; 138 + spin_lock_irqsave(&cp->cp_lock, flags); 139 + cp->cp_next_tx_seq = 1; 140 + cp->cp_next_rx_seq = 0; 141 + list_for_each_entry_safe(rm, tmp, 142 + &cp->cp_retrans, 143 + m_conn_item) { 144 + set_bit(RDS_MSG_FLUSH, &rm->m_flags); 145 + } 146 + spin_unlock_irqrestore(&cp->cp_lock, flags); 147 + } 148 + } 149 + conn->c_peer_gen_num = peer_gen_num; 150 + } 151 + } 152 + 123 153 /* 124 154 * Process all extension headers that come with this message. 125 155 */ ··· 193 163 union { 194 164 struct rds_ext_header_version version; 195 165 u16 rds_npaths; 166 + u32 rds_gen_num; 196 167 } buffer; 168 + u32 new_peer_gen_num = 0; 197 169 198 170 while (1) { 199 171 len = sizeof(buffer); ··· 208 176 conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, 209 177 buffer.rds_npaths); 210 178 break; 179 + case RDS_EXTHDR_GEN_NUM: 180 + new_peer_gen_num = buffer.rds_gen_num; 181 + break; 211 182 default: 212 183 pr_warn_ratelimited("ignoring unknown exthdr type " 213 184 "0x%x\n", type); ··· 218 183 } 219 184 /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ 220 185 conn->c_npaths = max_t(int, conn->c_npaths, 1); 186 + rds_conn_peer_gen_update(conn, new_peer_gen_num); 221 187 } 222 188 223 189 /* rds_start_mprds() will synchronously start multiple paths when appropriate.
+7 -2
net/rds/send.c
··· 259 259 * connection. 260 260 * Therefore, we never retransmit messages with RDMA ops. 261 261 */ 262 - if (rm->rdma.op_active && 263 - test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { 262 + if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || 263 + (rm->rdma.op_active && 264 + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { 264 265 spin_lock_irqsave(&cp->cp_lock, flags); 265 266 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 266 267 list_move(&rm->m_conn_item, &to_be_dropped); ··· 1210 1209 rds_message_add_extension(&rm->m_inc.i_hdr, 1211 1210 RDS_EXTHDR_NPATHS, &npaths, 1212 1211 sizeof(npaths)); 1212 + rds_message_add_extension(&rm->m_inc.i_hdr, 1213 + RDS_EXTHDR_GEN_NUM, 1214 + &cp->cp_conn->c_my_gen_num, 1215 + sizeof(u32)); 1213 1216 } 1214 1217 spin_unlock_irqrestore(&cp->cp_lock, flags); 1215 1218
+13 -1
net/rds/tcp_connect.c
··· 60 60 case TCP_SYN_RECV: 61 61 break; 62 62 case TCP_ESTABLISHED: 63 - rds_connect_path_complete(cp, RDS_CONN_CONNECTING); 63 + /* Force the peer to reconnect so that we have the 64 + * TCP ports going from <smaller-ip>.<transient> to 65 + * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the 66 + * RDS connection as RDS_CONN_UP until the reconnect, 67 + * to avoid RDS datagram loss. 68 + */ 69 + if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr && 70 + rds_conn_path_transition(cp, RDS_CONN_CONNECTING, 71 + RDS_CONN_ERROR)) { 72 + rds_conn_path_drop(cp); 73 + } else { 74 + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); 75 + } 64 76 break; 65 77 case TCP_CLOSE_WAIT: 66 78 case TCP_CLOSE:
+12 -17
net/rds/tcp_listen.c
··· 83 83 { 84 84 int i; 85 85 bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); 86 - int npaths = conn->c_npaths; 86 + int npaths = max_t(int, 1, conn->c_npaths); 87 87 88 - if (npaths <= 1) { 89 - struct rds_conn_path *cp = &conn->c_path[0]; 90 - int ret; 91 - 92 - ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, 93 - RDS_CONN_CONNECTING); 94 - if (!ret) 95 - rds_conn_path_transition(cp, RDS_CONN_ERROR, 96 - RDS_CONN_CONNECTING); 97 - return cp->cp_transport_data; 98 - } 99 - 100 - /* for mprds, paths with cp_index > 0 MUST be initiated by the peer 88 + /* for mprds, all paths MUST be initiated by the peer 101 89 * with the smaller address. 102 90 */ 103 - if (!peer_is_smaller) 91 + if (!peer_is_smaller) { 92 + /* Make sure we initiate at least one path if this 93 + * has not already been done; rds_start_mprds() will 94 + * take care of additional paths, if necessary. 95 + */ 96 + if (npaths == 1) 97 + rds_conn_path_connect_if_down(&conn->c_path[0]); 104 98 return NULL; 99 + } 105 100 106 101 for (i = 0; i < npaths; i++) { 107 102 struct rds_conn_path *cp = &conn->c_path[i]; ··· 166 171 mutex_lock(&rs_tcp->t_conn_path_lock); 167 172 cp = rs_tcp->t_cpath; 168 173 conn_state = rds_conn_path_state(cp); 169 - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && 170 - conn_state != RDS_CONN_ERROR) 174 + WARN_ON(conn_state == RDS_CONN_UP); 175 + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) 171 176 goto rst_nsk; 172 177 if (rs_tcp->t_sock) { 173 178 /* Need to resolve a duelling SYN between peers.
+3
net/rds/tcp_send.c
··· 100 100 set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); 101 101 tc->t_last_expected_una = rm->m_ack_seq + 1; 102 102 103 + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) 104 + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; 105 + 103 106 rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", 104 107 rm, rds_tcp_snd_nxt(tc), 105 108 (unsigned long long)rm->m_ack_seq);