[patch 3/3] OCFS2 Configurable timeouts - Protocol changes

Modify the OCFS2 handshake to ensure essential timeouts are configured
identically on all nodes.

Only allow the timeouts to be changed when there are no connected peers.

Improve the logic in o2net_advance_rx(), which broke now that
sizeof(struct o2net_handshake) is greater than sizeof(struct o2net_msg).

A field for the userspace-heartbeat timeout is also included, to avoid the
need for further protocol changes.

Uses a global spinlock to ensure the decisions to update configfs entries
are made on the correct value. The region covered by the spinlock when
incrementing the counter is much larger as this is the more critical case.

Small cleanup contributed by Adrian Bunk <bunk@stusta.de>

Signed-off-by: Andrew Beekhof <abeekhof@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

authored by Andrew Beekhof and committed by Mark Fasheh 828ae6af b5dd8030

+116 -16
+24 -6
fs/ocfs2/cluster/nodemanager.c
··· 573 573 ret = o2nm_cluster_attr_write(page, count, &val); 574 574 575 575 if (ret > 0) { 576 - if (val <= cluster->cl_keepalive_delay_ms) { 576 + if (cluster->cl_idle_timeout_ms != val 577 + && o2net_num_connected_peers()) { 578 + mlog(ML_NOTICE, 579 + "o2net: cannot change idle timeout after " 580 + "the first peer has agreed to it." 581 + " %d connected peers\n", 582 + o2net_num_connected_peers()); 583 + ret = -EINVAL; 584 + } else if (val <= cluster->cl_keepalive_delay_ms) { 577 585 mlog(ML_NOTICE, "o2net: idle timeout must be larger " 578 586 "than keepalive delay\n"); 579 - return -EINVAL; 587 + ret = -EINVAL; 588 + } else { 589 + cluster->cl_idle_timeout_ms = val; 580 590 } 581 - cluster->cl_idle_timeout_ms = val; 582 591 } 583 592 584 593 return ret; ··· 608 599 ret = o2nm_cluster_attr_write(page, count, &val); 609 600 610 601 if (ret > 0) { 611 - if (val >= cluster->cl_idle_timeout_ms) { 602 + if (cluster->cl_keepalive_delay_ms != val 603 + && o2net_num_connected_peers()) { 604 + mlog(ML_NOTICE, 605 + "o2net: cannot change keepalive delay after" 606 + " the first peer has agreed to it." 607 + " %d connected peers\n", 608 + o2net_num_connected_peers()); 609 + ret = -EINVAL; 610 + } else if (val >= cluster->cl_idle_timeout_ms) { 612 611 mlog(ML_NOTICE, "o2net: keepalive delay must be " 613 612 "smaller than idle timeout\n"); 614 - return -EINVAL; 613 + ret = -EINVAL; 614 + } else { 615 + cluster->cl_keepalive_delay_ms = val; 615 616 } 616 - cluster->cl_keepalive_delay_ms = val; 617 617 } 618 618 619 619 return ret;
+83 -9
fs/ocfs2/cluster/tcp.c
··· 380 380 sc_put(sc); 381 381 } 382 382 383 + static atomic_t o2net_connected_peers = ATOMIC_INIT(0); 384 + 385 + int o2net_num_connected_peers(void) 386 + { 387 + return atomic_read(&o2net_connected_peers); 388 + } 389 + 383 390 static void o2net_set_nn_state(struct o2net_node *nn, 384 391 struct o2net_sock_container *sc, 385 392 unsigned valid, int err) ··· 396 389 struct o2net_sock_container *old_sc = nn->nn_sc; 397 390 398 391 assert_spin_locked(&nn->nn_lock); 392 + 393 + if (old_sc && !sc) 394 + atomic_dec(&o2net_connected_peers); 395 + else if (!old_sc && sc) 396 + atomic_inc(&o2net_connected_peers); 399 397 400 398 /* the node num comparison and single connect/accept path should stop 401 399 * an non-null sc from being overwritten with another */ ··· 1135 1123 return -1; 1136 1124 } 1137 1125 1126 + /* 1127 + * Ensure timeouts are consistent with other nodes, otherwise 1128 + * we can end up with one node thinking that the other must be down, 1129 + * but isn't. This can ultimately cause corruption. 1130 + */ 1131 + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1132 + o2net_idle_timeout(sc->sc_node)) { 1133 + mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1134 + "%u ms, but we use %u ms locally. disconnecting\n", 1135 + SC_NODEF_ARGS(sc), 1136 + be32_to_cpu(hand->o2net_idle_timeout_ms), 1137 + o2net_idle_timeout(sc->sc_node)); 1138 + o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1139 + return -1; 1140 + } 1141 + 1142 + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1143 + o2net_keepalive_delay(sc->sc_node)) { 1144 + mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1145 + "%u ms, but we use %u ms locally. 
disconnecting\n", 1146 + SC_NODEF_ARGS(sc), 1147 + be32_to_cpu(hand->o2net_keepalive_delay_ms), 1148 + o2net_keepalive_delay(sc->sc_node)); 1149 + o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1150 + return -1; 1151 + } 1152 + 1153 + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1154 + O2HB_MAX_WRITE_TIMEOUT_MS) { 1155 + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1156 + "%u ms, but we use %u ms locally. disconnecting\n", 1157 + SC_NODEF_ARGS(sc), 1158 + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1159 + O2HB_MAX_WRITE_TIMEOUT_MS); 1160 + o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1161 + return -1; 1162 + } 1163 + 1138 1164 sc->sc_handshake_ok = 1; 1139 1165 1140 1166 spin_lock(&nn->nn_lock); ··· 1205 1155 sclog(sc, "receiving\n"); 1206 1156 do_gettimeofday(&sc->sc_tv_advance_start); 1207 1157 1158 + if (unlikely(sc->sc_handshake_ok == 0)) { 1159 + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1160 + data = page_address(sc->sc_page) + sc->sc_page_off; 1161 + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; 1162 + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1163 + if (ret > 0) 1164 + sc->sc_page_off += ret; 1165 + } 1166 + 1167 + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { 1168 + o2net_check_handshake(sc); 1169 + if (unlikely(sc->sc_handshake_ok == 0)) 1170 + ret = -EPROTO; 1171 + } 1172 + goto out; 1173 + } 1174 + 1208 1175 /* do we need more header? 
*/ 1209 1176 if (sc->sc_page_off < sizeof(struct o2net_msg)) { 1210 1177 data = page_address(sc->sc_page) + sc->sc_page_off; ··· 1229 1162 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1230 1163 if (ret > 0) { 1231 1164 sc->sc_page_off += ret; 1232 - 1233 - /* this working relies on the handshake being 1234 - * smaller than the normal message header */ 1235 - if (sc->sc_page_off >= sizeof(struct o2net_handshake)&& 1236 - !sc->sc_handshake_ok && o2net_check_handshake(sc)) { 1237 - ret = -EPROTO; 1238 - goto out; 1239 - } 1240 - 1241 1165 /* only swab incoming here.. we can 1242 1166 * only get here once as we cross from 1243 1167 * being under to over */ ··· 1330 1272 return ret; 1331 1273 } 1332 1274 1275 + static void o2net_initialize_handshake(void) 1276 + { 1277 + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1278 + O2HB_MAX_WRITE_TIMEOUT_MS); 1279 + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( 1280 + o2net_idle_timeout(NULL)); 1281 + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( 1282 + o2net_keepalive_delay(NULL)); 1283 + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( 1284 + o2net_reconnect_delay(NULL)); 1285 + } 1286 + 1333 1287 /* ------------------------------------------------------------ */ 1334 1288 1335 1289 /* called when a connect completes and after a sock is accepted. 
the ··· 1356 1286 (unsigned long long)O2NET_PROTOCOL_VERSION, 1357 1287 (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); 1358 1288 1289 + o2net_initialize_handshake(); 1359 1290 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1360 1291 sc_put(sc); 1361 1292 } ··· 1585 1514 1586 1515 if (node_num != o2nm_this_node()) 1587 1516 o2net_disconnect_node(node); 1517 + 1518 + BUG_ON(atomic_read(&o2net_connected_peers) < 0); 1588 1519 } 1589 1520 1590 1521 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, ··· 1750 1677 o2net_register_callbacks(sc->sc_sock->sk, sc); 1751 1678 o2net_sc_queue_work(sc, &sc->sc_rx_work); 1752 1679 1680 + o2net_initialize_handshake(); 1753 1681 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1754 1682 1755 1683 out:
+1
fs/ocfs2/cluster/tcp.h
··· 108 108 int o2net_start_listening(struct o2nm_node *node); 109 109 void o2net_stop_listening(struct o2nm_node *node); 110 110 void o2net_disconnect_node(struct o2nm_node *node); 111 + int o2net_num_connected_peers(void); 111 112 112 113 int o2net_init(void); 113 114 void o2net_exit(void);
+8 -1
fs/ocfs2/cluster/tcp_internal.h
··· 38 38 * locking semantics of the file system using the protocol. It should 39 39 * be somewhere else, I'm sure, but right now it isn't. 40 40 * 41 + * New in version 5: 42 + * - Network timeout checking protocol 43 + * 41 44 * New in version 4: 42 45 * - Remove i_generation from lock names for better stat performance. 43 46 * ··· 51 48 * - full 64 bit i_size in the metadata lock lvbs 52 49 * - introduction of "rw" lock and pushing meta/data locking down 53 50 */ 54 - #define O2NET_PROTOCOL_VERSION 4ULL 51 + #define O2NET_PROTOCOL_VERSION 5ULL 55 52 struct o2net_handshake { 56 53 __be64 protocol_version; 57 54 __be64 connector_id; 55 + __be32 o2hb_heartbeat_timeout_ms; 56 + __be32 o2net_idle_timeout_ms; 57 + __be32 o2net_keepalive_delay_ms; 58 + __be32 o2net_reconnect_delay_ms; 58 59 }; 59 60 60 61 struct o2net_node {