[patch 3/3] OCFS2 Configurable timeouts - Protocol changes

Modify the OCFS2 handshake to ensure essential timeouts are configured
identically on all nodes.

Only allow changes to the timeouts when there are no connected peers.

Improves the logic in o2net_advance_rx(), which broke now that
sizeof(struct o2net_handshake) is greater than sizeof(struct o2net_msg).

Included is the field for userspace-heartbeat timeout to avoid the need for
further protocol changes.

Uses a global spinlock to ensure the decisions to update configfs entries
are made on the correct value. The region covered by the spinlock when
incrementing the counter is much larger as this is the more critical case.

Small cleanup contributed by Adrian Bunk <bunk@stusta.de>

Signed-off-by: Andrew Beekhof <abeekhof@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

authored by Andrew Beekhof and committed by Mark Fasheh 828ae6af b5dd8030

+116 -16
+24 -6
fs/ocfs2/cluster/nodemanager.c
··· 573 ret = o2nm_cluster_attr_write(page, count, &val); 574 575 if (ret > 0) { 576 - if (val <= cluster->cl_keepalive_delay_ms) { 577 mlog(ML_NOTICE, "o2net: idle timeout must be larger " 578 "than keepalive delay\n"); 579 - return -EINVAL; 580 } 581 - cluster->cl_idle_timeout_ms = val; 582 } 583 584 return ret; ··· 608 ret = o2nm_cluster_attr_write(page, count, &val); 609 610 if (ret > 0) { 611 - if (val >= cluster->cl_idle_timeout_ms) { 612 mlog(ML_NOTICE, "o2net: keepalive delay must be " 613 "smaller than idle timeout\n"); 614 - return -EINVAL; 615 } 616 - cluster->cl_keepalive_delay_ms = val; 617 } 618 619 return ret;
··· 573 ret = o2nm_cluster_attr_write(page, count, &val); 574 575 if (ret > 0) { 576 + if (cluster->cl_idle_timeout_ms != val 577 + && o2net_num_connected_peers()) { 578 + mlog(ML_NOTICE, 579 + "o2net: cannot change idle timeout after " 580 + "the first peer has agreed to it." 581 + " %d connected peers\n", 582 + o2net_num_connected_peers()); 583 + ret = -EINVAL; 584 + } else if (val <= cluster->cl_keepalive_delay_ms) { 585 mlog(ML_NOTICE, "o2net: idle timeout must be larger " 586 "than keepalive delay\n"); 587 + ret = -EINVAL; 588 + } else { 589 + cluster->cl_idle_timeout_ms = val; 590 } 591 } 592 593 return ret; ··· 599 ret = o2nm_cluster_attr_write(page, count, &val); 600 601 if (ret > 0) { 602 + if (cluster->cl_keepalive_delay_ms != val 603 + && o2net_num_connected_peers()) { 604 + mlog(ML_NOTICE, 605 + "o2net: cannot change keepalive delay after" 606 + " the first peer has agreed to it." 607 + " %d connected peers\n", 608 + o2net_num_connected_peers()); 609 + ret = -EINVAL; 610 + } else if (val >= cluster->cl_idle_timeout_ms) { 611 mlog(ML_NOTICE, "o2net: keepalive delay must be " 612 "smaller than idle timeout\n"); 613 + ret = -EINVAL; 614 + } else { 615 + cluster->cl_keepalive_delay_ms = val; 616 } 617 } 618 619 return ret;
+83 -9
fs/ocfs2/cluster/tcp.c
··· 380 sc_put(sc); 381 } 382 383 static void o2net_set_nn_state(struct o2net_node *nn, 384 struct o2net_sock_container *sc, 385 unsigned valid, int err) ··· 396 struct o2net_sock_container *old_sc = nn->nn_sc; 397 398 assert_spin_locked(&nn->nn_lock); 399 400 /* the node num comparison and single connect/accept path should stop 401 * an non-null sc from being overwritten with another */ ··· 1135 return -1; 1136 } 1137 1138 sc->sc_handshake_ok = 1; 1139 1140 spin_lock(&nn->nn_lock); ··· 1205 sclog(sc, "receiving\n"); 1206 do_gettimeofday(&sc->sc_tv_advance_start); 1207 1208 /* do we need more header? */ 1209 if (sc->sc_page_off < sizeof(struct o2net_msg)) { 1210 data = page_address(sc->sc_page) + sc->sc_page_off; ··· 1229 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1230 if (ret > 0) { 1231 sc->sc_page_off += ret; 1232 - 1233 - /* this working relies on the handshake being 1234 - * smaller than the normal message header */ 1235 - if (sc->sc_page_off >= sizeof(struct o2net_handshake)&& 1236 - !sc->sc_handshake_ok && o2net_check_handshake(sc)) { 1237 - ret = -EPROTO; 1238 - goto out; 1239 - } 1240 - 1241 /* only swab incoming here.. we can 1242 * only get here once as we cross from 1243 * being under to over */ ··· 1330 return ret; 1331 } 1332 1333 /* ------------------------------------------------------------ */ 1334 1335 /* called when a connect completes and after a sock is accepted. the ··· 1356 (unsigned long long)O2NET_PROTOCOL_VERSION, 1357 (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); 1358 1359 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1360 sc_put(sc); 1361 } ··· 1585 1586 if (node_num != o2nm_this_node()) 1587 o2net_disconnect_node(node); 1588 } 1589 1590 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, ··· 1750 o2net_register_callbacks(sc->sc_sock->sk, sc); 1751 o2net_sc_queue_work(sc, &sc->sc_rx_work); 1752 1753 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1754 1755 out:
··· 380 sc_put(sc); 381 } 382 383 + static atomic_t o2net_connected_peers = ATOMIC_INIT(0); 384 + 385 + int o2net_num_connected_peers(void) 386 + { 387 + return atomic_read(&o2net_connected_peers); 388 + } 389 + 390 static void o2net_set_nn_state(struct o2net_node *nn, 391 struct o2net_sock_container *sc, 392 unsigned valid, int err) ··· 389 struct o2net_sock_container *old_sc = nn->nn_sc; 390 391 assert_spin_locked(&nn->nn_lock); 392 + 393 + if (old_sc && !sc) 394 + atomic_dec(&o2net_connected_peers); 395 + else if (!old_sc && sc) 396 + atomic_inc(&o2net_connected_peers); 397 398 /* the node num comparison and single connect/accept path should stop 399 * an non-null sc from being overwritten with another */ ··· 1123 return -1; 1124 } 1125 1126 + /* 1127 + * Ensure timeouts are consistent with other nodes, otherwise 1128 + * we can end up with one node thinking that the other must be down, 1129 + * but isn't. This can ultimately cause corruption. 1130 + */ 1131 + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1132 + o2net_idle_timeout(sc->sc_node)) { 1133 + mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1134 + "%u ms, but we use %u ms locally. disconnecting\n", 1135 + SC_NODEF_ARGS(sc), 1136 + be32_to_cpu(hand->o2net_idle_timeout_ms), 1137 + o2net_idle_timeout(sc->sc_node)); 1138 + o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1139 + return -1; 1140 + } 1141 + 1142 + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1143 + o2net_keepalive_delay(sc->sc_node)) { 1144 + mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1145 + "%u ms, but we use %u ms locally. 
disconnecting\n", 1146 + SC_NODEF_ARGS(sc), 1147 + be32_to_cpu(hand->o2net_keepalive_delay_ms), 1148 + o2net_keepalive_delay(sc->sc_node)); 1149 + o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1150 + return -1; 1151 + } 1152 + 1153 + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1154 + O2HB_MAX_WRITE_TIMEOUT_MS) { 1155 + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1156 + "%u ms, but we use %u ms locally. disconnecting\n", 1157 + SC_NODEF_ARGS(sc), 1158 + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1159 + O2HB_MAX_WRITE_TIMEOUT_MS); 1160 + o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1161 + return -1; 1162 + } 1163 + 1164 sc->sc_handshake_ok = 1; 1165 1166 spin_lock(&nn->nn_lock); ··· 1155 sclog(sc, "receiving\n"); 1156 do_gettimeofday(&sc->sc_tv_advance_start); 1157 1158 + if (unlikely(sc->sc_handshake_ok == 0)) { 1159 + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { 1160 + data = page_address(sc->sc_page) + sc->sc_page_off; 1161 + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; 1162 + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1163 + if (ret > 0) 1164 + sc->sc_page_off += ret; 1165 + } 1166 + 1167 + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { 1168 + o2net_check_handshake(sc); 1169 + if (unlikely(sc->sc_handshake_ok == 0)) 1170 + ret = -EPROTO; 1171 + } 1172 + goto out; 1173 + } 1174 + 1175 /* do we need more header? */ 1176 if (sc->sc_page_off < sizeof(struct o2net_msg)) { 1177 data = page_address(sc->sc_page) + sc->sc_page_off; ··· 1162 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); 1163 if (ret > 0) { 1164 sc->sc_page_off += ret; 1165 /* only swab incoming here.. 
we can 1166 * only get here once as we cross from 1167 * being under to over */ ··· 1272 return ret; 1273 } 1274 1275 + static void o2net_initialize_handshake(void) 1276 + { 1277 + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1278 + O2HB_MAX_WRITE_TIMEOUT_MS); 1279 + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( 1280 + o2net_idle_timeout(NULL)); 1281 + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( 1282 + o2net_keepalive_delay(NULL)); 1283 + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( 1284 + o2net_reconnect_delay(NULL)); 1285 + } 1286 + 1287 /* ------------------------------------------------------------ */ 1288 1289 /* called when a connect completes and after a sock is accepted. the ··· 1286 (unsigned long long)O2NET_PROTOCOL_VERSION, 1287 (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); 1288 1289 + o2net_initialize_handshake(); 1290 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1291 sc_put(sc); 1292 } ··· 1514 1515 if (node_num != o2nm_this_node()) 1516 o2net_disconnect_node(node); 1517 + 1518 + BUG_ON(atomic_read(&o2net_connected_peers) < 0); 1519 } 1520 1521 static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, ··· 1677 o2net_register_callbacks(sc->sc_sock->sk, sc); 1678 o2net_sc_queue_work(sc, &sc->sc_rx_work); 1679 1680 + o2net_initialize_handshake(); 1681 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); 1682 1683 out:
+1
fs/ocfs2/cluster/tcp.h
··· 108 int o2net_start_listening(struct o2nm_node *node); 109 void o2net_stop_listening(struct o2nm_node *node); 110 void o2net_disconnect_node(struct o2nm_node *node); 111 112 int o2net_init(void); 113 void o2net_exit(void);
··· 108 int o2net_start_listening(struct o2nm_node *node); 109 void o2net_stop_listening(struct o2nm_node *node); 110 void o2net_disconnect_node(struct o2nm_node *node); 111 + int o2net_num_connected_peers(void); 112 113 int o2net_init(void); 114 void o2net_exit(void);
+8 -1
fs/ocfs2/cluster/tcp_internal.h
··· 38 * locking semantics of the file system using the protocol. It should 39 * be somewhere else, I'm sure, but right now it isn't. 40 * 41 * New in version 4: 42 * - Remove i_generation from lock names for better stat performance. 43 * ··· 51 * - full 64 bit i_size in the metadata lock lvbs 52 * - introduction of "rw" lock and pushing meta/data locking down 53 */ 54 - #define O2NET_PROTOCOL_VERSION 4ULL 55 struct o2net_handshake { 56 __be64 protocol_version; 57 __be64 connector_id; 58 }; 59 60 struct o2net_node {
··· 38 * locking semantics of the file system using the protocol. It should 39 * be somewhere else, I'm sure, but right now it isn't. 40 * 41 + * New in version 5: 42 + * - Network timeout checking protocol 43 + * 44 * New in version 4: 45 * - Remove i_generation from lock names for better stat performance. 46 * ··· 48 * - full 64 bit i_size in the metadata lock lvbs 49 * - introduction of "rw" lock and pushing meta/data locking down 50 */ 51 + #define O2NET_PROTOCOL_VERSION 5ULL 52 struct o2net_handshake { 53 __be64 protocol_version; 54 __be64 connector_id; 55 + __be32 o2hb_heartbeat_timeout_ms; 56 + __be32 o2net_idle_timeout_ms; 57 + __be32 o2net_keepalive_delay_ms; 58 + __be32 o2net_reconnect_delay_ms; 59 }; 60 61 struct o2net_node {