ocfs2: warn the user on a dead timeout mismatch

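Each node now records its dead timeout, in milliseconds, in its on-disk
heartbeat block. Print a warning to the user when a node with a different
dead count joins the region. A slot that reports a value of zero is not
checked.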

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

+21
+20
fs/ocfs2/cluster/heartbeat.c
··· 517 517 hb_block->hb_seq = cpu_to_le64(cputime); 518 518 hb_block->hb_node = node_num; 519 519 hb_block->hb_generation = cpu_to_le64(generation); 520 + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); 520 521 521 522 /* This step must always happen last! */ 522 523 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, ··· 646 645 struct o2nm_node *node; 647 646 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; 648 647 u64 cputime; 648 + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 649 + unsigned int slot_dead_ms; 649 650 650 651 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 651 652 ··· 736 733 &o2hb_live_slots[slot->ds_node_num]); 737 734 738 735 slot->ds_equal_samples = 0; 736 + 737 + /* We want to be sure that all nodes agree on the 738 + * number of milliseconds before a node will be 739 + * considered dead. The self-fencing timeout is 740 + * computed from this value, and a discrepancy might 741 + * result in heartbeat calling a node dead when it 742 + * hasn't self-fenced yet. */ 743 + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); 744 + if (slot_dead_ms && slot_dead_ms != dead_ms) { 745 + /* TODO: Perhaps we can fail the region here. */ 746 + mlog(ML_ERROR, "Node %d on device %s has a dead count " 747 + "of %u ms, but our count is %u ms.\n" 748 + "Please double check your configuration values " 749 + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", 750 + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, 751 + dead_ms); 752 + } 739 753 goto out; 740 754 } 741 755
+1
fs/ocfs2/cluster/ocfs2_heartbeat.h
··· 32 32 __u8 hb_pad1[3]; 33 33 __le32 hb_cksum; 34 34 __le64 hb_generation; 35 + __le32 hb_dead_ms; 35 36 }; 36 37 37 38 #endif /* _OCFS2_HEARTBEAT_H */