commit 0db638f44e7db9732d9c5704ca837f57ce061f42 · tjh.dev/kernel

tjh.dev / kernel

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

ocfs2: warn the user on a dead timeout mismatch

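Print a warning to the user when a node whose dead timeout (in milliseconds)
differs from ours joins the region. The self-fencing timeout is computed from
this value, so a mismatch might result in heartbeat calling a node dead when
it hasn't self-fenced yet.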

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

Mark Fasheh 19 years ago 0db638f4 4ba63adc

+21

2 changed files

expand all

unified split

ocfs2

cluster

heartbeat.c

ocfs2_heartbeat.h

+20

fs/ocfs2/cluster/heartbeat.c

··· 517 hb_block->hb_seq = cpu_to_le64(cputime); 518 hb_block->hb_node = node_num; 519 hb_block->hb_generation = cpu_to_le64(generation); 520 521 /* This step must always happen last! */ 522 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, ··· 646 struct o2nm_node *node; 647 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; 648 u64 cputime; 649 650 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 651 ··· 736 &o2hb_live_slots[slot->ds_node_num]); 737 738 slot->ds_equal_samples = 0; 739 goto out; 740 } 741

··· 517 hb_block->hb_seq = cpu_to_le64(cputime); 518 hb_block->hb_node = node_num; 519 hb_block->hb_generation = cpu_to_le64(generation); 520 + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); 521 522 /* This step must always happen last! */ 523 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, ··· 645 struct o2nm_node *node; 646 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; 647 u64 cputime; 648 + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 649 + unsigned int slot_dead_ms; 650 651 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 652 ··· 733 &o2hb_live_slots[slot->ds_node_num]); 734 735 slot->ds_equal_samples = 0; 736 + 737 + /* We want to be sure that all nodes agree on the 738 + * number of milliseconds before a node will be 739 + * considered dead. The self-fencing timeout is 740 + * computed from this value, and a discrepancy might 741 + * result in heartbeat calling a node dead when it 742 + * hasn't self-fenced yet. */ 743 + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); 744 + if (slot_dead_ms && slot_dead_ms != dead_ms) { 745 + /* TODO: Perhaps we can fail the region here. */ 746 + mlog(ML_ERROR, "Node %d on device %s has a dead count " 747 + "of %u ms, but our count is %u ms.\n" 748 + "Please double check your configuration values " 749 + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", 750 + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, 751 + dead_ms); 752 + } 753 goto out; 754 } 755

fs/ocfs2/cluster/ocfs2_heartbeat.h

··· 32 __u8 hb_pad1[3]; 33 __le32 hb_cksum; 34 __le64 hb_generation; 35 }; 36 37 #endif /* _OCFS2_HEARTBEAT_H */

··· 32 __u8 hb_pad1[3]; 33 __le32 hb_cksum; 34 __le64 hb_generation; 35 + __le32 hb_dead_ms; 36 }; 37 38 #endif /* _OCFS2_HEARTBEAT_H */