Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: add support for multiple mount protection

Prevent an ext4 filesystem from being mounted multiple times.
A sequence number is stored on disk and is periodically updated (every 5
seconds by default) by a mounted filesystem.
At mount time, we now wait for s_mmp_update_interval seconds to make sure
that the MMP sequence does not change.
In case of failure, the nodename, bdevname and the time at which the MMP
block was last updated are displayed.

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Johann Lombardi <johann@whamcloud.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by

Johann Lombardi and committed by
Theodore Ts'o
c5e06d10 d02a9391

+444 -4
+2 -1
fs/ext4/Makefile
··· 6 6 7 7 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 8 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 9 - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o 9 + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 10 + mmp.o 10 11 11 12 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 12 13 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+74 -2
fs/ext4/ext4.h
··· 1028 1028 __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ 1029 1029 __le32 s_flags; /* Miscellaneous flags */ 1030 1030 __le16 s_raid_stride; /* RAID stride */ 1031 - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ 1031 + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ 1032 1032 __le64 s_mmp_block; /* Block for multi-mount protection */ 1033 1033 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1034 1034 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ ··· 1204 1204 struct ext4_li_request *s_li_request; 1205 1205 /* Wait multiplier for lazy initialization thread */ 1206 1206 unsigned int s_li_wait_mult; 1207 + 1208 + /* Kernel thread for multiple mount protection */ 1209 + struct task_struct *s_mmp_tsk; 1207 1210 }; 1208 1211 1209 1212 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) ··· 1378 1375 EXT4_FEATURE_INCOMPAT_META_BG| \ 1379 1376 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1380 1377 EXT4_FEATURE_INCOMPAT_64BIT| \ 1381 - EXT4_FEATURE_INCOMPAT_FLEX_BG) 1378 + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ 1379 + EXT4_FEATURE_INCOMPAT_MMP) 1382 1380 #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1383 1381 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1384 1382 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ ··· 1631 1627 }; 1632 1628 1633 1629 /* 1630 + * This structure will be used for multiple mount protection. It will be 1631 + * written into the block number saved in the s_mmp_block field in the 1632 + * superblock. Programs that check MMP should assume that if 1633 + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe 1634 + * to use the filesystem, regardless of how old the timestamp is. 
1635 + */ 1636 + #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ 1637 + #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ 1638 + #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ 1639 + #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ 1640 + 1641 + struct mmp_struct { 1642 + __le32 mmp_magic; /* Magic number for MMP */ 1643 + __le32 mmp_seq; /* Sequence no. updated periodically */ 1644 + 1645 + /* 1646 + * mmp_time, mmp_nodename & mmp_bdevname are only used for information 1647 + * purposes and do not affect the correctness of the algorithm 1648 + */ 1649 + __le64 mmp_time; /* Time last updated */ 1650 + char mmp_nodename[64]; /* Node which last updated MMP block */ 1651 + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ 1652 + 1653 + /* 1654 + * mmp_check_interval is used to verify if the MMP block has been 1655 + * updated on the block device. The value is updated based on the 1656 + * maximum time to write the MMP block during an update cycle. 1657 + */ 1658 + __le16 mmp_check_interval; 1659 + 1660 + __le16 mmp_pad1; 1661 + __le32 mmp_pad2[227]; 1662 + }; 1663 + 1664 + /* arguments passed to the mmp thread */ 1665 + struct mmpd_data { 1666 + struct buffer_head *bh; /* bh from initial read_mmp_block() */ 1667 + struct super_block *sb; /* super block of the fs */ 1668 + }; 1669 + 1670 + /* 1671 + * Check interval multiplier 1672 + * The MMP block is written every update interval and initially checked every 1673 + * update interval x the multiplier (the value is then adapted based on the 1674 + * write latency). The reason is that writes can be delayed under load and we 1675 + * don't want readers to incorrectly assume that the filesystem is no longer 1676 + * in use. 1677 + */ 1678 + #define EXT4_MMP_CHECK_MULT 2UL 1679 + 1680 + /* 1681 + * Minimum interval for MMP checking in seconds. 
1682 + */ 1683 + #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL 1684 + 1685 + /* 1686 + * Maximum interval for MMP checking in seconds. 1687 + */ 1688 + #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL 1689 + 1690 + /* 1634 1691 * Function prototypes 1635 1692 */ 1636 1693 ··· 1865 1800 __LINE__, ## message) 1866 1801 extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1867 1802 __attribute__ ((format (printf, 3, 4))); 1803 + extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, 1804 + const char *, unsigned int, const char *); 1805 + #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ 1806 + __LINE__, msg) 1868 1807 extern void __ext4_grp_locked_error(const char *, unsigned int, \ 1869 1808 struct super_block *, ext4_group_t, \ 1870 1809 unsigned long, ext4_fsblk_t, \ ··· 2172 2103 struct page *page, 2173 2104 int len, 2174 2105 struct writeback_control *wbc); 2106 + 2107 + /* mmp.c */ 2108 + extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 2175 2109 2176 2110 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2177 2111 enum ext4_state_bits {
+351
fs/ext4/mmp.c
··· 1 + #include <linux/fs.h> 2 + #include <linux/random.h> 3 + #include <linux/buffer_head.h> 4 + #include <linux/utsname.h> 5 + #include <linux/kthread.h> 6 + 7 + #include "ext4.h" 8 + 9 + /* 10 + * Write the MMP block using WRITE_SYNC to try to get the block on-disk 11 + * faster. 12 + */ 13 + static int write_mmp_block(struct buffer_head *bh) 14 + { 15 + mark_buffer_dirty(bh); 16 + lock_buffer(bh); 17 + bh->b_end_io = end_buffer_write_sync; 18 + get_bh(bh); 19 + submit_bh(WRITE_SYNC, bh); 20 + wait_on_buffer(bh); 21 + if (unlikely(!buffer_uptodate(bh))) 22 + return 1; 23 + 24 + return 0; 25 + } 26 + 27 + /* 28 + * Read the MMP block. It _must_ be read from disk and hence we clear the 29 + * uptodate flag on the buffer. 30 + */ 31 + static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, 32 + ext4_fsblk_t mmp_block) 33 + { 34 + struct mmp_struct *mmp; 35 + 36 + if (*bh) 37 + clear_buffer_uptodate(*bh); 38 + 39 + /* This would be sb_bread(sb, mmp_block), except we need to be sure 40 + * that the MD RAID device cache has been bypassed, and that the read 41 + * is not blocked in the elevator. */ 42 + if (!*bh) 43 + *bh = sb_getblk(sb, mmp_block); 44 + if (*bh) { 45 + get_bh(*bh); 46 + lock_buffer(*bh); 47 + (*bh)->b_end_io = end_buffer_read_sync; 48 + submit_bh(READ_SYNC, *bh); 49 + wait_on_buffer(*bh); 50 + if (!buffer_uptodate(*bh)) { 51 + brelse(*bh); 52 + *bh = NULL; 53 + } 54 + } 55 + if (!*bh) { 56 + ext4_warning(sb, "Error while reading MMP block %llu", 57 + mmp_block); 58 + return -EIO; 59 + } 60 + 61 + mmp = (struct mmp_struct *)((*bh)->b_data); 62 + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) 63 + return -EINVAL; 64 + 65 + return 0; 66 + } 67 + 68 + /* 69 + * Dump as much information as possible to help the admin. 
70 + */ 71 + void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, 72 + const char *function, unsigned int line, const char *msg) 73 + { 74 + __ext4_warning(sb, function, line, msg); 75 + __ext4_warning(sb, function, line, 76 + "MMP failure info: last update time: %llu, last update " 77 + "node: %s, last update device: %s\n", 78 + (long long unsigned int) le64_to_cpu(mmp->mmp_time), 79 + mmp->mmp_nodename, mmp->mmp_bdevname); 80 + } 81 + 82 + /* 83 + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds 84 + */ 85 + static int kmmpd(void *data) 86 + { 87 + struct super_block *sb = ((struct mmpd_data *) data)->sb; 88 + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; 89 + struct ext4_super_block *es = EXT4_SB(sb)->s_es; 90 + struct mmp_struct *mmp; 91 + ext4_fsblk_t mmp_block; 92 + u32 seq = 0; 93 + unsigned long failed_writes = 0; 94 + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); 95 + unsigned mmp_check_interval; 96 + unsigned long last_update_time; 97 + unsigned long diff; 98 + int retval; 99 + 100 + mmp_block = le64_to_cpu(es->s_mmp_block); 101 + mmp = (struct mmp_struct *)(bh->b_data); 102 + mmp->mmp_time = cpu_to_le64(get_seconds()); 103 + /* 104 + * Start with the higher mmp_check_interval and reduce it if 105 + * the MMP block is being updated on time. 106 + */ 107 + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, 108 + EXT4_MMP_MIN_CHECK_INTERVAL); 109 + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 110 + bdevname(bh->b_bdev, mmp->mmp_bdevname); 111 + 112 + memcpy(mmp->mmp_nodename, init_utsname()->sysname, 113 + sizeof(mmp->mmp_nodename)); 114 + 115 + while (!kthread_should_stop()) { 116 + if (++seq > EXT4_MMP_SEQ_MAX) 117 + seq = 1; 118 + 119 + mmp->mmp_seq = cpu_to_le32(seq); 120 + mmp->mmp_time = cpu_to_le64(get_seconds()); 121 + last_update_time = jiffies; 122 + 123 + retval = write_mmp_block(bh); 124 + /* 125 + * Don't spew too many error messages. 
Print one every 126 + * (s_mmp_update_interval * 60) seconds. 127 + */ 128 + if (retval && (failed_writes % 60) == 0) { 129 + ext4_error(sb, "Error writing to MMP block"); 130 + failed_writes++; 131 + } 132 + 133 + if (!(le32_to_cpu(es->s_feature_incompat) & 134 + EXT4_FEATURE_INCOMPAT_MMP)) { 135 + ext4_warning(sb, "kmmpd being stopped since MMP feature" 136 + " has been disabled."); 137 + EXT4_SB(sb)->s_mmp_tsk = NULL; 138 + goto failed; 139 + } 140 + 141 + if (sb->s_flags & MS_RDONLY) { 142 + ext4_warning(sb, "kmmpd being stopped since filesystem " 143 + "has been remounted as readonly."); 144 + EXT4_SB(sb)->s_mmp_tsk = NULL; 145 + goto failed; 146 + } 147 + 148 + diff = jiffies - last_update_time; 149 + if (diff < mmp_update_interval * HZ) 150 + schedule_timeout_interruptible(mmp_update_interval * 151 + HZ - diff); 152 + 153 + /* 154 + * We need to make sure that more than mmp_check_interval 155 + * seconds have not passed since writing. If that has happened 156 + * we need to check if the MMP block is as we left it. 157 + */ 158 + diff = jiffies - last_update_time; 159 + if (diff > mmp_check_interval * HZ) { 160 + struct buffer_head *bh_check = NULL; 161 + struct mmp_struct *mmp_check; 162 + 163 + retval = read_mmp_block(sb, &bh_check, mmp_block); 164 + if (retval) { 165 + ext4_error(sb, "error reading MMP data: %d", 166 + retval); 167 + 168 + EXT4_SB(sb)->s_mmp_tsk = NULL; 169 + goto failed; 170 + } 171 + 172 + mmp_check = (struct mmp_struct *)(bh_check->b_data); 173 + if (mmp->mmp_seq != mmp_check->mmp_seq || 174 + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, 175 + sizeof(mmp->mmp_nodename))) { 176 + dump_mmp_msg(sb, mmp_check, 177 + "Error while updating MMP info. " 178 + "The filesystem seems to have been" 179 + " multiply mounted."); 180 + ext4_error(sb, "abort"); 181 + goto failed; 182 + } 183 + put_bh(bh_check); 184 + } 185 + 186 + /* 187 + * Adjust the mmp_check_interval depending on how much time 188 + * it took for the MMP block to be written. 
189 + */ 190 + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, 191 + EXT4_MMP_MAX_CHECK_INTERVAL), 192 + EXT4_MMP_MIN_CHECK_INTERVAL); 193 + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 194 + } 195 + 196 + /* 197 + * Unmount seems to be clean. 198 + */ 199 + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); 200 + mmp->mmp_time = cpu_to_le64(get_seconds()); 201 + 202 + retval = write_mmp_block(bh); 203 + 204 + failed: 205 + kfree(data); 206 + brelse(bh); 207 + return retval; 208 + } 209 + 210 + /* 211 + * Get a random new sequence number but make sure it is not greater than 212 + * EXT4_MMP_SEQ_MAX. 213 + */ 214 + static unsigned int mmp_new_seq(void) 215 + { 216 + u32 new_seq; 217 + 218 + do { 219 + get_random_bytes(&new_seq, sizeof(u32)); 220 + } while (new_seq > EXT4_MMP_SEQ_MAX); 221 + 222 + return new_seq; 223 + } 224 + 225 + /* 226 + * Protect the filesystem from being mounted more than once. 227 + */ 228 + int ext4_multi_mount_protect(struct super_block *sb, 229 + ext4_fsblk_t mmp_block) 230 + { 231 + struct ext4_super_block *es = EXT4_SB(sb)->s_es; 232 + struct buffer_head *bh = NULL; 233 + struct mmp_struct *mmp = NULL; 234 + struct mmpd_data *mmpd_data; 235 + u32 seq; 236 + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); 237 + unsigned int wait_time = 0; 238 + int retval; 239 + 240 + if (mmp_block < le32_to_cpu(es->s_first_data_block) || 241 + mmp_block >= ext4_blocks_count(es)) { 242 + ext4_warning(sb, "Invalid MMP block in superblock"); 243 + goto failed; 244 + } 245 + 246 + retval = read_mmp_block(sb, &bh, mmp_block); 247 + if (retval) 248 + goto failed; 249 + 250 + mmp = (struct mmp_struct *)(bh->b_data); 251 + 252 + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) 253 + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; 254 + 255 + /* 256 + * If check_interval in MMP block is larger, use that instead of 257 + * update_interval from the superblock. 
258 + */ 259 + if (mmp->mmp_check_interval > mmp_check_interval) 260 + mmp_check_interval = mmp->mmp_check_interval; 261 + 262 + seq = le32_to_cpu(mmp->mmp_seq); 263 + if (seq == EXT4_MMP_SEQ_CLEAN) 264 + goto skip; 265 + 266 + if (seq == EXT4_MMP_SEQ_FSCK) { 267 + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); 268 + goto failed; 269 + } 270 + 271 + wait_time = min(mmp_check_interval * 2 + 1, 272 + mmp_check_interval + 60); 273 + 274 + /* Print MMP interval if more than 20 secs. */ 275 + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) 276 + ext4_warning(sb, "MMP interval %u higher than expected, please" 277 + " wait.\n", wait_time * 2); 278 + 279 + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { 280 + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); 281 + goto failed; 282 + } 283 + 284 + retval = read_mmp_block(sb, &bh, mmp_block); 285 + if (retval) 286 + goto failed; 287 + mmp = (struct mmp_struct *)(bh->b_data); 288 + if (seq != le32_to_cpu(mmp->mmp_seq)) { 289 + dump_mmp_msg(sb, mmp, 290 + "Device is already active on another node."); 291 + goto failed; 292 + } 293 + 294 + skip: 295 + /* 296 + * write a new random sequence number. 297 + */ 298 + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); 299 + 300 + retval = write_mmp_block(bh); 301 + if (retval) 302 + goto failed; 303 + 304 + /* 305 + * wait for MMP interval and check mmp_seq. 
306 + */ 307 + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { 308 + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); 309 + goto failed; 310 + } 311 + 312 + retval = read_mmp_block(sb, &bh, mmp_block); 313 + if (retval) 314 + goto failed; 315 + mmp = (struct mmp_struct *)(bh->b_data); 316 + if (seq != le32_to_cpu(mmp->mmp_seq)) { 317 + dump_mmp_msg(sb, mmp, 318 + "Device is already active on another node."); 319 + goto failed; 320 + } 321 + 322 + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); 323 + if (!mmpd_data) { 324 + ext4_warning(sb, "not enough memory for mmpd_data"); 325 + goto failed; 326 + } 327 + mmpd_data->sb = sb; 328 + mmpd_data->bh = bh; 329 + 330 + /* 331 + * Start a kernel thread to update the MMP block periodically. 332 + */ 333 + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", 334 + bdevname(bh->b_bdev, 335 + mmp->mmp_bdevname)); 336 + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { 337 + EXT4_SB(sb)->s_mmp_tsk = NULL; 338 + kfree(mmpd_data); 339 + ext4_warning(sb, "Unable to create kmmpd thread for %s.", 340 + sb->s_id); 341 + goto failed; 342 + } 343 + 344 + return 0; 345 + 346 + failed: 347 + brelse(bh); 348 + return 1; 349 + } 350 + 351 +
+17 -1
fs/ext4/super.c
··· 822 822 invalidate_bdev(sbi->journal_bdev); 823 823 ext4_blkdev_remove(sbi); 824 824 } 825 + if (sbi->s_mmp_tsk) 826 + kthread_stop(sbi->s_mmp_tsk); 825 827 sb->s_fs_info = NULL; 826 828 /* 827 829 * Now that we are completely done shutting down the ··· 3488 3486 EXT4_HAS_INCOMPAT_FEATURE(sb, 3489 3487 EXT4_FEATURE_INCOMPAT_RECOVER)); 3490 3488 3489 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && 3490 + !(sb->s_flags & MS_RDONLY)) 3491 + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) 3492 + goto failed_mount3; 3493 + 3491 3494 /* 3492 3495 * The first inode we look at is the journal inode. Don't try 3493 3496 * root first: it may be modified in the journal! ··· 3740 3733 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3741 3734 percpu_counter_destroy(&sbi->s_dirs_counter); 3742 3735 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3736 + if (sbi->s_mmp_tsk) 3737 + kthread_stop(sbi->s_mmp_tsk); 3743 3738 failed_mount2: 3744 3739 for (i = 0; i < db_count; i++) 3745 3740 brelse(sbi->s_group_desc[i]); ··· 4277 4268 int enable_quota = 0; 4278 4269 ext4_group_t g; 4279 4270 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4280 - int err; 4271 + int err = 0; 4281 4272 #ifdef CONFIG_QUOTA 4282 4273 int i; 4283 4274 #endif ··· 4403 4394 goto restore_opts; 4404 4395 if (!ext4_setup_super(sb, es, 0)) 4405 4396 sb->s_flags &= ~MS_RDONLY; 4397 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, 4398 + EXT4_FEATURE_INCOMPAT_MMP)) 4399 + if (ext4_multi_mount_protect(sb, 4400 + le64_to_cpu(es->s_mmp_block))) { 4401 + err = -EROFS; 4402 + goto restore_opts; 4403 + } 4406 4404 enable_quota = 1; 4407 4405 } 4408 4406 }