Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: add support for multiple mount protection

Prevent an ext4 filesystem from being mounted multiple times.
A sequence number is stored on disk and is periodically updated (every 5
seconds by default) by a mounted filesystem.
At mount time, we now wait for s_mmp_update_interval seconds to make sure
that the MMP sequence does not change.
In case of failure, the nodename, bdevname and the time at which the MMP
block was last updated are displayed.

Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Johann Lombardi <johann@whamcloud.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by

Johann Lombardi and committed by
Theodore Ts'o
c5e06d10 d02a9391

+444 -4
+2 -1
fs/ext4/Makefile
··· 6 6 7 7 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 8 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 9 - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o 9 + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 10 + mmp.o 10 11 11 12 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 12 13 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+74 -2
fs/ext4/ext4.h
··· 1028 1028 __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ 1029 1029 __le32 s_flags; /* Miscellaneous flags */ 1030 1030 __le16 s_raid_stride; /* RAID stride */ 1031 - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ 1031 + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ 1032 1032 __le64 s_mmp_block; /* Block for multi-mount protection */ 1033 1033 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1034 1034 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ ··· 1204 1204 struct ext4_li_request *s_li_request; 1205 1205 /* Wait multiplier for lazy initialization thread */ 1206 1206 unsigned int s_li_wait_mult; 1207 + 1208 + /* Kernel thread for multiple mount protection */ 1209 + struct task_struct *s_mmp_tsk; 1207 1210 }; 1208 1211 1209 1212 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) ··· 1378 1375 EXT4_FEATURE_INCOMPAT_META_BG| \ 1379 1376 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1380 1377 EXT4_FEATURE_INCOMPAT_64BIT| \ 1381 - EXT4_FEATURE_INCOMPAT_FLEX_BG) 1378 + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ 1379 + EXT4_FEATURE_INCOMPAT_MMP) 1382 1380 #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1383 1381 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1384 1382 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ ··· 1631 1627 }; 1632 1628 1633 1629 /* 1630 + * This structure will be used for multiple mount protection. It will be 1631 + * written into the block number saved in the s_mmp_block field in the 1632 + * superblock. Programs that check MMP should assume that if 1633 + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe 1634 + * to use the filesystem, regardless of how old the timestamp is. 
1635 + */ 1636 + #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ 1637 + #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ 1638 + #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ 1639 + #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ 1640 + 1641 + struct mmp_struct { 1642 + __le32 mmp_magic; /* Magic number for MMP */ 1643 + __le32 mmp_seq; /* Sequence no. updated periodically */ 1644 + 1645 + /* 1646 + * mmp_time, mmp_nodename & mmp_bdevname are only used for information 1647 + * purposes and do not affect the correctness of the algorithm 1648 + */ 1649 + __le64 mmp_time; /* Time last updated */ 1650 + char mmp_nodename[64]; /* Node which last updated MMP block */ 1651 + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ 1652 + 1653 + /* 1654 + * mmp_check_interval is used to verify if the MMP block has been 1655 + * updated on the block device. The value is updated based on the 1656 + * maximum time to write the MMP block during an update cycle. 1657 + */ 1658 + __le16 mmp_check_interval; 1659 + 1660 + __le16 mmp_pad1; 1661 + __le32 mmp_pad2[227]; 1662 + }; 1663 + 1664 + /* arguments passed to the mmp thread */ 1665 + struct mmpd_data { 1666 + struct buffer_head *bh; /* bh from initial read_mmp_block() */ 1667 + struct super_block *sb; /* super block of the fs */ 1668 + }; 1669 + 1670 + /* 1671 + * Check interval multiplier 1672 + * The MMP block is written every update interval and initially checked every 1673 + * update interval x the multiplier (the value is then adapted based on the 1674 + * write latency). The reason is that writes can be delayed under load and we 1675 + * don't want readers to incorrectly assume that the filesystem is no longer 1676 + * in use. 1677 + */ 1678 + #define EXT4_MMP_CHECK_MULT 2UL 1679 + 1680 + /* 1681 + * Minimum interval for MMP checking in seconds. 
1682 + */ 1683 + #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL 1684 + 1685 + /* 1686 + * Maximum interval for MMP checking in seconds. 1687 + */ 1688 + #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL 1689 + 1690 + /* 1634 1691 * Function prototypes 1635 1692 */ 1636 1693 ··· 1865 1800 __LINE__, ## message) 1866 1801 extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1867 1802 __attribute__ ((format (printf, 3, 4))); 1803 + extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, 1804 + const char *, unsigned int, const char *); 1805 + #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ 1806 + __LINE__, msg) 1868 1807 extern void __ext4_grp_locked_error(const char *, unsigned int, \ 1869 1808 struct super_block *, ext4_group_t, \ 1870 1809 unsigned long, ext4_fsblk_t, \ ··· 2172 2103 struct page *page, 2173 2104 int len, 2174 2105 struct writeback_control *wbc); 2106 + 2107 + /* mmp.c */ 2108 + extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 2175 2109 2176 2110 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2177 2111 enum ext4_state_bits {
+351
fs/ext4/mmp.c
··· 1 + #include <linux/fs.h> 2 + #include <linux/random.h> 3 + #include <linux/buffer_head.h> 4 + #include <linux/utsname.h> 5 + #include <linux/kthread.h> 6 + 7 + #include "ext4.h" 8 + 9 + /* 10 + * Write the MMP block using WRITE_SYNC to try to get the block on-disk 11 + * faster. 12 + */ 13 + static int write_mmp_block(struct buffer_head *bh) 14 + { 15 + mark_buffer_dirty(bh); 16 + lock_buffer(bh); 17 + bh->b_end_io = end_buffer_write_sync; 18 + get_bh(bh); 19 + submit_bh(WRITE_SYNC, bh); 20 + wait_on_buffer(bh); 21 + if (unlikely(!buffer_uptodate(bh))) 22 + return 1; 23 + 24 + return 0; 25 + } 26 + 27 + /* 28 + * Read the MMP block. It _must_ be read from disk and hence we clear the 29 + * uptodate flag on the buffer. 30 + */ 31 + static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, 32 + ext4_fsblk_t mmp_block) 33 + { 34 + struct mmp_struct *mmp; 35 + 36 + if (*bh) 37 + clear_buffer_uptodate(*bh); 38 + 39 + /* This would be sb_bread(sb, mmp_block), except we need to be sure 40 + * that the MD RAID device cache has been bypassed, and that the read 41 + * is not blocked in the elevator. */ 42 + if (!*bh) 43 + *bh = sb_getblk(sb, mmp_block); 44 + if (*bh) { 45 + get_bh(*bh); 46 + lock_buffer(*bh); 47 + (*bh)->b_end_io = end_buffer_read_sync; 48 + submit_bh(READ_SYNC, *bh); 49 + wait_on_buffer(*bh); 50 + if (!buffer_uptodate(*bh)) { 51 + brelse(*bh); 52 + *bh = NULL; 53 + } 54 + } 55 + if (!*bh) { 56 + ext4_warning(sb, "Error while reading MMP block %llu", 57 + mmp_block); 58 + return -EIO; 59 + } 60 + 61 + mmp = (struct mmp_struct *)((*bh)->b_data); 62 + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) 63 + return -EINVAL; 64 + 65 + return 0; 66 + } 67 + 68 + /* 69 + * Dump as much information as possible to help the admin. 
70 + */ 71 + void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, 72 + const char *function, unsigned int line, const char *msg) 73 + { 74 + __ext4_warning(sb, function, line, msg); 75 + __ext4_warning(sb, function, line, 76 + "MMP failure info: last update time: %llu, last update " 77 + "node: %s, last update device: %s\n", 78 + (long long unsigned int) le64_to_cpu(mmp->mmp_time), 79 + mmp->mmp_nodename, mmp->mmp_bdevname); 80 + } 81 + 82 + /* 83 + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds 84 + */ 85 + static int kmmpd(void *data) 86 + { 87 + struct super_block *sb = ((struct mmpd_data *) data)->sb; 88 + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; 89 + struct ext4_super_block *es = EXT4_SB(sb)->s_es; 90 + struct mmp_struct *mmp; 91 + ext4_fsblk_t mmp_block; 92 + u32 seq = 0; 93 + unsigned long failed_writes = 0; 94 + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); 95 + unsigned mmp_check_interval; 96 + unsigned long last_update_time; 97 + unsigned long diff; 98 + int retval; 99 + 100 + mmp_block = le64_to_cpu(es->s_mmp_block); 101 + mmp = (struct mmp_struct *)(bh->b_data); 102 + mmp->mmp_time = cpu_to_le64(get_seconds()); 103 + /* 104 + * Start with the higher mmp_check_interval and reduce it if 105 + * the MMP block is being updated on time. 106 + */ 107 + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, 108 + EXT4_MMP_MIN_CHECK_INTERVAL); 109 + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 110 + bdevname(bh->b_bdev, mmp->mmp_bdevname); 111 + 112 + memcpy(mmp->mmp_nodename, init_utsname()->sysname, 113 + sizeof(mmp->mmp_nodename)); 114 + 115 + while (!kthread_should_stop()) { 116 + if (++seq > EXT4_MMP_SEQ_MAX) 117 + seq = 1; 118 + 119 + mmp->mmp_seq = cpu_to_le32(seq); 120 + mmp->mmp_time = cpu_to_le64(get_seconds()); 121 + last_update_time = jiffies; 122 + 123 + retval = write_mmp_block(bh); 124 + /* 125 + * Don't spew too many error messages. 
Print one every 126 + * (s_mmp_update_interval * 60) seconds. 127 + */ 128 + if (retval && (failed_writes % 60) == 0) { 129 + ext4_error(sb, "Error writing to MMP block"); 130 + failed_writes++; 131 + } 132 + 133 + if (!(le32_to_cpu(es->s_feature_incompat) & 134 + EXT4_FEATURE_INCOMPAT_MMP)) { 135 + ext4_warning(sb, "kmmpd being stopped since MMP feature" 136 + " has been disabled."); 137 + EXT4_SB(sb)->s_mmp_tsk = NULL; 138 + goto failed; 139 + } 140 + 141 + if (sb->s_flags & MS_RDONLY) { 142 + ext4_warning(sb, "kmmpd being stopped since filesystem " 143 + "has been remounted as readonly."); 144 + EXT4_SB(sb)->s_mmp_tsk = NULL; 145 + goto failed; 146 + } 147 + 148 + diff = jiffies - last_update_time; 149 + if (diff < mmp_update_interval * HZ) 150 + schedule_timeout_interruptible(mmp_update_interval * 151 + HZ - diff); 152 + 153 + /* 154 + * We need to make sure that more than mmp_check_interval 155 + * seconds have not passed since writing. If that has happened 156 + * we need to check if the MMP block is as we left it. 157 + */ 158 + diff = jiffies - last_update_time; 159 + if (diff > mmp_check_interval * HZ) { 160 + struct buffer_head *bh_check = NULL; 161 + struct mmp_struct *mmp_check; 162 + 163 + retval = read_mmp_block(sb, &bh_check, mmp_block); 164 + if (retval) { 165 + ext4_error(sb, "error reading MMP data: %d", 166 + retval); 167 + 168 + EXT4_SB(sb)->s_mmp_tsk = NULL; 169 + goto failed; 170 + } 171 + 172 + mmp_check = (struct mmp_struct *)(bh_check->b_data); 173 + if (mmp->mmp_seq != mmp_check->mmp_seq || 174 + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, 175 + sizeof(mmp->mmp_nodename))) { 176 + dump_mmp_msg(sb, mmp_check, 177 + "Error while updating MMP info. " 178 + "The filesystem seems to have been" 179 + " multiply mounted."); 180 + ext4_error(sb, "abort"); 181 + goto failed; 182 + } 183 + put_bh(bh_check); 184 + } 185 + 186 + /* 187 + * Adjust the mmp_check_interval depending on how much time 188 + * it took for the MMP block to be written. 
189 + */ 190 + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, 191 + EXT4_MMP_MAX_CHECK_INTERVAL), 192 + EXT4_MMP_MIN_CHECK_INTERVAL); 193 + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); 194 + } 195 + 196 + /* 197 + * Unmount seems to be clean. 198 + */ 199 + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); 200 + mmp->mmp_time = cpu_to_le64(get_seconds()); 201 + 202 + retval = write_mmp_block(bh); 203 + 204 + failed: 205 + kfree(data); 206 + brelse(bh); 207 + return retval; 208 + } 209 + 210 + /* 211 + * Get a random new sequence number but make sure it is not greater than 212 + * EXT4_MMP_SEQ_MAX. 213 + */ 214 + static unsigned int mmp_new_seq(void) 215 + { 216 + u32 new_seq; 217 + 218 + do { 219 + get_random_bytes(&new_seq, sizeof(u32)); 220 + } while (new_seq > EXT4_MMP_SEQ_MAX); 221 + 222 + return new_seq; 223 + } 224 + 225 + /* 226 + * Protect the filesystem from being mounted more than once. 227 + */ 228 + int ext4_multi_mount_protect(struct super_block *sb, 229 + ext4_fsblk_t mmp_block) 230 + { 231 + struct ext4_super_block *es = EXT4_SB(sb)->s_es; 232 + struct buffer_head *bh = NULL; 233 + struct mmp_struct *mmp = NULL; 234 + struct mmpd_data *mmpd_data; 235 + u32 seq; 236 + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); 237 + unsigned int wait_time = 0; 238 + int retval; 239 + 240 + if (mmp_block < le32_to_cpu(es->s_first_data_block) || 241 + mmp_block >= ext4_blocks_count(es)) { 242 + ext4_warning(sb, "Invalid MMP block in superblock"); 243 + goto failed; 244 + } 245 + 246 + retval = read_mmp_block(sb, &bh, mmp_block); 247 + if (retval) 248 + goto failed; 249 + 250 + mmp = (struct mmp_struct *)(bh->b_data); 251 + 252 + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) 253 + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; 254 + 255 + /* 256 + * If check_interval in MMP block is larger, use that instead of 257 + * update_interval from the superblock. 
258 + */ 259 + if (mmp->mmp_check_interval > mmp_check_interval) 260 + mmp_check_interval = mmp->mmp_check_interval; 261 + 262 + seq = le32_to_cpu(mmp->mmp_seq); 263 + if (seq == EXT4_MMP_SEQ_CLEAN) 264 + goto skip; 265 + 266 + if (seq == EXT4_MMP_SEQ_FSCK) { 267 + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); 268 + goto failed; 269 + } 270 + 271 + wait_time = min(mmp_check_interval * 2 + 1, 272 + mmp_check_interval + 60); 273 + 274 + /* Print MMP interval if more than 20 secs. */ 275 + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) 276 + ext4_warning(sb, "MMP interval %u higher than expected, please" 277 + " wait.\n", wait_time * 2); 278 + 279 + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { 280 + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); 281 + goto failed; 282 + } 283 + 284 + retval = read_mmp_block(sb, &bh, mmp_block); 285 + if (retval) 286 + goto failed; 287 + mmp = (struct mmp_struct *)(bh->b_data); 288 + if (seq != le32_to_cpu(mmp->mmp_seq)) { 289 + dump_mmp_msg(sb, mmp, 290 + "Device is already active on another node."); 291 + goto failed; 292 + } 293 + 294 + skip: 295 + /* 296 + * write a new random sequence number. 297 + */ 298 + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); 299 + 300 + retval = write_mmp_block(bh); 301 + if (retval) 302 + goto failed; 303 + 304 + /* 305 + * wait for MMP interval and check mmp_seq. 
306 + */ 307 + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { 308 + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); 309 + goto failed; 310 + } 311 + 312 + retval = read_mmp_block(sb, &bh, mmp_block); 313 + if (retval) 314 + goto failed; 315 + mmp = (struct mmp_struct *)(bh->b_data); 316 + if (seq != le32_to_cpu(mmp->mmp_seq)) { 317 + dump_mmp_msg(sb, mmp, 318 + "Device is already active on another node."); 319 + goto failed; 320 + } 321 + 322 + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); 323 + if (!mmpd_data) { 324 + ext4_warning(sb, "not enough memory for mmpd_data"); 325 + goto failed; 326 + } 327 + mmpd_data->sb = sb; 328 + mmpd_data->bh = bh; 329 + 330 + /* 331 + * Start a kernel thread to update the MMP block periodically. 332 + */ 333 + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", 334 + bdevname(bh->b_bdev, 335 + mmp->mmp_bdevname)); 336 + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { 337 + EXT4_SB(sb)->s_mmp_tsk = NULL; 338 + kfree(mmpd_data); 339 + ext4_warning(sb, "Unable to create kmmpd thread for %s.", 340 + sb->s_id); 341 + goto failed; 342 + } 343 + 344 + return 0; 345 + 346 + failed: 347 + brelse(bh); 348 + return 1; 349 + } 350 + 351 +
+17 -1
fs/ext4/super.c
··· 822 822 invalidate_bdev(sbi->journal_bdev); 823 823 ext4_blkdev_remove(sbi); 824 824 } 825 + if (sbi->s_mmp_tsk) 826 + kthread_stop(sbi->s_mmp_tsk); 825 827 sb->s_fs_info = NULL; 826 828 /* 827 829 * Now that we are completely done shutting down the ··· 3488 3486 EXT4_HAS_INCOMPAT_FEATURE(sb, 3489 3487 EXT4_FEATURE_INCOMPAT_RECOVER)); 3490 3488 3489 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && 3490 + !(sb->s_flags & MS_RDONLY)) 3491 + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) 3492 + goto failed_mount3; 3493 + 3491 3494 /* 3492 3495 * The first inode we look at is the journal inode. Don't try 3493 3496 * root first: it may be modified in the journal! ··· 3740 3733 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3741 3734 percpu_counter_destroy(&sbi->s_dirs_counter); 3742 3735 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3736 + if (sbi->s_mmp_tsk) 3737 + kthread_stop(sbi->s_mmp_tsk); 3743 3738 failed_mount2: 3744 3739 for (i = 0; i < db_count; i++) 3745 3740 brelse(sbi->s_group_desc[i]); ··· 4277 4268 int enable_quota = 0; 4278 4269 ext4_group_t g; 4279 4270 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4280 - int err; 4271 + int err = 0; 4281 4272 #ifdef CONFIG_QUOTA 4282 4273 int i; 4283 4274 #endif ··· 4403 4394 goto restore_opts; 4404 4395 if (!ext4_setup_super(sb, es, 0)) 4405 4396 sb->s_flags &= ~MS_RDONLY; 4397 + if (EXT4_HAS_INCOMPAT_FEATURE(sb, 4398 + EXT4_FEATURE_INCOMPAT_MMP)) 4399 + if (ext4_multi_mount_protect(sb, 4400 + le64_to_cpu(es->s_mmp_block))) { 4401 + err = -EROFS; 4402 + goto restore_opts; 4403 + } 4406 4404 enable_quota = 1; 4407 4405 } 4408 4406 }