fs/ext4/super.c at v5.14-rc5

tjh.dev / kernel
fork atom
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork atom
kernel / fs / ext4 / super.c
at v5.14-rc5 6808 lines 197 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/ext4/super.c
   4 *
   5 * Copyright (C) 1992, 1993, 1994, 1995
   6 * Remy Card (card@masi.ibp.fr)
   7 * Laboratoire MASI - Institut Blaise Pascal
   8 * Universite Pierre et Marie Curie (Paris VI)
   9 *
  10 *  from
  11 *
  12 *  linux/fs/minix/inode.c
  13 *
  14 *  Copyright (C) 1991, 1992  Linus Torvalds
  15 *
  16 *  Big-endian to little-endian byte-swapping/bitmaps by
  17 *        David S. Miller (davem@caip.rutgers.edu), 1995
  18 */
  19
  20#include <linux/module.h>
  21#include <linux/string.h>
  22#include <linux/fs.h>
  23#include <linux/time.h>
  24#include <linux/vmalloc.h>
  25#include <linux/slab.h>
  26#include <linux/init.h>
  27#include <linux/blkdev.h>
  28#include <linux/backing-dev.h>
  29#include <linux/parser.h>
  30#include <linux/buffer_head.h>
  31#include <linux/exportfs.h>
  32#include <linux/vfs.h>
  33#include <linux/random.h>
  34#include <linux/mount.h>
  35#include <linux/namei.h>
  36#include <linux/quotaops.h>
  37#include <linux/seq_file.h>
  38#include <linux/ctype.h>
  39#include <linux/log2.h>
  40#include <linux/crc16.h>
  41#include <linux/dax.h>
  42#include <linux/cleancache.h>
  43#include <linux/uaccess.h>
  44#include <linux/iversion.h>
  45#include <linux/unicode.h>
  46#include <linux/part_stat.h>
  47#include <linux/kthread.h>
  48#include <linux/freezer.h>
  49
  50#include "ext4.h"
  51#include "ext4_extents.h"	/* Needed for trace points definition */
  52#include "ext4_jbd2.h"
  53#include "xattr.h"
  54#include "acl.h"
  55#include "mballoc.h"
  56#include "fsmap.h"
  57
  58#define CREATE_TRACE_POINTS
  59#include <trace/events/ext4.h>
  60
/*
 * File-scope state for lazy initialization and mount-message rate limiting.
 * NOTE(review): presumably ext4_li_mtx serializes access to ext4_li_info --
 * confirm against the users of these variables further down the file.
 */
   61static struct ext4_lazy_init *ext4_li_info;
   62static DEFINE_MUTEX(ext4_li_mtx);
   63static struct ratelimit_state ext4_mount_msg_ratelimit;
  64
  65static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
  66			     unsigned long journal_devnum);
  67static int ext4_show_options(struct seq_file *seq, struct dentry *root);
  68static void ext4_update_super(struct super_block *sb);
  69static int ext4_commit_super(struct super_block *sb);
  70static int ext4_mark_recovery_complete(struct super_block *sb,
  71					struct ext4_super_block *es);
  72static int ext4_clear_journal_err(struct super_block *sb,
  73				  struct ext4_super_block *es);
  74static int ext4_sync_fs(struct super_block *sb, int wait);
  75static int ext4_remount(struct super_block *sb, int *flags, char *data);
  76static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
  77static int ext4_unfreeze(struct super_block *sb);
  78static int ext4_freeze(struct super_block *sb);
  79static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
  80		       const char *dev_name, void *data);
  81static inline int ext2_feature_set_ok(struct super_block *sb);
  82static inline int ext3_feature_set_ok(struct super_block *sb);
  83static int ext4_feature_set_ok(struct super_block *sb, int readonly);
  84static void ext4_destroy_lazyinit_thread(void);
  85static void ext4_unregister_li_request(struct super_block *sb);
  86static void ext4_clear_request_list(void);
  87static struct inode *ext4_get_journal_inode(struct super_block *sb,
  88					    unsigned int journal_inum);
  89
  90/*
  91 * Lock ordering
  92 *
  93 * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
  94 * i_mmap_rwsem (inode->i_mmap_rwsem)!
  95 *
  96 * page fault path:
  97 * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
  98 *   page lock -> i_data_sem (rw)
  99 *
 100 * buffered write path:
 101 * sb_start_write -> i_mutex -> mmap_lock
 102 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 103 *   i_data_sem (rw)
 104 *
 105 * truncate:
 106 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
 107 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
 108 *   i_data_sem (rw)
 109 *
 110 * direct IO:
 111 * sb_start_write -> i_mutex -> mmap_lock
 112 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 113 *
 114 * writepages:
 115 * transaction start -> page lock(s) -> i_data_sem (rw)
 116 */
 117
/*
 * When neither a built-in nor a modular ext2 driver is configured and
 * CONFIG_EXT4_USE_FOR_EXT2 is set, define an "ext2" file_system_type whose
 * mount entry point is ext4_mount().
 */
  118#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
  119static struct file_system_type ext2_fs_type = {
  120	.owner		= THIS_MODULE,
  121	.name		= "ext2",
  122	.mount		= ext4_mount,
  123	.kill_sb	= kill_block_super,
  124	.fs_flags	= FS_REQUIRES_DEV,
  125};
  126MODULE_ALIAS_FS("ext2");
  127MODULE_ALIAS("ext2");
/* True when the superblock's block device holder is ext2_fs_type. */
  128#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
  129#else
/* No ext2 file_system_type is defined here, so this is constant false. */
  130#define IS_EXT2_SB(sb) (0)
  131#endif
 132
 133
/* "ext3" file_system_type; its mount entry point is ext4_mount(). */
  134static struct file_system_type ext3_fs_type = {
  135	.owner		= THIS_MODULE,
  136	.name		= "ext3",
  137	.mount		= ext4_mount,
  138	.kill_sb	= kill_block_super,
  139	.fs_flags	= FS_REQUIRES_DEV,
  140};
  141MODULE_ALIAS_FS("ext3");
  142MODULE_ALIAS("ext3");
/* True when the superblock's block device holder is ext3_fs_type. */
  143#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 144
 145
 146static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
 147				  bh_end_io_t *end_io)
 148{
 149	/*
 150	 * buffer's verified bit is no longer valid after reading from
 151	 * disk again due to write out error, clear it to make sure we
 152	 * recheck the buffer contents.
 153	 */
 154	clear_buffer_verified(bh);
 155
 156	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
 157	get_bh(bh);
 158	submit_bh(REQ_OP_READ, op_flags, bh);
 159}
 160
 161void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
 162			 bh_end_io_t *end_io)
 163{
 164	BUG_ON(!buffer_locked(bh));
 165
 166	if (ext4_buffer_uptodate(bh)) {
 167		unlock_buffer(bh);
 168		return;
 169	}
 170	__ext4_read_bh(bh, op_flags, end_io);
 171}
 172
 173int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
 174{
 175	BUG_ON(!buffer_locked(bh));
 176
 177	if (ext4_buffer_uptodate(bh)) {
 178		unlock_buffer(bh);
 179		return 0;
 180	}
 181
 182	__ext4_read_bh(bh, op_flags, end_io);
 183
 184	wait_on_buffer(bh);
 185	if (buffer_uptodate(bh))
 186		return 0;
 187	return -EIO;
 188}
 189
 190int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
 191{
 192	if (trylock_buffer(bh)) {
 193		if (wait)
 194			return ext4_read_bh(bh, op_flags, NULL);
 195		ext4_read_bh_nowait(bh, op_flags, NULL);
 196		return 0;
 197	}
 198	if (wait) {
 199		wait_on_buffer(bh);
 200		if (buffer_uptodate(bh))
 201			return 0;
 202		return -EIO;
 203	}
 204	return 0;
 205}
 206
 207/*
 208 * This works like __bread_gfp() except it uses ERR_PTR for error
 209 * returns.  Currently with sb_bread it's impossible to distinguish
 210 * between ENOMEM and EIO situations (since both result in a NULL
  211 * return).
 212 */
 213static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
 214					       sector_t block, int op_flags,
 215					       gfp_t gfp)
 216{
 217	struct buffer_head *bh;
 218	int ret;
 219
 220	bh = sb_getblk_gfp(sb, block, gfp);
 221	if (bh == NULL)
 222		return ERR_PTR(-ENOMEM);
 223	if (ext4_buffer_uptodate(bh))
 224		return bh;
 225
 226	ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
 227	if (ret) {
 228		put_bh(bh);
 229		return ERR_PTR(ret);
 230	}
 231	return bh;
 232}
 233
 234struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
 235				   int op_flags)
 236{
 237	return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
 238}
 239
 240struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
 241					    sector_t block)
 242{
 243	return __ext4_sb_bread_gfp(sb, block, 0, 0);
 244}
 245
 246void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
 247{
 248	struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
 249
 250	if (likely(bh)) {
 251		ext4_read_bh_lock(bh, REQ_RAHEAD, false);
 252		brelse(bh);
 253	}
 254}
 255
 256static int ext4_verify_csum_type(struct super_block *sb,
 257				 struct ext4_super_block *es)
 258{
 259	if (!ext4_has_feature_metadata_csum(sb))
 260		return 1;
 261
 262	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
 263}
 264
 265static __le32 ext4_superblock_csum(struct super_block *sb,
 266				   struct ext4_super_block *es)
 267{
 268	struct ext4_sb_info *sbi = EXT4_SB(sb);
 269	int offset = offsetof(struct ext4_super_block, s_checksum);
 270	__u32 csum;
 271
 272	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
 273
 274	return cpu_to_le32(csum);
 275}
 276
 277static int ext4_superblock_csum_verify(struct super_block *sb,
 278				       struct ext4_super_block *es)
 279{
 280	if (!ext4_has_metadata_csum(sb))
 281		return 1;
 282
 283	return es->s_checksum == ext4_superblock_csum(sb, es);
 284}
 285
 286void ext4_superblock_csum_set(struct super_block *sb)
 287{
 288	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 289
 290	if (!ext4_has_metadata_csum(sb))
 291		return;
 292
 293	es->s_checksum = ext4_superblock_csum(sb, es);
 294}
 295
 296ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 297			       struct ext4_group_desc *bg)
 298{
 299	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 300		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 301		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 302}
 303
 304ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 305			       struct ext4_group_desc *bg)
 306{
 307	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 308		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 309		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 310}
 311
 312ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 313			      struct ext4_group_desc *bg)
 314{
 315	return le32_to_cpu(bg->bg_inode_table_lo) |
 316		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 317		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 318}
 319
 320__u32 ext4_free_group_clusters(struct super_block *sb,
 321			       struct ext4_group_desc *bg)
 322{
 323	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 324		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 325		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 326}
 327
 328__u32 ext4_free_inodes_count(struct super_block *sb,
 329			      struct ext4_group_desc *bg)
 330{
 331	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 332		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 333		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 334}
 335
 336__u32 ext4_used_dirs_count(struct super_block *sb,
 337			      struct ext4_group_desc *bg)
 338{
 339	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 340		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 341		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 342}
 343
 344__u32 ext4_itable_unused_count(struct super_block *sb,
 345			      struct ext4_group_desc *bg)
 346{
 347	return le16_to_cpu(bg->bg_itable_unused_lo) |
 348		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 349		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 350}
 351
 352void ext4_block_bitmap_set(struct super_block *sb,
 353			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 354{
 355	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
 356	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 357		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
 358}
 359
 360void ext4_inode_bitmap_set(struct super_block *sb,
 361			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 362{
 363	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
 364	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 365		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
 366}
 367
 368void ext4_inode_table_set(struct super_block *sb,
 369			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
 370{
 371	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
 372	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 373		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 374}
 375
 376void ext4_free_group_clusters_set(struct super_block *sb,
 377				  struct ext4_group_desc *bg, __u32 count)
 378{
 379	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
 380	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 381		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
 382}
 383
 384void ext4_free_inodes_set(struct super_block *sb,
 385			  struct ext4_group_desc *bg, __u32 count)
 386{
 387	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
 388	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 389		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
 390}
 391
 392void ext4_used_dirs_set(struct super_block *sb,
 393			  struct ext4_group_desc *bg, __u32 count)
 394{
 395	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
 396	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 397		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
 398}
 399
 400void ext4_itable_unused_set(struct super_block *sb,
 401			  struct ext4_group_desc *bg, __u32 count)
 402{
 403	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
 404	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 405		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 406}
 407
 408static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
 409{
 410	now = clamp_val(now, 0, (1ull << 40) - 1);
 411
 412	*lo = cpu_to_le32(lower_32_bits(now));
 413	*hi = upper_32_bits(now);
 414}
 415
 416static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 417{
 418	return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 419}
/*
 * ext4_update_tstamp(es, field): store the current wall-clock time
 * (ktime_get_real_seconds()) into the field pair es->field / es->field_hi.
 */
  420#define ext4_update_tstamp(es, tstamp) \
  421	__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
  422			     ktime_get_real_seconds())
/*
 * ext4_get_tstamp(es, field): read the timestamp held in the field pair
 * es->field / es->field_hi.
 */
  423#define ext4_get_tstamp(es, tstamp) \
  424	__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 425
 426/*
 427 * The del_gendisk() function uninitializes the disk-specific data
 428 * structures, including the bdi structure, without telling anyone
 429 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 430 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 431 * This is a kludge to prevent these oops until we can put in a proper
 432 * hook in del_gendisk() to inform the VFS and file system layers.
 433 */
 434static int block_device_ejected(struct super_block *sb)
 435{
 436	struct inode *bd_inode = sb->s_bdev->bd_inode;
 437	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
 438
 439	return bdi->dev == NULL;
 440}
 441
 442static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 443{
 444	struct super_block		*sb = journal->j_private;
 445	struct ext4_sb_info		*sbi = EXT4_SB(sb);
 446	int				error = is_journal_aborted(journal);
 447	struct ext4_journal_cb_entry	*jce;
 448
 449	BUG_ON(txn->t_state == T_FINISHED);
 450
 451	ext4_process_freed_data(sb, txn->t_tid);
 452
 453	spin_lock(&sbi->s_md_lock);
 454	while (!list_empty(&txn->t_private_list)) {
 455		jce = list_entry(txn->t_private_list.next,
 456				 struct ext4_journal_cb_entry, jce_list);
 457		list_del_init(&jce->jce_list);
 458		spin_unlock(&sbi->s_md_lock);
 459		jce->jce_func(sb, jce, error);
 460		spin_lock(&sbi->s_md_lock);
 461	}
 462	spin_unlock(&sbi->s_md_lock);
 463}
 464
 465/*
 466 * This writepage callback for write_cache_pages()
 467 * takes care of a few cases after page cleaning.
 468 *
 469 * write_cache_pages() already checks for dirty pages
 470 * and calls clear_page_dirty_for_io(), which we want,
 471 * to write protect the pages.
 472 *
 473 * However, we may have to redirty a page (see below.)
 474 */
 475static int ext4_journalled_writepage_callback(struct page *page,
 476					      struct writeback_control *wbc,
 477					      void *data)
 478{
 479	transaction_t *transaction = (transaction_t *) data;
 480	struct buffer_head *bh, *head;
 481	struct journal_head *jh;
 482
 483	bh = head = page_buffers(page);
 484	do {
 485		/*
 486		 * We have to redirty a page in these cases:
 487		 * 1) If buffer is dirty, it means the page was dirty because it
 488		 * contains a buffer that needs checkpointing. So the dirty bit
 489		 * needs to be preserved so that checkpointing writes the buffer
 490		 * properly.
 491		 * 2) If buffer is not part of the committing transaction
 492		 * (we may have just accidentally come across this buffer because
 493		 * inode range tracking is not exact) or if the currently running
 494		 * transaction already contains this buffer as well, dirty bit
 495		 * needs to be preserved so that the buffer gets writeprotected
 496		 * properly on running transaction's commit.
 497		 */
 498		jh = bh2jh(bh);
 499		if (buffer_dirty(bh) ||
 500		    (jh && (jh->b_transaction != transaction ||
 501			    jh->b_next_transaction))) {
 502			redirty_page_for_writepage(wbc, page);
 503			goto out;
 504		}
 505	} while ((bh = bh->b_this_page) != head);
 506
 507out:
 508	return AOP_WRITEPAGE_ACTIVATE;
 509}
 510
 511static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
 512{
 513	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
 514	struct writeback_control wbc = {
 515		.sync_mode =  WB_SYNC_ALL,
 516		.nr_to_write = LONG_MAX,
 517		.range_start = jinode->i_dirty_start,
 518		.range_end = jinode->i_dirty_end,
 519        };
 520
 521	return write_cache_pages(mapping, &wbc,
 522				 ext4_journalled_writepage_callback,
 523				 jinode->i_transaction);
 524}
 525
 526static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
 527{
 528	int ret;
 529
 530	if (ext4_should_journal_data(jinode->i_vfs_inode))
 531		ret = ext4_journalled_submit_inode_data_buffers(jinode);
 532	else
 533		ret = jbd2_journal_submit_inode_data_buffers(jinode);
 534
 535	return ret;
 536}
 537
 538static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
 539{
 540	int ret = 0;
 541
 542	if (!ext4_should_journal_data(jinode->i_vfs_inode))
 543		ret = jbd2_journal_finish_inode_data_buffers(jinode);
 544
 545	return ret;
 546}
 547
 548static bool system_going_down(void)
 549{
 550	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
 551		|| system_state == SYSTEM_RESTART;
 552}
 553
/* One row of the errno <-> EXT4_ERR_* code translation table. */
  554struct ext4_err_translation {
  555	int code;	/* EXT4_ERR_* value */
  556	int errno;	/* matching errno value */
  557};
 558
/* Build a table row pairing EXT4_ERR_<err> with the errno constant <err>. */
  559#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
  560
/*
 * errnos that have a dedicated EXT4_ERR_* code; ext4_errno_to_code() maps
 * anything not listed here to EXT4_ERR_UNKNOWN.
 */
  561static struct ext4_err_translation err_translation[] = {
  562	EXT4_ERR_TRANSLATE(EIO),
  563	EXT4_ERR_TRANSLATE(ENOMEM),
  564	EXT4_ERR_TRANSLATE(EFSBADCRC),
  565	EXT4_ERR_TRANSLATE(EFSCORRUPTED),
  566	EXT4_ERR_TRANSLATE(ENOSPC),
  567	EXT4_ERR_TRANSLATE(ENOKEY),
  568	EXT4_ERR_TRANSLATE(EROFS),
  569	EXT4_ERR_TRANSLATE(EFBIG),
  570	EXT4_ERR_TRANSLATE(EEXIST),
  571	EXT4_ERR_TRANSLATE(ERANGE),
  572	EXT4_ERR_TRANSLATE(EOVERFLOW),
  573	EXT4_ERR_TRANSLATE(EBUSY),
  574	EXT4_ERR_TRANSLATE(ENOTDIR),
  575	EXT4_ERR_TRANSLATE(ENOTEMPTY),
  576	EXT4_ERR_TRANSLATE(ESHUTDOWN),
  577	EXT4_ERR_TRANSLATE(EFAULT),
  578};
 579
 580static int ext4_errno_to_code(int errno)
 581{
 582	int i;
 583
 584	for (i = 0; i < ARRAY_SIZE(err_translation); i++)
 585		if (err_translation[i].errno == errno)
 586			return err_translation[i].code;
 587	return EXT4_ERR_UNKNOWN;
 588}
 589
 590static void save_error_info(struct super_block *sb, int error,
 591			    __u32 ino, __u64 block,
 592			    const char *func, unsigned int line)
 593{
 594	struct ext4_sb_info *sbi = EXT4_SB(sb);
 595
 596	/* We default to EFSCORRUPTED error... */
 597	if (error == 0)
 598		error = EFSCORRUPTED;
 599
 600	spin_lock(&sbi->s_error_lock);
 601	sbi->s_add_error_count++;
 602	sbi->s_last_error_code = error;
 603	sbi->s_last_error_line = line;
 604	sbi->s_last_error_ino = ino;
 605	sbi->s_last_error_block = block;
 606	sbi->s_last_error_func = func;
 607	sbi->s_last_error_time = ktime_get_real_seconds();
 608	if (!sbi->s_first_error_time) {
 609		sbi->s_first_error_code = error;
 610		sbi->s_first_error_line = line;
 611		sbi->s_first_error_ino = ino;
 612		sbi->s_first_error_block = block;
 613		sbi->s_first_error_func = func;
 614		sbi->s_first_error_time = sbi->s_last_error_time;
 615	}
 616	spin_unlock(&sbi->s_error_lock);
 617}
 618
 619/* Deal with the reporting of failure conditions on a filesystem such as
 620 * inconsistencies detected or read IO failures.
 621 *
 622 * On ext2, we can store the error state of the filesystem in the
 623 * superblock.  That is not possible on ext4, because we may have other
 624 * write ordering constraints on the superblock which prevent us from
 625 * writing it out straight away; and given that the journal is about to
 626 * be aborted, we can't rely on the current, or future, transactions to
 627 * write out the superblock safely.
 628 *
 629 * We'll just use the jbd2_journal_abort() error code to record an error in
 630 * the journal instead.  On recovery, the journal will complain about
 631 * that error until we've noted it down and cleared it.
 632 *
 633 * If force_ro is set, we unconditionally force the filesystem into an
 634 * ABORT|READONLY state, unless the error response on the fs has been set to
 635 * panic in which case we take the easy way out and panic immediately. This is
 636 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
 637 * at a critical moment in log management.
 638 */
 639static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
 640			      __u32 ino, __u64 block,
 641			      const char *func, unsigned int line)
 642{
 643	journal_t *journal = EXT4_SB(sb)->s_journal;
 644	bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
 645
 646	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 647	if (test_opt(sb, WARN_ON_ERROR))
 648		WARN_ON_ONCE(1);
 649
 650	if (!continue_fs && !sb_rdonly(sb)) {
 651		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
 652		if (journal)
 653			jbd2_journal_abort(journal, -EIO);
 654	}
 655
 656	if (!bdev_read_only(sb->s_bdev)) {
 657		save_error_info(sb, error, ino, block, func, line);
 658		/*
 659		 * In case the fs should keep running, we need to writeout
 660		 * superblock through the journal. Due to lock ordering
 661		 * constraints, it may not be safe to do it right here so we
 662		 * defer superblock flushing to a workqueue.
 663		 */
 664		if (continue_fs)
 665			schedule_work(&EXT4_SB(sb)->s_error_work);
 666		else
 667			ext4_commit_super(sb);
 668	}
 669
 670	/*
 671	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
 672	 * could panic during 'reboot -f' as the underlying device got already
 673	 * disabled.
 674	 */
 675	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 676		panic("EXT4-fs (device %s): panic forced after error\n",
 677			sb->s_id);
 678	}
 679
 680	if (sb_rdonly(sb) || continue_fs)
 681		return;
 682
 683	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 684	/*
 685	 * Make sure updated value of ->s_mount_flags will be visible before
 686	 * ->s_flags update
 687	 */
 688	smp_wmb();
 689	sb->s_flags |= SB_RDONLY;
 690}
 691
 692static void flush_stashed_error_work(struct work_struct *work)
 693{
 694	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
 695						s_error_work);
 696	journal_t *journal = sbi->s_journal;
 697	handle_t *handle;
 698
 699	/*
 700	 * If the journal is still running, we have to write out superblock
 701	 * through the journal to avoid collisions of other journalled sb
 702	 * updates.
 703	 *
 704	 * We use directly jbd2 functions here to avoid recursing back into
 705	 * ext4 error handling code during handling of previous errors.
 706	 */
 707	if (!sb_rdonly(sbi->s_sb) && journal) {
 708		struct buffer_head *sbh = sbi->s_sbh;
 709		handle = jbd2_journal_start(journal, 1);
 710		if (IS_ERR(handle))
 711			goto write_directly;
 712		if (jbd2_journal_get_write_access(handle, sbh)) {
 713			jbd2_journal_stop(handle);
 714			goto write_directly;
 715		}
 716		ext4_update_super(sbi->s_sb);
 717		if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
 718			ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
 719				 "superblock detected");
 720			clear_buffer_write_io_error(sbh);
 721			set_buffer_uptodate(sbh);
 722		}
 723
 724		if (jbd2_journal_dirty_metadata(handle, sbh)) {
 725			jbd2_journal_stop(handle);
 726			goto write_directly;
 727		}
 728		jbd2_journal_stop(handle);
 729		ext4_notify_error_sysfs(sbi);
 730		return;
 731	}
 732write_directly:
 733	/*
 734	 * Write through journal failed. Write sb directly to get error info
 735	 * out and hope for the best.
 736	 */
 737	ext4_commit_super(sbi->s_sb);
 738	ext4_notify_error_sysfs(sbi);
 739}
 740
/*
 * Evaluates ___ratelimit() on the superblock's s_err_ratelimit_state; the
 * error-reporting functions in this file print their message only when the
 * result is non-zero.
 */
  741#define ext4_error_ratelimit(sb)					\
  742		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
  743			     "EXT4-fs error")
 744
 745void __ext4_error(struct super_block *sb, const char *function,
 746		  unsigned int line, bool force_ro, int error, __u64 block,
 747		  const char *fmt, ...)
 748{
 749	struct va_format vaf;
 750	va_list args;
 751
 752	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 753		return;
 754
 755	trace_ext4_error(sb, function, line);
 756	if (ext4_error_ratelimit(sb)) {
 757		va_start(args, fmt);
 758		vaf.fmt = fmt;
 759		vaf.va = &args;
 760		printk(KERN_CRIT
 761		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
 762		       sb->s_id, function, line, current->comm, &vaf);
 763		va_end(args);
 764	}
 765	ext4_handle_error(sb, force_ro, error, 0, block, function, line);
 766}
 767
 768void __ext4_error_inode(struct inode *inode, const char *function,
 769			unsigned int line, ext4_fsblk_t block, int error,
 770			const char *fmt, ...)
 771{
 772	va_list args;
 773	struct va_format vaf;
 774
 775	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 776		return;
 777
 778	trace_ext4_error(inode->i_sb, function, line);
 779	if (ext4_error_ratelimit(inode->i_sb)) {
 780		va_start(args, fmt);
 781		vaf.fmt = fmt;
 782		vaf.va = &args;
 783		if (block)
 784			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 785			       "inode #%lu: block %llu: comm %s: %pV\n",
 786			       inode->i_sb->s_id, function, line, inode->i_ino,
 787			       block, current->comm, &vaf);
 788		else
 789			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 790			       "inode #%lu: comm %s: %pV\n",
 791			       inode->i_sb->s_id, function, line, inode->i_ino,
 792			       current->comm, &vaf);
 793		va_end(args);
 794	}
 795	ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
 796			  function, line);
 797}
 798
 799void __ext4_error_file(struct file *file, const char *function,
 800		       unsigned int line, ext4_fsblk_t block,
 801		       const char *fmt, ...)
 802{
 803	va_list args;
 804	struct va_format vaf;
 805	struct inode *inode = file_inode(file);
 806	char pathname[80], *path;
 807
 808	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 809		return;
 810
 811	trace_ext4_error(inode->i_sb, function, line);
 812	if (ext4_error_ratelimit(inode->i_sb)) {
 813		path = file_path(file, pathname, sizeof(pathname));
 814		if (IS_ERR(path))
 815			path = "(unknown)";
 816		va_start(args, fmt);
 817		vaf.fmt = fmt;
 818		vaf.va = &args;
 819		if (block)
 820			printk(KERN_CRIT
 821			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 822			       "block %llu: comm %s: path %s: %pV\n",
 823			       inode->i_sb->s_id, function, line, inode->i_ino,
 824			       block, current->comm, path, &vaf);
 825		else
 826			printk(KERN_CRIT
 827			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 828			       "comm %s: path %s: %pV\n",
 829			       inode->i_sb->s_id, function, line, inode->i_ino,
 830			       current->comm, path, &vaf);
 831		va_end(args);
 832	}
 833	ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
 834			  function, line);
 835}
 836
 837const char *ext4_decode_error(struct super_block *sb, int errno,
 838			      char nbuf[16])
 839{
 840	char *errstr = NULL;
 841
 842	switch (errno) {
 843	case -EFSCORRUPTED:
 844		errstr = "Corrupt filesystem";
 845		break;
 846	case -EFSBADCRC:
 847		errstr = "Filesystem failed CRC";
 848		break;
 849	case -EIO:
 850		errstr = "IO failure";
 851		break;
 852	case -ENOMEM:
 853		errstr = "Out of memory";
 854		break;
 855	case -EROFS:
 856		if (!sb || (EXT4_SB(sb)->s_journal &&
 857			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 858			errstr = "Journal has aborted";
 859		else
 860			errstr = "Readonly filesystem";
 861		break;
 862	default:
 863		/* If the caller passed in an extra buffer for unknown
 864		 * errors, textualise them now.  Else we just return
 865		 * NULL. */
 866		if (nbuf) {
 867			/* Check for truncated error codes... */
 868			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
 869				errstr = nbuf;
 870		}
 871		break;
 872	}
 873
 874	return errstr;
 875}
 876
 877/* __ext4_std_error decodes expected errors from journaling functions
 878 * automatically and invokes the appropriate error response.  */
 879
 880void __ext4_std_error(struct super_block *sb, const char *function,
 881		      unsigned int line, int errno)
 882{
 883	char nbuf[16];
 884	const char *errstr;
 885
 886	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 887		return;
 888
 889	/* Special case: if the error is EROFS, and we're not already
 890	 * inside a transaction, then there's really no point in logging
 891	 * an error. */
 892	if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
 893		return;
 894
 895	if (ext4_error_ratelimit(sb)) {
 896		errstr = ext4_decode_error(sb, errno, nbuf);
 897		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
 898		       sb->s_id, function, line, errstr);
 899	}
 900
 901	ext4_handle_error(sb, false, -errno, 0, 0, function, line);
 902}
 903
 904void __ext4_msg(struct super_block *sb,
 905		const char *prefix, const char *fmt, ...)
 906{
 907	struct va_format vaf;
 908	va_list args;
 909
 910	atomic_inc(&EXT4_SB(sb)->s_msg_count);
 911	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
 912		return;
 913
 914	va_start(args, fmt);
 915	vaf.fmt = fmt;
 916	vaf.va = &args;
 917	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 918	va_end(args);
 919}
 920
 921static int ext4_warning_ratelimit(struct super_block *sb)
 922{
 923	atomic_inc(&EXT4_SB(sb)->s_warning_count);
 924	return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 925			    "EXT4-fs warning");
 926}
 927
 928void __ext4_warning(struct super_block *sb, const char *function,
 929		    unsigned int line, const char *fmt, ...)
 930{
 931	struct va_format vaf;
 932	va_list args;
 933
 934	if (!ext4_warning_ratelimit(sb))
 935		return;
 936
 937	va_start(args, fmt);
 938	vaf.fmt = fmt;
 939	vaf.va = &args;
 940	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
 941	       sb->s_id, function, line, &vaf);
 942	va_end(args);
 943}
 944
 945void __ext4_warning_inode(const struct inode *inode, const char *function,
 946			  unsigned int line, const char *fmt, ...)
 947{
 948	struct va_format vaf;
 949	va_list args;
 950
 951	if (!ext4_warning_ratelimit(inode->i_sb))
 952		return;
 953
 954	va_start(args, fmt);
 955	vaf.fmt = fmt;
 956	vaf.va = &args;
 957	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 958	       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 959	       function, line, inode->i_ino, current->comm, &vaf);
 960	va_end(args);
 961}
 962
/*
 * Report a filesystem error on behalf of a caller that holds the group
 * lock of @grp (see the __releases/__acquires annotations below).
 *
 * Nothing is done if the filesystem has been forcibly shut down.  The
 * message is rate-limited; @ino and @block are included only when
 * non-zero.  With errors=continue the error is recorded without ever
 * dropping the group lock; otherwise the lock is dropped around
 * ext4_handle_error() and re-taken before returning.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
			     struct super_block *sb, ext4_group_t grp,
			     unsigned long ino, ext4_fsblk_t block,
			     const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
	struct va_format vaf;
	va_list args;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
		return;

	trace_ext4_error(sb, function, line);
	if (ext4_error_ratelimit(sb)) {
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* The message is assembled from several KERN_CONT pieces. */
		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
		       sb->s_id, function, line, grp);
		if (ino)
			printk(KERN_CONT "inode %lu: ", ino);
		if (block)
			printk(KERN_CONT "block %llu:",
			       (unsigned long long) block);
		printk(KERN_CONT "%pV\n", &vaf);
		va_end(args);
	}

	if (test_opt(sb, ERRORS_CONT)) {
		if (test_opt(sb, WARN_ON_ERROR))
			WARN_ON_ONCE(1);
		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
		/* Only persist the error details if the device is writable. */
		if (!bdev_read_only(sb->s_bdev)) {
			save_error_info(sb, EFSCORRUPTED, ino, block, function,
					line);
			schedule_work(&EXT4_SB(sb)->s_error_work);
		}
		return;
	}
	ext4_unlock_group(sb, grp);
	ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
	/*
	 * We only get here in the ERRORS_RO case; relocking the group
	 * may be dangerous, but nothing bad will happen since the
	 * filesystem will have already been marked read/only and the
	 * journal has been aborted.  This function returns void, so
	 * callers cannot use it to distinguish between the ERRORS_CONT
	 * and ERRORS_RO cases.
	 */
	ext4_lock_group(sb, grp);
	return;
}
1019
1020void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
1021				     ext4_group_t group,
1022				     unsigned int flags)
1023{
1024	struct ext4_sb_info *sbi = EXT4_SB(sb);
1025	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1026	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
1027	int ret;
1028
1029	if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
1030		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1031					    &grp->bb_state);
1032		if (!ret)
1033			percpu_counter_sub(&sbi->s_freeclusters_counter,
1034					   grp->bb_free);
1035	}
1036
1037	if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
1038		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
1039					    &grp->bb_state);
1040		if (!ret && gdp) {
1041			int count;
1042
1043			count = ext4_free_inodes_count(sb, gdp);
1044			percpu_counter_sub(&sbi->s_freeinodes_counter,
1045					   count);
1046		}
1047	}
1048}
1049
1050void ext4_update_dynamic_rev(struct super_block *sb)
1051{
1052	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1053
1054	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
1055		return;
1056
1057	ext4_warning(sb,
1058		     "updating to rev %d because of new feature flag, "
1059		     "running e2fsck is recommended",
1060		     EXT4_DYNAMIC_REV);
1061
1062	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
1063	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
1064	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
1065	/* leave es->s_feature_*compat flags alone */
1066	/* es->s_uuid will be set by e2fsck if empty */
1067
1068	/*
1069	 * The rest of the superblock fields should be zero, and if not it
1070	 * means they are likely already in use, so leave them alone.  We
1071	 * can leave it up to e2fsck to clean up any inconsistencies there.
1072	 */
1073}
1074
1075/*
1076 * Open the external journal device
1077 */
1078static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
1079{
1080	struct block_device *bdev;
1081
1082	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
1083	if (IS_ERR(bdev))
1084		goto fail;
1085	return bdev;
1086
1087fail:
1088	ext4_msg(sb, KERN_ERR,
1089		 "failed to open journal device unknown-block(%u,%u) %ld",
1090		 MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
1091	return NULL;
1092}
1093
/*
 * Release the journal device.
 *
 * The mode flags passed to blkdev_put() are the same ones
 * ext4_blkdev_get() used to open the device.
 */
static void ext4_blkdev_put(struct block_device *bdev)
{
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
1101
1102static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
1103{
1104	struct block_device *bdev;
1105	bdev = sbi->s_journal_bdev;
1106	if (bdev) {
1107		ext4_blkdev_put(bdev);
1108		sbi->s_journal_bdev = NULL;
1109	}
1110}
1111
1112static inline struct inode *orphan_list_entry(struct list_head *l)
1113{
1114	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
1115}
1116
1117static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
1118{
1119	struct list_head *l;
1120
1121	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
1122		 le32_to_cpu(sbi->s_es->s_last_orphan));
1123
1124	printk(KERN_ERR "sb_info orphan list:\n");
1125	list_for_each(l, &sbi->s_orphan) {
1126		struct inode *inode = orphan_list_entry(l);
1127		printk(KERN_ERR "  "
1128		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
1129		       inode->i_sb->s_id, inode->i_ino, inode,
1130		       inode->i_mode, inode->i_nlink,
1131		       NEXT_ORPHAN(inode));
1132	}
1133}
1134
1135#ifdef CONFIG_QUOTA
1136static int ext4_quota_off(struct super_block *sb, int type);
1137
1138static inline void ext4_quota_off_umount(struct super_block *sb)
1139{
1140	int type;
1141
1142	/* Use our quota_off function to clear inode flags etc. */
1143	for (type = 0; type < EXT4_MAXQUOTAS; type++)
1144		ext4_quota_off(sb, type);
1145}
1146
1147/*
1148 * This is a helper function which is used in the mount/remount
1149 * codepaths (which holds s_umount) to fetch the quota file name.
1150 */
1151static inline char *get_qf_name(struct super_block *sb,
1152				struct ext4_sb_info *sbi,
1153				int type)
1154{
1155	return rcu_dereference_protected(sbi->s_qf_names[type],
1156					 lockdep_is_held(&sb->s_umount));
1157}
1158#else
/* Without CONFIG_QUOTA there are no quotas to turn off at unmount. */
static inline void ext4_quota_off_umount(struct super_block *sb)
{
}
1162#endif
1163
/*
 * ->put_super handler (see ext4_sops): tear down everything hanging off
 * @sb and finally free the ext4_sb_info itself.
 *
 * The steps are order-sensitive; the comments below record the ordering
 * constraints that are visible in this function.
 */
static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	struct buffer_head **group_desc;
	struct flex_groups **flex_groups;
	int aborted = 0;
	int i, err;

	ext4_unregister_li_request(sb);
	ext4_quota_off_umount(sb);

	/* Let pending error work finish before its state is torn down. */
	flush_work(&sbi->s_error_work);
	destroy_workqueue(sbi->rsv_conversion_wq);

	/*
	 * Unregister sysfs before destroying jbd2 journal.
	 * Since we could still access attr_journal_task attribute via sysfs
	 * path which could have sbi->s_journal->j_task as NULL
	 */
	ext4_unregister_sysfs(sb);

	if (sbi->s_journal) {
		/*
		 * Sample the aborted state before destroying the journal:
		 * a destroy failure is only reported when the journal had
		 * not already been aborted.
		 */
		aborted = is_journal_aborted(sbi->s_journal);
		err = jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	}

	ext4_es_unregister_shrinker(sbi);
	del_timer_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	/*
	 * On a read-write mount whose journal was not aborted, clear the
	 * needs-recovery feature and store the mount state in the
	 * superblock.  The superblock is committed for any read-write mount.
	 */
	if (!sb_rdonly(sb) && !aborted) {
		ext4_clear_feature_journal_needs_recovery(sb);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
	}
	if (!sb_rdonly(sb))
		ext4_commit_super(sb);

	/* Release the group descriptor buffers and the flex group arrays. */
	rcu_read_lock();
	group_desc = rcu_dereference(sbi->s_group_desc);
	for (i = 0; i < sbi->s_gdb_count; i++)
		brelse(group_desc[i]);
	kvfree(group_desc);
	flex_groups = rcu_dereference(sbi->s_flex_groups);
	if (flex_groups) {
		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
			kvfree(flex_groups[i]);
		kvfree(flex_groups);
	}
	rcu_read_unlock();
	percpu_counter_destroy(&sbi->s_freeclusters_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
	percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
		/*
		 * Invalidate the journal device's buffers.  We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		sync_blockdev(sbi->s_journal_bdev);
		invalidate_bdev(sbi->s_journal_bdev);
		ext4_blkdev_remove(sbi);
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	/* sbi stays in use below even though sb no longer points at it. */
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 */
	kobject_put(&sbi->s_kobj);
	/* Wait for s_kobj_unregister to complete before freeing sbi. */
	wait_for_completion(&sbi->s_kobj_unregister);
	if (sbi->s_chksum_driver)
		crypto_free_shash(sbi->s_chksum_driver);
	kfree(sbi->s_blockgroup_lock);
	fs_put_dax(sbi->s_daxdev);
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
	utf8_unload(sb->s_encoding);
#endif
	kfree(sbi);
}
1278
/*
 * Slab cache for struct ext4_inode_info objects; created by
 * init_inodecache() and destroyed by destroy_inodecache().
 */
static struct kmem_cache *ext4_inode_cachep;
1280
1281/*
1282 * Called inside transaction, so use GFP_NOFS
1283 */
1284static struct inode *ext4_alloc_inode(struct super_block *sb)
1285{
1286	struct ext4_inode_info *ei;
1287
1288	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
1289	if (!ei)
1290		return NULL;
1291
1292	inode_set_iversion(&ei->vfs_inode, 1);
1293	spin_lock_init(&ei->i_raw_lock);
1294	INIT_LIST_HEAD(&ei->i_prealloc_list);
1295	atomic_set(&ei->i_prealloc_active, 0);
1296	spin_lock_init(&ei->i_prealloc_lock);
1297	ext4_es_init_tree(&ei->i_es_tree);
1298	rwlock_init(&ei->i_es_lock);
1299	INIT_LIST_HEAD(&ei->i_es_list);
1300	ei->i_es_all_nr = 0;
1301	ei->i_es_shk_nr = 0;
1302	ei->i_es_shrink_lblk = 0;
1303	ei->i_reserved_data_blocks = 0;
1304	spin_lock_init(&(ei->i_block_reservation_lock));
1305	ext4_init_pending_tree(&ei->i_pending_tree);
1306#ifdef CONFIG_QUOTA
1307	ei->i_reserved_quota = 0;
1308	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
1309#endif
1310	ei->jinode = NULL;
1311	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
1312	spin_lock_init(&ei->i_completed_io_lock);
1313	ei->i_sync_tid = 0;
1314	ei->i_datasync_tid = 0;
1315	atomic_set(&ei->i_unwritten, 0);
1316	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
1317	ext4_fc_init_inode(&ei->vfs_inode);
1318	mutex_init(&ei->i_fc_lock);
1319	return &ei->vfs_inode;
1320}
1321
/*
 * ->drop_inode handler: ask the generic helper first and fall back to
 * the fscrypt helper only when the generic one says "keep".  The final
 * decision is traced and returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int verdict;

	verdict = generic_drop_inode(inode);
	if (verdict == 0)
		verdict = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, verdict);
	return verdict;
}
1332
1333static void ext4_free_in_core_inode(struct inode *inode)
1334{
1335	fscrypt_free_inode(inode);
1336	if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
1337		pr_warn("%s: inode %ld still in fc list",
1338			__func__, inode->i_ino);
1339	}
1340	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1341}
1342
1343static void ext4_destroy_inode(struct inode *inode)
1344{
1345	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
1346		ext4_msg(inode->i_sb, KERN_ERR,
1347			 "Inode %lu (%p): orphan list check failed!",
1348			 inode->i_ino, EXT4_I(inode));
1349		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1350				EXT4_I(inode), sizeof(struct ext4_inode_info),
1351				true);
1352		dump_stack();
1353	}
1354}
1355
1356static void init_once(void *foo)
1357{
1358	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
1359
1360	INIT_LIST_HEAD(&ei->i_orphan);
1361	init_rwsem(&ei->xattr_sem);
1362	init_rwsem(&ei->i_data_sem);
1363	init_rwsem(&ei->i_mmap_sem);
1364	inode_init_once(&ei->vfs_inode);
1365	ext4_fc_init_inode(&ei->vfs_inode);
1366}
1367
1368static int __init init_inodecache(void)
1369{
1370	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
1371				sizeof(struct ext4_inode_info), 0,
1372				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
1373					SLAB_ACCOUNT),
1374				offsetof(struct ext4_inode_info, i_data),
1375				sizeof_field(struct ext4_inode_info, i_data),
1376				init_once);
1377	if (ext4_inode_cachep == NULL)
1378		return -ENOMEM;
1379	return 0;
1380}
1381
/* Tear down the ext4 inode slab cache created by init_inodecache(). */
static void destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(ext4_inode_cachep);
}
1391
1392void ext4_clear_inode(struct inode *inode)
1393{
1394	ext4_fc_del(inode);
1395	invalidate_inode_buffers(inode);
1396	clear_inode(inode);
1397	ext4_discard_preallocations(inode, 0);
1398	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1399	dquot_drop(inode);
1400	if (EXT4_I(inode)->jinode) {
1401		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1402					       EXT4_I(inode)->jinode);
1403		jbd2_free_inode(EXT4_I(inode)->jinode);
1404		EXT4_I(inode)->jinode = NULL;
1405	}
1406	fscrypt_put_encryption_info(inode);
1407	fsverity_cleanup_inode(inode);
1408}
1409
1410static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1411					u64 ino, u32 generation)
1412{
1413	struct inode *inode;
1414
1415	/*
1416	 * Currently we don't know the generation for parent directory, so
1417	 * a generation of 0 means "accept any"
1418	 */
1419	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
1420	if (IS_ERR(inode))
1421		return ERR_CAST(inode);
1422	if (generation && inode->i_generation != generation) {
1423		iput(inode);
1424		return ERR_PTR(-ESTALE);
1425	}
1426
1427	return inode;
1428}
1429
/*
 * ->fh_to_dentry export operation: delegate to the generic helper,
 * supplying ext4_nfs_get_inode() as the inode lookup callback.
 */
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}
1436
/*
 * ->fh_to_parent export operation: delegate to the generic helper,
 * supplying ext4_nfs_get_inode() as the inode lookup callback.
 */
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}
1443
1444static int ext4_nfs_commit_metadata(struct inode *inode)
1445{
1446	struct writeback_control wbc = {
1447		.sync_mode = WB_SYNC_ALL
1448	};
1449
1450	trace_ext4_nfs_commit_metadata(inode);
1451	return ext4_write_inode(inode, &wbc);
1452}
1453
1454#ifdef CONFIG_FS_ENCRYPTION
/*
 * fscrypt ->get_context: read the encryption context xattr of @inode
 * into @ctx (at most @len bytes) and return ext4_xattr_get()'s result.
 */
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
}
1460
/*
 * fscrypt ->set_context: store the encryption context @ctx (@len bytes)
 * as an xattr on @inode and mark the inode encrypted.
 *
 * @fs_data is interpreted as a journal handle (may be NULL); see the
 * comment in the body for the two modes.
 *
 * Returns 0 on success or a negative errno:
 *   -EPERM       @inode is the root directory
 *   -EINVAL      @inode is DAX and has a non-zero size (also WARNs once)
 *   -EOPNOTSUPP  @inode has the EXT4_INODE_DAX flag set
 * plus whatever the helpers called below return.
 */
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
							void *fs_data)
{
	handle_t *handle = fs_data;
	int res, res2, credits, retries = 0;

	/*
	 * Encrypting the root directory is not allowed because e2fsck expects
	 * lost+found to exist and be unencrypted, and encrypting the root
	 * directory would imply encrypting the lost+found directory as well as
	 * the filename "lost+found" itself.
	 */
	if (inode->i_ino == EXT4_ROOT_INO)
		return -EPERM;

	if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
		return -EINVAL;

	if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
		return -EOPNOTSUPP;

	res = ext4_convert_inline_data(inode);
	if (res)
		return res;

	/*
	 * If a journal handle was specified, then the encryption context is
	 * being set on a new inode via inheritance and is part of a larger
	 * transaction to create the inode.  Otherwise the encryption context is
	 * being set on an existing inode in its own transaction.  Only in the
	 * latter case should the "retry on ENOSPC" logic be used.
	 */

	if (handle) {
		res = ext4_xattr_set_handle(handle, inode,
					    EXT4_XATTR_INDEX_ENCRYPTION,
					    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
					    ctx, len, 0);
		if (!res) {
			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
			ext4_clear_inode_state(inode,
					EXT4_STATE_MAY_INLINE_DATA);
			/*
			 * Update inode->i_flags - S_ENCRYPTED will be enabled,
			 * S_DAX may be disabled
			 */
			ext4_set_inode_flags(inode, false);
		}
		return res;
	}

	res = dquot_initialize(inode);
	if (res)
		return res;
retry:
	res = ext4_xattr_set_credits(inode, len, false /* is_create */,
				     &credits);
	if (res)
		return res;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
				    ctx, len, 0);
	if (!res) {
		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
		/*
		 * Update inode->i_flags - S_ENCRYPTED will be enabled,
		 * S_DAX may be disabled
		 */
		ext4_set_inode_flags(inode, false);
		res = ext4_mark_inode_dirty(handle, inode);
		if (res)
			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
	}
	/* The handle is always stopped, even when the xattr update failed. */
	res2 = ext4_journal_stop(handle);

	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	/* An earlier error takes precedence over the journal-stop result. */
	if (!res)
		res = res2;
	return res;
}
1547
/* fscrypt ->get_dummy_policy: return the policy stored in s_dummy_enc_policy. */
static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
	return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}
1552
/* fscrypt ->has_stable_inodes: reports the stable_inodes feature flag of @sb. */
static bool ext4_has_stable_inodes(struct super_block *sb)
{
	return ext4_has_feature_stable_inodes(sb);
}
1557
/*
 * fscrypt ->get_ino_and_lblk_bits: report the bit widths of inode
 * numbers and logical block numbers, derived from the size of the
 * superblock's s_inodes_count field and of ext4_lblk_t respectively.
 */
static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
				       int *ino_bits_ret, int *lblk_bits_ret)
{
	*ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
	*lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
}
1564
/* fscrypt callback table for ext4 (only built with CONFIG_FS_ENCRYPTION). */
static const struct fscrypt_operations ext4_cryptops = {
	.key_prefix		= "ext4:",
	.get_context		= ext4_get_context,
	.set_context		= ext4_set_context,
	.get_dummy_policy	= ext4_get_dummy_policy,
	.empty_dir		= ext4_empty_dir,
	.max_namelen		= EXT4_NAME_LEN,
	.has_stable_inodes	= ext4_has_stable_inodes,
	.get_ino_and_lblk_bits	= ext4_get_ino_and_lblk_bits,
};
1575#endif
1576
1577#ifdef CONFIG_QUOTA
/* Quota type names, indexed by quota type; see QTYPE2NAME(). */
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])

/*
 * Forward declarations for the quota callbacks referenced by the
 * operation tables below.
 */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 const struct path *path);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
			     unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
1596
/* ->get_dquots handler: return the i_dquot array embedded in the ext4 inode. */
static struct dquot **ext4_get_dquots(struct inode *inode)
{
	return EXT4_I(inode)->i_dquot;
}
1601
/*
 * dquot operations for ext4: a mix of ext4-specific callbacks and the
 * generic dquot_* helpers.
 */
static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space	= ext4_get_reserved_space,
	.write_dquot		= ext4_write_dquot,
	.acquire_dquot		= ext4_acquire_dquot,
	.release_dquot		= ext4_release_dquot,
	.mark_dirty		= ext4_mark_dquot_dirty,
	.write_info		= ext4_write_info,
	.alloc_dquot		= dquot_alloc,
	.destroy_dquot		= dquot_destroy,
	.get_projid		= ext4_get_projid,
	.get_inode_usage	= ext4_get_inode_usage,
	.get_next_id		= dquot_get_next_id,
};
1615
/*
 * quotactl operations for ext4: quota on/off use ext4's own handlers,
 * everything else is delegated to the generic dquot_* helpers.
 */
static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on	= ext4_quota_on,
	.quota_off	= ext4_quota_off,
	.quota_sync	= dquot_quota_sync,
	.get_state	= dquot_get_state,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk,
	.get_nextdqblk	= dquot_get_next_dqblk,
};
1626#endif
1627
/*
 * Superblock operations for ext4.  The quota read/write/get_dquots
 * hooks are only present when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.free_inode	= ext4_free_in_core_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.put_super	= ext4_put_super,
	.sync_fs	= ext4_sync_fs,
	.freeze_fs	= ext4_freeze,
	.unfreeze_fs	= ext4_unfreeze,
	.statfs		= ext4_statfs,
	.remount_fs	= ext4_remount,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
	.get_dquots	= ext4_get_dquots,
#endif
};
1649
/* Export (file handle) operations for ext4. */
static const struct export_operations ext4_export_ops = {
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
	.commit_metadata = ext4_nfs_commit_metadata,
};
1656
/*
 * Mount option tokens.  Values are assigned implicitly in declaration
 * order.  They are used as the token field of tokens[] and of
 * ext4_mount_opts[].  The two fast-commit debug tokens exist only with
 * CONFIG_EXT4_DEBUG.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
	Opt_nowarn_on_error, Opt_mblk_io_submit,
	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};
1686
/*
 * Table mapping mount option patterns to Opt_* tokens, terminated by the
 * {Opt_err, NULL} entry.  Several tokens have more than one spelling,
 * and options that are no longer supported map to Opt_removed.
 *
 * NOTE(review): entry order is presumably significant to the matcher
 * (e.g. "usrjquota=" is listed before "usrjquota=%s") - do not reorder
 * without confirming.
 */
static const match_table_t tokens = {
	{Opt_bsd_df, "bsddf"},
	{Opt_minix_df, "minixdf"},
	{Opt_grpid, "grpid"},
	{Opt_grpid, "bsdgroups"},
	{Opt_nogrpid, "nogrpid"},
	{Opt_nogrpid, "sysvgroups"},
	{Opt_resgid, "resgid=%u"},
	{Opt_resuid, "resuid=%u"},
	{Opt_sb, "sb=%u"},
	{Opt_err_cont, "errors=continue"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_nouid32, "nouid32"},
	{Opt_debug, "debug"},
	{Opt_removed, "oldalloc"},
	{Opt_removed, "orlov"},
	{Opt_user_xattr, "user_xattr"},
	{Opt_nouser_xattr, "nouser_xattr"},
	{Opt_acl, "acl"},
	{Opt_noacl, "noacl"},
	{Opt_noload, "norecovery"},
	{Opt_noload, "noload"},
	{Opt_removed, "nobh"},
	{Opt_removed, "bh"},
	{Opt_commit, "commit=%u"},
	{Opt_min_batch_time, "min_batch_time=%u"},
	{Opt_max_batch_time, "max_batch_time=%u"},
	{Opt_journal_dev, "journal_dev=%u"},
	{Opt_journal_path, "journal_path=%s"},
	{Opt_journal_checksum, "journal_checksum"},
	{Opt_nojournal_checksum, "nojournal_checksum"},
	{Opt_journal_async_commit, "journal_async_commit"},
	{Opt_abort, "abort"},
	{Opt_data_journal, "data=journal"},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_data_err_abort, "data_err=abort"},
	{Opt_data_err_ignore, "data_err=ignore"},
	{Opt_offusrjquota, "usrjquota="},
	{Opt_usrjquota, "usrjquota=%s"},
	{Opt_offgrpjquota, "grpjquota="},
	{Opt_grpjquota, "grpjquota=%s"},
	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
	{Opt_grpquota, "grpquota"},
	{Opt_noquota, "noquota"},
	{Opt_quota, "quota"},
	{Opt_usrquota, "usrquota"},
	{Opt_prjquota, "prjquota"},
	{Opt_barrier, "barrier=%u"},
	{Opt_barrier, "barrier"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_i_version, "i_version"},
	{Opt_dax, "dax"},
	{Opt_dax_always, "dax=always"},
	{Opt_dax_inode, "dax=inode"},
	{Opt_dax_never, "dax=never"},
	{Opt_stripe, "stripe=%u"},
	{Opt_delalloc, "delalloc"},
	{Opt_warn_on_error, "warn_on_error"},
	{Opt_nowarn_on_error, "nowarn_on_error"},
	{Opt_lazytime, "lazytime"},
	{Opt_nolazytime, "nolazytime"},
	{Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
	{Opt_nodelalloc, "nodelalloc"},
	{Opt_removed, "mblk_io_submit"},
	{Opt_removed, "nomblk_io_submit"},
	{Opt_block_validity, "block_validity"},
	{Opt_noblock_validity, "noblock_validity"},
	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
	{Opt_journal_ioprio, "journal_ioprio=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc"},
	{Opt_noauto_da_alloc, "noauto_da_alloc"},
	{Opt_dioread_nolock, "dioread_nolock"},
	{Opt_dioread_lock, "nodioread_nolock"},
	{Opt_dioread_lock, "dioread_lock"},
	{Opt_discard, "discard"},
	{Opt_nodiscard, "nodiscard"},
	{Opt_init_itable, "init_itable=%u"},
	{Opt_init_itable, "init_itable"},
	{Opt_noinit_itable, "noinit_itable"},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, "fc_debug_force"},
	{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
	{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
	{Opt_test_dummy_encryption, "test_dummy_encryption"},
	{Opt_inlinecrypt, "inlinecrypt"},
	{Opt_nombcache, "nombcache"},
	{Opt_nombcache, "no_mbcache"},	/* for backward compatibility */
	{Opt_removed, "prefetch_block_bitmaps"},
	{Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"},
	{Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
	{Opt_err, NULL},
};
1791
1792static ext4_fsblk_t get_sb_block(void **data)
1793{
1794	ext4_fsblk_t	sb_block;
1795	char		*options = (char *) *data;
1796
1797	if (!options || strncmp(options, "sb=", 3) != 0)
1798		return 1;	/* Default location */
1799
1800	options += 3;
1801	/* TODO: use simple_strtoll with >32bit ext4 */
1802	sb_block = simple_strtoul(options, &options, 0);
1803	if (*options && *options != ',') {
1804		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1805		       (char *) *data);
1806		return 1;
1807	}
1808	if (*options == ',')
1809		options++;
1810	*data = (void *) options;
1811
1812	return sb_block;
1813}
1814
/* Default journal I/O priority: best-effort class, level 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
/* NOTE(review): -1 presumably means "mb_optimize_scan not specified" - confirm. */
#define DEFAULT_MB_OPTIMIZE_SCAN	(-1)

/*
 * Format string for deprecation notices; takes two %s arguments, the
 * option name and the point by which it will be removed.
 */
static const char deprecated_msg[] =
	"Mount option \"%s\" will be removed by %s\n"
	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1821
1822#ifdef CONFIG_QUOTA
1823static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1824{
1825	struct ext4_sb_info *sbi = EXT4_SB(sb);
1826	char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
1827	int ret = -1;
1828
1829	if (sb_any_quota_loaded(sb) && !old_qname) {
1830		ext4_msg(sb, KERN_ERR,
1831			"Cannot change journaled "
1832			"quota options when quota turned on");
1833		return -1;
1834	}
1835	if (ext4_has_feature_quota(sb)) {
1836		ext4_msg(sb, KERN_INFO, "Journaled quota options "
1837			 "ignored when QUOTA feature is enabled");
1838		return 1;
1839	}
1840	qname = match_strdup(args);
1841	if (!qname) {
1842		ext4_msg(sb, KERN_ERR,
1843			"Not enough memory for storing quotafile name");
1844		return -1;
1845	}
1846	if (old_qname) {
1847		if (strcmp(old_qname, qname) == 0)
1848			ret = 1;
1849		else
1850			ext4_msg(sb, KERN_ERR,
1851				 "%s quota file already specified",
1852				 QTYPE2NAME(qtype));
1853		goto errout;
1854	}
1855	if (strchr(qname, '/')) {
1856		ext4_msg(sb, KERN_ERR,
1857			"quotafile must be on filesystem root");
1858		goto errout;
1859	}
1860	rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
1861	set_opt(sb, QUOTA);
1862	return 1;
1863errout:
1864	kfree(qname);
1865	return ret;
1866}
1867
1868static int clear_qf_name(struct super_block *sb, int qtype)
1869{
1870
1871	struct ext4_sb_info *sbi = EXT4_SB(sb);
1872	char *old_qname = get_qf_name(sb, sbi, qtype);
1873
1874	if (sb_any_quota_loaded(sb) && old_qname) {
1875		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1876			" when quota turned on");
1877		return -1;
1878	}
1879	rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
1880	synchronize_rcu();
1881	kfree(old_qname);
1882	return 1;
1883}
1884#endif
1885
/*
 * Flag bits for the flags field of struct mount_opts (ext4_mount_opts[]).
 *
 * When CONFIG_QUOTA is disabled, MOPT_Q and MOPT_QFMT both alias
 * MOPT_NOSUPPORT; when it is enabled, MOPT_Q is 0 and MOPT_QFMT has its
 * own bit.  MOPT_EXT4_ONLY is the combination of MOPT_NO_EXT2 and
 * MOPT_NO_EXT3.
 *
 * NOTE(review): the precise meaning of each individual bit is defined by
 * the option-handling code, which is not visible here - confirm there
 * before relying on the names alone.
 */
#define MOPT_SET	0x0001
#define MOPT_CLEAR	0x0002
#define MOPT_NOSUPPORT	0x0004
#define MOPT_EXPLICIT	0x0008
#define MOPT_CLEAR_ERR	0x0010
#define MOPT_GTE0	0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	0x0040
#else
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ	0x0080
#define MOPT_NO_EXT2	0x0100
#define MOPT_NO_EXT3	0x0200
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING	0x0400
#define MOPT_SKIP	0x0800
#define	MOPT_2		0x1000
1906
/*
 * Table describing how each mount option token is applied.
 *
 *  token     - Opt_* value the entry applies to
 *  mount_opt - bit(s) to set or clear in s_mount_opt (s_mount_opt2 when
 *              MOPT_2 is given); for MOPT_QFMT entries the quota format;
 *              0 for options that only carry a value
 *  flags     - MOPT_* behaviour flags
 *
 * handle_mount_opt() uses the first entry whose token matches, and
 * _ext4_show_options() walks the table to print the set/clear style options.
 * The table is terminated by the Opt_err sentinel.
 */
static const struct mount_opts {
	int	token;
	int	mount_opt;
	int	flags;
} ext4_mount_opts[] = {
	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_SET},
	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
				    EXT4_MOUNT_JOURNAL_CHECKSUM),
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
	/* errors=: the previous ERRORS_MASK bits are cleared first */
	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
	/* data_err=: set/cleared by dedicated branches in handle_mount_opt() */
	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
	 MOPT_NO_EXT2},
	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
	 MOPT_NO_EXT2},
	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
	{Opt_commit, 0, MOPT_GTE0},
	{Opt_max_batch_time, 0, MOPT_GTE0},
	{Opt_min_batch_time, 0, MOPT_GTE0},
	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
	{Opt_init_itable, 0, MOPT_GTE0},
	/* dax variants are applied by a dedicated branch in handle_mount_opt() */
	{Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
	{Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
	{Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
	{Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
	{Opt_stripe, 0, MOPT_GTE0},
	{Opt_resuid, 0, MOPT_GTE0},
	{Opt_resgid, 0, MOPT_GTE0},
	{Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
	{Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
	{Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
	 MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
	{Opt_acl, 0, MOPT_NOSUPPORT},
	{Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
	{Opt_debug_want_extra_isize, 0, MOPT_GTE0},
	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
							MOPT_CLEAR | MOPT_Q},
	{Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
	{Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
	{Opt_offusrjquota, 0, MOPT_Q},
	{Opt_offgrpjquota, 0, MOPT_Q},
	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
	{Opt_test_dummy_encryption, 0, MOPT_STRING},
	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
	 MOPT_SET},
	{Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
	{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
	/* sentinel: terminates every walk over this table */
	{Opt_err, 0, 0}
};
2014
2015#ifdef CONFIG_UNICODE
/*
 * Encodings known to ext4, keyed by the magic value stored in the
 * superblock's s_encoding field (see ext4_sb_read_encoding()).
 *
 *  magic   - value compared against le16_to_cpu(es->s_encoding)
 *  name    - encoding name string
 *  version - encoding version string
 */
static const struct ext4_sb_encodings {
	__u16 magic;
	char *name;
	char *version;
} ext4_sb_encoding_map[] = {
	{EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
};
2023
2024static int ext4_sb_read_encoding(const struct ext4_super_block *es,
2025				 const struct ext4_sb_encodings **encoding,
2026				 __u16 *flags)
2027{
2028	__u16 magic = le16_to_cpu(es->s_encoding);
2029	int i;
2030
2031	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
2032		if (magic == ext4_sb_encoding_map[i].magic)
2033			break;
2034
2035	if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
2036		return -EINVAL;
2037
2038	*encoding = &ext4_sb_encoding_map[i];
2039	*flags = le16_to_cpu(es->s_encoding_flags);
2040
2041	return 0;
2042}
2043#endif
2044
2045static int ext4_set_test_dummy_encryption(struct super_block *sb,
2046					  const char *opt,
2047					  const substring_t *arg,
2048					  bool is_remount)
2049{
2050#ifdef CONFIG_FS_ENCRYPTION
2051	struct ext4_sb_info *sbi = EXT4_SB(sb);
2052	int err;
2053
2054	/*
2055	 * This mount option is just for testing, and it's not worthwhile to
2056	 * implement the extra complexity (e.g. RCU protection) that would be
2057	 * needed to allow it to be set or changed during remount.  We do allow
2058	 * it to be specified during remount, but only if there is no change.
2059	 */
2060	if (is_remount && !sbi->s_dummy_enc_policy.policy) {
2061		ext4_msg(sb, KERN_WARNING,
2062			 "Can't set test_dummy_encryption on remount");
2063		return -1;
2064	}
2065	err = fscrypt_set_test_dummy_encryption(sb, arg->from,
2066						&sbi->s_dummy_enc_policy);
2067	if (err) {
2068		if (err == -EEXIST)
2069			ext4_msg(sb, KERN_WARNING,
2070				 "Can't change test_dummy_encryption on remount");
2071		else if (err == -EINVAL)
2072			ext4_msg(sb, KERN_WARNING,
2073				 "Value of option \"%s\" is unrecognized", opt);
2074		else
2075			ext4_msg(sb, KERN_WARNING,
2076				 "Error processing option \"%s\" [%d]",
2077				 opt, err);
2078		return -1;
2079	}
2080	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
2081#else
2082	ext4_msg(sb, KERN_WARNING,
2083		 "Test dummy encryption mount option ignored");
2084#endif
2085	return 1;
2086}
2087
/*
 * Values collected by handle_mount_opt() that are returned to the caller of
 * parse_options() instead of being stored in the ext4_sb_info directly.
 */
struct ext4_parsed_options {
	/* from journal_dev=, or new_encode_dev() of the journal_path= device */
	unsigned long journal_devnum;
	/* IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, n) built from journal_ioprio=n */
	unsigned int journal_ioprio;
	/* value of mb_optimize_scan=, validated to be 0 or 1 */
	int mb_optimize_scan;
};
2093
/*
 * Apply a single, already tokenized mount option.
 *
 * @sb:          superblock being mounted or remounted
 * @opt:         text of the option, used in log messages
 * @token:       Opt_* value matched for @opt
 * @args:        match arguments; args->from is NULL when no value was given
 * @parsed_opts: receives journal_devnum, journal_ioprio and mb_optimize_scan
 * @is_remount:  non-zero on remount; several options are then rejected or
 *               must agree with the current state
 *
 * Returns 1 when the option was accepted (or deliberately ignored) and -1 on
 * error.  For the journaled quota file options the return value of
 * set_qf_name() / clear_qf_name() is passed through unchanged.
 */
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
			    substring_t *args, struct ext4_parsed_options *parsed_opts,
			    int is_remount)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	const struct mount_opts *m;
	kuid_t uid;
	kgid_t gid;
	int arg = 0;

#ifdef CONFIG_QUOTA
	/* Journaled quota file names are handled entirely by their helpers. */
	if (token == Opt_usrjquota)
		return set_qf_name(sb, USRQUOTA, &args[0]);
	else if (token == Opt_grpjquota)
		return set_qf_name(sb, GRPQUOTA, &args[0]);
	else if (token == Opt_offusrjquota)
		return clear_qf_name(sb, USRQUOTA);
	else if (token == Opt_offgrpjquota)
		return clear_qf_name(sb, GRPQUOTA);
#endif
	/* Options that do not go through the ext4_mount_opts[] table. */
	switch (token) {
	case Opt_noacl:
	case Opt_nouser_xattr:
		/* Only warn here; the option is still applied via the table. */
		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
		break;
	case Opt_sb:
		return 1;	/* handled by get_sb_block() */
	case Opt_removed:
		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
		return 1;
	case Opt_abort:
		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
		return 1;
	case Opt_i_version:
		sb->s_flags |= SB_I_VERSION;
		return 1;
	case Opt_lazytime:
		sb->s_flags |= SB_LAZYTIME;
		return 1;
	case Opt_nolazytime:
		sb->s_flags &= ~SB_LAZYTIME;
		return 1;
	case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
		sb->s_flags |= SB_INLINECRYPT;
#else
		ext4_msg(sb, KERN_ERR, "inline encryption not supported");
#endif
		return 1;
	}

	/* Find the table entry for this token; the first match wins. */
	for (m = ext4_mount_opts; m->token != Opt_err; m++)
		if (token == m->token)
			break;

	if (m->token == Opt_err) {
		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
			 "or missing value", opt);
		return -1;
	}

	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Mount option \"%s\" incompatible with ext2", opt);
		return -1;
	}
	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Mount option \"%s\" incompatible with ext3", opt);
		return -1;
	}

	/* Parse an integer argument unless the option takes a string. */
	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
		return -1;
	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
		return -1;
	/* Remember that the user asked for this option explicitly. */
	if (m->flags & MOPT_EXPLICIT) {
		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
			set_opt2(sb, EXPLICIT_DELALLOC);
		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
		} else
			return -1;
	}
	if (m->flags & MOPT_CLEAR_ERR)
		clear_opt(sb, ERRORS_MASK);
	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
		ext4_msg(sb, KERN_ERR, "Cannot change quota "
			 "options when quota turned on");
		return -1;
	}

	/*
	 * Per-option handling.  The order of this chain matters: the final
	 * else branch is the generic MOPT_SET / MOPT_CLEAR bit handling and
	 * is only reached by options no earlier branch claimed.
	 */
	if (m->flags & MOPT_NOSUPPORT) {
		/* Logged only; the function still returns 1 below. */
		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
	} else if (token == Opt_commit) {
		if (arg == 0)
			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
		else if (arg > INT_MAX / HZ) {
			/* HZ * arg below would overflow an int */
			ext4_msg(sb, KERN_ERR,
				 "Invalid commit interval %d, "
				 "must be smaller than %d",
				 arg, INT_MAX / HZ);
			return -1;
		}
		sbi->s_commit_interval = HZ * arg;
	} else if (token == Opt_debug_want_extra_isize) {
		/* must be even, at least 4, and fit in the inode's extra space */
		if ((arg & 1) ||
		    (arg < 4) ||
		    (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
			ext4_msg(sb, KERN_ERR,
				 "Invalid want_extra_isize %d", arg);
			return -1;
		}
		sbi->s_want_extra_isize = arg;
	} else if (token == Opt_max_batch_time) {
		sbi->s_max_batch_time = arg;
	} else if (token == Opt_min_batch_time) {
		sbi->s_min_batch_time = arg;
	} else if (token == Opt_inode_readahead_blks) {
		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
			ext4_msg(sb, KERN_ERR,
				 "EXT4-fs: inode_readahead_blks must be "
				 "0 or a power of 2 smaller than 2^31");
			return -1;
		}
		sbi->s_inode_readahead_blks = arg;
	} else if (token == Opt_init_itable) {
		set_opt(sb, INIT_INODE_TABLE);
		/* the argument is optional; fall back to the default multiplier */
		if (!args->from)
			arg = EXT4_DEF_LI_WAIT_MULT;
		sbi->s_li_wait_mult = arg;
	} else if (token == Opt_max_dir_size_kb) {
		sbi->s_max_dir_size_kb = arg;
#ifdef CONFIG_EXT4_DEBUG
	} else if (token == Opt_fc_debug_max_replay) {
		sbi->s_fc_debug_max_replay = arg;
#endif
	} else if (token == Opt_stripe) {
		sbi->s_stripe = arg;
	} else if (token == Opt_resuid) {
		uid = make_kuid(current_user_ns(), arg);
		if (!uid_valid(uid)) {
			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
			return -1;
		}
		sbi->s_resuid = uid;
	} else if (token == Opt_resgid) {
		gid = make_kgid(current_user_ns(), arg);
		if (!gid_valid(gid)) {
			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
			return -1;
		}
		sbi->s_resgid = gid;
	} else if (token == Opt_journal_dev) {
		if (is_remount) {
			ext4_msg(sb, KERN_ERR,
				 "Cannot specify journal on remount");
			return -1;
		}
		parsed_opts->journal_devnum = arg;
	} else if (token == Opt_journal_path) {
		char *journal_path;
		struct inode *journal_inode;
		struct path path;
		int error;

		if (is_remount) {
			ext4_msg(sb, KERN_ERR,
				 "Cannot specify journal on remount");
			return -1;
		}
		journal_path = match_strdup(&args[0]);
		if (!journal_path) {
			ext4_msg(sb, KERN_ERR, "error: could not dup "
				"journal device string");
			return -1;
		}

		/* Resolve the path; it has to name a block device. */
		error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
		if (error) {
			ext4_msg(sb, KERN_ERR, "error: could not find "
				"journal device path: error %d", error);
			kfree(journal_path);
			return -1;
		}

		journal_inode = d_inode(path.dentry);
		if (!S_ISBLK(journal_inode->i_mode)) {
			ext4_msg(sb, KERN_ERR, "error: journal path %s "
				"is not a block device", journal_path);
			path_put(&path);
			kfree(journal_path);
			return -1;
		}

		parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev);
		path_put(&path);
		kfree(journal_path);
	} else if (token == Opt_journal_ioprio) {
		if (arg > 7) {
			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
				 " (must be 0-7)");
			return -1;
		}
		parsed_opts->journal_ioprio =
			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
	} else if (token == Opt_test_dummy_encryption) {
		return ext4_set_test_dummy_encryption(sb, opt, &args[0],
						      is_remount);
	} else if (m->flags & MOPT_DATAJ) {
		if (is_remount) {
			/* the data mode may be restated on remount, not changed */
			if (!sbi->s_journal)
				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
				ext4_msg(sb, KERN_ERR,
					 "Cannot change data mode on remount");
				return -1;
			}
		} else {
			clear_opt(sb, DATA_FLAGS);
			sbi->s_mount_opt |= m->mount_opt;
		}
#ifdef CONFIG_QUOTA
	} else if (m->flags & MOPT_QFMT) {
		if (sb_any_quota_loaded(sb) &&
		    sbi->s_jquota_fmt != m->mount_opt) {
			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
				 "quota options when quota turned on");
			return -1;
		}
		if (ext4_has_feature_quota(sb)) {
			ext4_msg(sb, KERN_INFO,
				 "Quota format mount options ignored "
				 "when QUOTA feature is enabled");
			return 1;
		}
		sbi->s_jquota_fmt = m->mount_opt;
#endif
	} else if (token == Opt_dax || token == Opt_dax_always ||
		   token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
		/*
		 * On remount each dax variant is accepted only when it matches
		 * the mode already in effect; any other combination jumps to
		 * fail_dax_change_remount.
		 */
		switch (token) {
		case Opt_dax:
		case Opt_dax_always:
			if (is_remount &&
			    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
			fail_dax_change_remount:
				ext4_msg(sb, KERN_ERR, "can't change "
					 "dax mount option while remounting");
				return -1;
			}
			if (is_remount &&
			    (test_opt(sb, DATA_FLAGS) ==
			     EXT4_MOUNT_JOURNAL_DATA)) {
				    ext4_msg(sb, KERN_ERR, "can't mount with "
					     "both data=journal and dax");
				    return -1;
			}
			ext4_msg(sb, KERN_WARNING,
				"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
			break;
		case Opt_dax_never:
			if (is_remount &&
			    (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			     (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
				goto fail_dax_change_remount;
			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
			break;
		case Opt_dax_inode:
			if (is_remount &&
			    ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			     !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
				goto fail_dax_change_remount;
			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
			/* Strictly for printing options */
			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
			break;
		}
#else
		/* Without CONFIG_FS_DAX every dax option fails the parse. */
		ext4_msg(sb, KERN_INFO, "dax option not supported");
		sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
		return -1;
#endif
	} else if (token == Opt_data_err_abort) {
		sbi->s_mount_opt |= m->mount_opt;
	} else if (token == Opt_data_err_ignore) {
		sbi->s_mount_opt &= ~m->mount_opt;
	} else if (token == Opt_mb_optimize_scan) {
		if (arg != 0 && arg != 1) {
			ext4_msg(sb, KERN_WARNING,
				 "mb_optimize_scan should be set to 0 or 1.");
			return -1;
		}
		parsed_opts->mb_optimize_scan = arg;
	} else {
		/*
		 * Generic flag option: with no argument the option counts as
		 * "on"; MOPT_CLEAR inverts that.  A table entry reaching this
		 * point with neither MOPT_SET nor MOPT_CLEAR is a table bug.
		 */
		if (!args->from)
			arg = 1;
		if (m->flags & MOPT_CLEAR)
			arg = !arg;
		else if (unlikely(!(m->flags & MOPT_SET))) {
			ext4_msg(sb, KERN_WARNING,
				 "buggy handling of option %s", opt);
			WARN_ON(1);
			return -1;
		}
		if (m->flags & MOPT_2) {
			if (arg != 0)
				sbi->s_mount_opt2 |= m->mount_opt;
			else
				sbi->s_mount_opt2 &= ~m->mount_opt;
		} else {
			if (arg != 0)
				sbi->s_mount_opt |= m->mount_opt;
			else
				sbi->s_mount_opt &= ~m->mount_opt;
		}
	}
	return 1;
}
2420
2421static int parse_options(char *options, struct super_block *sb,
2422			 struct ext4_parsed_options *ret_opts,
2423			 int is_remount)
2424{
2425	struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
2426	char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
2427	substring_t args[MAX_OPT_ARGS];
2428	int token;
2429
2430	if (!options)
2431		return 1;
2432
2433	while ((p = strsep(&options, ",")) != NULL) {
2434		if (!*p)
2435			continue;
2436		/*
2437		 * Initialize args struct so we know whether arg was
2438		 * found; some options take optional arguments.
2439		 */
2440		args[0].to = args[0].from = NULL;
2441		token = match_token(p, tokens, args);
2442		if (handle_mount_opt(sb, p, token, args, ret_opts,
2443				     is_remount) < 0)
2444			return 0;
2445	}
2446#ifdef CONFIG_QUOTA
2447	/*
2448	 * We do the test below only for project quotas. 'usrquota' and
2449	 * 'grpquota' mount options are allowed even without quota feature
2450	 * to support legacy quotas in quota files.
2451	 */
2452	if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
2453		ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
2454			 "Cannot enable project quota enforcement.");
2455		return 0;
2456	}
2457	usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
2458	grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
2459	if (usr_qf_name || grp_qf_name) {
2460		if (test_opt(sb, USRQUOTA) && usr_qf_name)
2461			clear_opt(sb, USRQUOTA);
2462
2463		if (test_opt(sb, GRPQUOTA) && grp_qf_name)
2464			clear_opt(sb, GRPQUOTA);
2465
2466		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
2467			ext4_msg(sb, KERN_ERR, "old and new quota "
2468					"format mixing");
2469			return 0;
2470		}
2471
2472		if (!sbi->s_jquota_fmt) {
2473			ext4_msg(sb, KERN_ERR, "journaled quota format "
2474					"not specified");
2475			return 0;
2476		}
2477	}
2478#endif
2479	if (test_opt(sb, DIOREAD_NOLOCK)) {
2480		int blocksize =
2481			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
2482		if (blocksize < PAGE_SIZE)
2483			ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
2484				 "experimental mount option 'dioread_nolock' "
2485				 "for blocksize < PAGE_SIZE");
2486	}
2487	return 1;
2488}
2489
/*
 * Append the journaled quota settings of @sb to @seq: ",jqfmt=<name>" when a
 * journaled quota format is set, followed by the usrjquota / grpjquota file
 * names that are configured.  Emits nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *user_file, *group_file;

	if (sbi->s_jquota_fmt) {
		char *name;

		/* a format outside the three known ones prints as empty */
		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
			name = "vfsold";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
			name = "vfsv0";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
			name = "vfsv1";
		else
			name = "";

		seq_printf(seq, ",jqfmt=%s", name);
	}

	/* the file names are RCU managed; read and print them under the lock */
	rcu_read_lock();
	user_file = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
	group_file = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
	if (user_file != NULL)
		seq_show_option(seq, "usrjquota", user_file);
	if (group_file != NULL)
		seq_show_option(seq, "grpjquota", group_file);
	rcu_read_unlock();
#endif
}
2524
2525static const char *token2str(int token)
2526{
2527	const struct match_token *t;
2528
2529	for (t = tokens; t->token != Opt_err; t++)
2530		if (t->token == token && !strchr(t->pattern, '='))
2531			break;
2532	return t->pattern;
2533}
2534
/*
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 *
 * @seq:    output sequence file
 * @sb:     superblock whose options are printed
 * @nodefs: when non-zero, options are printed even if they are at their
 *          default and each one is preceded by '\n'; when zero only
 *          non-default options are printed, each preceded by ','
 *
 * Always returns 0.
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
			      int nodefs)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int def_errors, def_mount_opt = sbi->s_def_mount_opt;
	const struct mount_opts *m;
	char sep = nodefs ? '\n' : ',';

/* Both helpers emit the separator first, then the option text. */
#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

	if (sbi->s_sb_block != 1)
		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

	/*
	 * Generic set/clear options from the table.  Entries without
	 * MOPT_SET/MOPT_CLEAR, the errors= entries (MOPT_CLEAR_ERR) and
	 * MOPT_SKIP entries are printed elsewhere or not at all.
	 * NOTE(review): this loop only consults s_mount_opt, also for entries
	 * flagged MOPT_2 - confirm that is intended.
	 */
	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
		int want_set = m->flags & MOPT_SET;
		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
		    (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
			continue;
		if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
			continue; /* skip if same as the default */
		if ((want_set &&
		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
			continue; /* select Opt_noFoo vs Opt_Foo */
		SEQ_OPTS_PRINT("%s", token2str(m->token));
	}

	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
		SEQ_OPTS_PRINT("resuid=%u",
				from_kuid_munged(&init_user_ns, sbi->s_resuid));
	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
		SEQ_OPTS_PRINT("resgid=%u",
				from_kgid_munged(&init_user_ns, sbi->s_resgid));
	/* with nodefs, -1 matches no EXT4_ERRORS_* value, so errors= always prints */
	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
		SEQ_OPTS_PUTS("errors=remount-ro");
	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
		SEQ_OPTS_PUTS("errors=continue");
	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
		SEQ_OPTS_PUTS("errors=panic");
	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
	if (sb->s_flags & SB_I_VERSION)
		SEQ_OPTS_PUTS("i_version");
	if (nodefs || sbi->s_stripe)
		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
	/* data= mode: printed when it differs from the default (or nodefs) */
	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
			(sbi->s_mount_opt ^ def_mount_opt)) {
		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
			SEQ_OPTS_PUTS("data=journal");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
			SEQ_OPTS_PUTS("data=ordered");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			SEQ_OPTS_PUTS("data=writeback");
	}
	if (nodefs ||
	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
			       sbi->s_inode_readahead_blks);

	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
	if (nodefs || sbi->s_max_dir_size_kb)
		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
	if (test_opt(sb, DATA_ERR_ABORT))
		SEQ_OPTS_PUTS("data_err=abort");

	fscrypt_show_test_dummy_encryption(seq, sep, sb);

	if (sb->s_flags & SB_INLINECRYPT)
		SEQ_OPTS_PUTS("inlinecrypt");

	/* at most one dax mode is printed; ext2 superblocks show plain "dax" */
	if (test_opt(sb, DAX_ALWAYS)) {
		if (IS_EXT2_SB(sb))
			SEQ_OPTS_PUTS("dax");
		else
			SEQ_OPTS_PUTS("dax=always");
	} else if (test_opt2(sb, DAX_NEVER)) {
		SEQ_OPTS_PUTS("dax=never");
	} else if (test_opt2(sb, DAX_INODE)) {
		SEQ_OPTS_PUTS("dax=inode");
	}
	ext4_show_quota_options(seq, sb);
	return 0;
}
2634
2635static int ext4_show_options(struct seq_file *seq, struct dentry *root)
2636{
2637	return _ext4_show_options(seq, root->d_sb, 0);
2638}
2639
2640int ext4_seq_options_show(struct seq_file *seq, void *offset)
2641{
2642	struct super_block *sb = seq->private;
2643	int rc;
2644
2645	seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
2646	rc = _ext4_show_options(seq, sb, 1);
2647	seq_puts(seq, "\n");
2648	return rc;
2649}
2650
/*
 * Update the on-disk superblock bookkeeping for a mount.
 *
 * @sb:        superblock being set up
 * @es:        its on-disk superblock
 * @read_only: non-zero to skip every superblock update
 *
 * Returns -EROFS when the revision level is above EXT4_MAX_SUPP_REV, 0 when
 * @read_only is set, and otherwise the result of ext4_commit_super().
 * The optional debug line and cleancache_init_fs() run on every path.
 */
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
			    int read_only)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int err = 0;

	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
		ext4_msg(sb, KERN_ERR, "revision level too high, "
			 "forcing read-only mode");
		err = -EROFS;
		goto done;
	}
	if (read_only)
		goto done;
	/* At most one "running e2fsck is recommended" warning is printed. */
	if (!(sbi->s_mount_state & EXT4_VALID_FS))
		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
			 "running e2fsck is recommended");
	else if (sbi->s_mount_state & EXT4_ERROR_FS)
		ext4_msg(sb, KERN_WARNING,
			 "warning: mounting fs with errors, "
			 "running e2fsck is recommended");
	/* s_max_mnt_count is read as signed: only a positive limit is enforced */
	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
		 le16_to_cpu(es->s_mnt_count) >=
		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
		ext4_msg(sb, KERN_WARNING,
			 "warning: maximal mount count reached, "
			 "running e2fsck is recommended");
	else if (le32_to_cpu(es->s_checkinterval) &&
		 (ext4_get_tstamp(es, s_lastcheck) +
		  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
		ext4_msg(sb, KERN_WARNING,
			 "warning: checktime reached, "
			 "running e2fsck is recommended");
	/* without a journal the valid bit is dropped from the on-disk state */
	if (!sbi->s_journal)
		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
	/* a zero limit is replaced by the default */
	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
	le16_add_cpu(&es->s_mnt_count, 1);
	ext4_update_tstamp(es, s_mtime);
	if (sbi->s_journal)
		ext4_set_feature_journal_needs_recovery(sb);

	/* write the updated fields out */
	err = ext4_commit_super(sb);
done:
	if (test_opt(sb, DEBUG))
		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
			sb->s_blocksize,
			sbi->s_groups_count,
			EXT4_BLOCKS_PER_GROUP(sb),
			EXT4_INODES_PER_GROUP(sb),
			sbi->s_mount_opt, sbi->s_mount_opt2);

	cleancache_init_fs(sb);
	return err;
}
2707
2708int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
2709{
2710	struct ext4_sb_info *sbi = EXT4_SB(sb);
2711	struct flex_groups **old_groups, **new_groups;
2712	int size, i, j;
2713
2714	if (!sbi->s_log_groups_per_flex)
2715		return 0;
2716
2717	size = ext4_flex_group(sbi, ngroup - 1) + 1;
2718	if (size <= sbi->s_flex_groups_allocated)
2719		return 0;
2720
2721	new_groups = kvzalloc(roundup_pow_of_two(size *
2722			      sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
2723	if (!new_groups) {
2724		ext4_msg(sb, KERN_ERR,
2725			 "not enough memory for %d flex group pointers", size);
2726		return -ENOMEM;
2727	}
2728	for (i = sbi->s_flex_groups_allocated; i < size; i++) {
2729		new_groups[i] = kvzalloc(roundup_pow_of_two(
2730					 sizeof(struct flex_groups)),
2731					 GFP_KERNEL);
2732		if (!new_groups[i]) {
2733			for (j = sbi->s_flex_groups_allocated; j < i; j++)
2734				kvfree(new_groups[j]);
2735			kvfree(new_groups);
2736			ext4_msg(sb, KERN_ERR,
2737				 "not enough memory for %d flex groups", size);
2738			return -ENOMEM;
2739		}
2740	}
2741	rcu_read_lock();
2742	old_groups = rcu_dereference(sbi->s_flex_groups);
2743	if (old_groups)
2744		memcpy(new_groups, old_groups,
2745		       (sbi->s_flex_groups_allocated *
2746			sizeof(struct flex_groups *)));
2747	rcu_read_unlock();
2748	rcu_assign_pointer(sbi->s_flex_groups, new_groups);
2749	sbi->s_flex_groups_allocated = size;
2750	if (old_groups)
2751		ext4_kvfree_array_rcu(old_groups);
2752	return 0;
2753}
2754
2755static int ext4_fill_flex_info(struct super_block *sb)
2756{
2757	struct ext4_sb_info *sbi = EXT4_SB(sb);
2758	struct ext4_group_desc *gdp = NULL;
2759	struct flex_groups *fg;
2760	ext4_group_t flex_group;
2761	int i, err;
2762
2763	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
2764	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
2765		sbi->s_log_groups_per_flex = 0;
2766		return 1;
2767	}
2768
2769	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
2770	if (err)
2771		goto failed;
2772
2773	for (i = 0; i < sbi->s_groups_count; i++) {
2774		gdp = ext4_get_group_desc(sb, i, NULL);
2775
2776		flex_group = ext4_flex_group(sbi, i);
2777		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
2778		atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
2779		atomic64_add(ext4_free_group_clusters(sb, gdp),
2780			     &fg->free_clusters);
2781		atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
2782	}
2783
2784	return 1;
2785failed:
2786	return 0;
2787}
2788
2789static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
2790				   struct ext4_group_desc *gdp)
2791{
2792	int offset = offsetof(struct ext4_group_desc, bg_checksum);
2793	__u16 crc = 0;
2794	__le32 le_group = cpu_to_le32(block_group);
2795	struct ext4_sb_info *sbi = EXT4_SB(sb);
2796
2797	if (ext4_has_metadata_csum(sbi->s_sb)) {
2798		/* Use new metadata_csum algorithm */
2799		__u32 csum32;
2800		__u16 dummy_csum = 0;
2801
2802		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
2803				     sizeof(le_group));
2804		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
2805		csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
2806				     sizeof(dummy_csum));
2807		offset += sizeof(dummy_csum);
2808		if (offset < sbi->s_desc_size)
2809			csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
2810					     sbi->s_desc_size - offset);
2811
2812		crc = csum32 & 0xFFFF;
2813		goto out;
2814	}
2815
2816	/* old crc16 code */
2817	if (!ext4_has_feature_gdt_csum(sb))
2818		return 0;
2819
2820	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2821	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2822	crc = crc16(crc, (__u8 *)gdp, offset);
2823	offset += sizeof(gdp->bg_checksum); /* skip checksum */
2824	/* for checksum of struct ext4_group_desc do the rest...*/
2825	if (ext4_has_feature_64bit(sb) &&
2826	    offset < le16_to_cpu(sbi->s_es->s_desc_size))
2827		crc = crc16(crc, (__u8 *)gdp + offset,
2828			    le16_to_cpu(sbi->s_es->s_desc_size) -
2829				offset);
2830
2831out:
2832	return cpu_to_le16(crc);
2833}
2834
2835int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
2836				struct ext4_group_desc *gdp)
2837{
2838	if (ext4_has_group_desc_csum(sb) &&
2839	    (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
2840		return 0;
2841
2842	return 1;
2843}
2844
2845void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2846			      struct ext4_group_desc *gdp)
2847{
2848	if (!ext4_has_group_desc_csum(sb))
2849		return;
2850	gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
2851}
2852
2853/* Called at mount-time, super-block is locked */
2854static int ext4_check_descriptors(struct super_block *sb,
2855				  ext4_fsblk_t sb_block,
2856				  ext4_group_t *first_not_zeroed)
2857{
2858	struct ext4_sb_info *sbi = EXT4_SB(sb);
2859	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2860	ext4_fsblk_t last_block;
2861	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
2862	ext4_fsblk_t block_bitmap;
2863	ext4_fsblk_t inode_bitmap;
2864	ext4_fsblk_t inode_table;
2865	int flexbg_flag = 0;
2866	ext4_group_t i, grp = sbi->s_groups_count;
2867
2868	if (ext4_has_feature_flex_bg(sb))
2869		flexbg_flag = 1;
2870
2871	ext4_debug("Checking group descriptors");
2872
2873	for (i = 0; i < sbi->s_groups_count; i++) {
2874		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2875
2876		if (i == sbi->s_groups_count - 1 || flexbg_flag)
2877			last_block = ext4_blocks_count(sbi->s_es) - 1;
2878		else
2879			last_block = first_block +
2880				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
2881
2882		if ((grp == sbi->s_groups_count) &&
2883		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2884			grp = i;
2885
2886		block_bitmap = ext4_block_bitmap(sb, gdp);
2887		if (block_bitmap == sb_block) {
2888			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2889				 "Block bitmap for group %u overlaps "
2890				 "superblock", i);
2891			if (!sb_rdonly(sb))
2892				return 0;
2893		}
2894		if (block_bitmap >= sb_block + 1 &&
2895		    block_bitmap <= last_bg_block) {
2896			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2897				 "Block bitmap for group %u overlaps "
2898				 "block group descriptors", i);
2899			if (!sb_rdonly(sb))
2900				return 0;
2901		}
2902		if (block_bitmap < first_block || block_bitmap > last_block) {
2903			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2904			       "Block bitmap for group %u not in group "
2905			       "(block %llu)!", i, block_bitmap);
2906			return 0;
2907		}
2908		inode_bitmap = ext4_inode_bitmap(sb, gdp);
2909		if (inode_bitmap == sb_block) {
2910			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2911				 "Inode bitmap for group %u overlaps "
2912				 "superblock", i);
2913			if (!sb_rdonly(sb))
2914				return 0;
2915		}
2916		if (inode_bitmap >= sb_block + 1 &&
2917		    inode_bitmap <= last_bg_block) {
2918			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2919				 "Inode bitmap for group %u overlaps "
2920				 "block group descriptors", i);
2921			if (!sb_rdonly(sb))
2922				return 0;
2923		}
2924		if (inode_bitmap < first_block || inode_bitmap > last_block) {
2925			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2926			       "Inode bitmap for group %u not in group "
2927			       "(block %llu)!", i, inode_bitmap);
2928			return 0;
2929		}
2930		inode_table = ext4_inode_table(sb, gdp);
2931		if (inode_table == sb_block) {
2932			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2933				 "Inode table for group %u overlaps "
2934				 "superblock", i);
2935			if (!sb_rdonly(sb))
2936				return 0;
2937		}
2938		if (inode_table >= sb_block + 1 &&
2939		    inode_table <= last_bg_block) {
2940			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2941				 "Inode table for group %u overlaps "
2942				 "block group descriptors", i);
2943			if (!sb_rdonly(sb))
2944				return 0;
2945		}
2946		if (inode_table < first_block ||
2947		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
2948			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2949			       "Inode table for group %u not in group "
2950			       "(block %llu)!", i, inode_table);
2951			return 0;
2952		}
2953		ext4_lock_group(sb, i);
2954		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2955			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2956				 "Checksum for group %u failed (%u!=%u)",
2957				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
2958				     gdp)), le16_to_cpu(gdp->bg_checksum));
2959			if (!sb_rdonly(sb)) {
2960				ext4_unlock_group(sb, i);
2961				return 0;
2962			}
2963		}
2964		ext4_unlock_group(sb, i);
2965		if (!flexbg_flag)
2966			first_block += EXT4_BLOCKS_PER_GROUP(sb);
2967	}
2968	if (NULL != first_not_zeroed)
2969		*first_not_zeroed = grp;
2970	return 1;
2971}
2972
2973/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
2974 * the superblock) which were deleted from all directories, but held open by
2975 * a process at the time of a crash.  We walk the list and try to delete these
2976 * inodes at recovery time (only with a read-write filesystem).
2977 *
2978 * In order to keep the orphan inode chain consistent during traversal (in
2979 * case of crash during recovery), we link each inode into the superblock
2980 * orphan list_head and handle it the same way as an inode deletion during
2981 * normal operation (which journals the operations for us).
2982 *
2983 * We only do an iget() and an iput() on each inode, which is very safe if we
2984 * accidentally point at an in-use or already deleted inode.  The worst that
2985 * can happen in this case is that we get a "bit already cleared" message from
2986 * ext4_free_inode().  The only reason we would point at a wrong inode is if
2987 * e2fsck was run on this filesystem, and it must have already done the orphan
2988 * inode cleanup for us, so we can safely abort without any further action.
2989 */
2990static void ext4_orphan_cleanup(struct super_block *sb,
2991				struct ext4_super_block *es)
2992{
2993	unsigned int s_flags = sb->s_flags;
2994	int ret, nr_orphans = 0, nr_truncates = 0;
2995#ifdef CONFIG_QUOTA
2996	int quota_update = 0;
2997	int i;
2998#endif
2999	if (!es->s_last_orphan) {
3000		jbd_debug(4, "no orphan inodes to clean up\n");
3001		return;
3002	}
3003
3004	if (bdev_read_only(sb->s_bdev)) {
3005		ext4_msg(sb, KERN_ERR, "write access "
3006			"unavailable, skipping orphan cleanup");
3007		return;
3008	}
3009
3010	/* Check if feature set would not allow a r/w mount */
3011	if (!ext4_feature_set_ok(sb, 0)) {
3012		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
3013			 "unknown ROCOMPAT features");
3014		return;
3015	}
3016
3017	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
3018		/* don't clear list on RO mount w/ errors */
3019		if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
3020			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
3021				  "clearing orphan list.\n");
3022			es->s_last_orphan = 0;
3023		}
3024		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
3025		return;
3026	}
3027
3028	if (s_flags & SB_RDONLY) {
3029		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
3030		sb->s_flags &= ~SB_RDONLY;
3031	}
3032#ifdef CONFIG_QUOTA
3033	/*
3034	 * Turn on quotas which were not enabled for read-only mounts if
3035	 * filesystem has quota feature, so that they are updated correctly.
3036	 */
3037	if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
3038		int ret = ext4_enable_quotas(sb);
3039
3040		if (!ret)
3041			quota_update = 1;
3042		else
3043			ext4_msg(sb, KERN_ERR,
3044				"Cannot turn on quotas: error %d", ret);
3045	}
3046
3047	/* Turn on journaled quotas used for old sytle */
3048	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
3049		if (EXT4_SB(sb)->s_qf_names[i]) {
3050			int ret = ext4_quota_on_mount(sb, i);
3051
3052			if (!ret)
3053				quota_update = 1;
3054			else
3055				ext4_msg(sb, KERN_ERR,
3056					"Cannot turn on journaled "
3057					"quota: type %d: error %d", i, ret);
3058		}
3059	}
3060#endif
3061
3062	while (es->s_last_orphan) {
3063		struct inode *inode;
3064
3065		/*
3066		 * We may have encountered an error during cleanup; if
3067		 * so, skip the rest.
3068		 */
3069		if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
3070			jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
3071			es->s_last_orphan = 0;
3072			break;
3073		}
3074
3075		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
3076		if (IS_ERR(inode)) {
3077			es->s_last_orphan = 0;
3078			break;
3079		}
3080
3081		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
3082		dquot_initialize(inode);
3083		if (inode->i_nlink) {
3084			if (test_opt(sb, DEBUG))
3085				ext4_msg(sb, KERN_DEBUG,
3086					"%s: truncating inode %lu to %lld bytes",
3087					__func__, inode->i_ino, inode->i_size);
3088			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
3089				  inode->i_ino, inode->i_size);
3090			inode_lock(inode);
3091			truncate_inode_pages(inode->i_mapping, inode->i_size);
3092			ret = ext4_truncate(inode);
3093			if (ret) {
3094				/*
3095				 * We need to clean up the in-core orphan list
3096				 * manually if ext4_truncate() failed to get a
3097				 * transaction handle.
3098				 */
3099				ext4_orphan_del(NULL, inode);
3100				ext4_std_error(inode->i_sb, ret);
3101			}
3102			inode_unlock(inode);
3103			nr_truncates++;
3104		} else {
3105			if (test_opt(sb, DEBUG))
3106				ext4_msg(sb, KERN_DEBUG,
3107					"%s: deleting unreferenced inode %lu",
3108					__func__, inode->i_ino);
3109			jbd_debug(2, "deleting unreferenced inode %lu\n",
3110				  inode->i_ino);
3111			nr_orphans++;
3112		}
3113		iput(inode);  /* The delete magic happens here! */
3114	}
3115
3116#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
3117
3118	if (nr_orphans)
3119		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
3120		       PLURAL(nr_orphans));
3121	if (nr_truncates)
3122		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
3123		       PLURAL(nr_truncates));
3124#ifdef CONFIG_QUOTA
3125	/* Turn off quotas if they were enabled for orphan cleanup */
3126	if (quota_update) {
3127		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
3128			if (sb_dqopt(sb)->files[i])
3129				dquot_quota_off(sb, i);
3130		}
3131	}
3132#endif
3133	sb->s_flags = s_flags; /* Restore SB_RDONLY status */
3134}
3135
3136/*
3137 * Maximal extent format file size.
3138 * Resulting logical blkno at s_maxbytes must fit in our on-disk
3139 * extent format containers, within a sector_t, and within i_blocks
3140 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
3141 * so that won't be a limiting factor.
3142 *
3143 * However there is other limiting factor. We do store extents in the form
3144 * of starting block and length, hence the resulting length of the extent
3145 * covering maximum file size must fit into on-disk format containers as
3146 * well. Given that length is always by 1 unit bigger than max unit (because
3147 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
3148 *
3149 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
3150 */
3151static loff_t ext4_max_size(int blkbits, int has_huge_files)
3152{
3153	loff_t res;
3154	loff_t upper_limit = MAX_LFS_FILESIZE;
3155
3156	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
3157
3158	if (!has_huge_files) {
3159		upper_limit = (1LL << 32) - 1;
3160
3161		/* total blocks in file system block size */
3162		upper_limit >>= (blkbits - 9);
3163		upper_limit <<= blkbits;
3164	}
3165
3166	/*
3167	 * 32-bit extent-start container, ee_block. We lower the maxbytes
3168	 * by one fs block, so ee_len can cover the extent of maximum file
3169	 * size
3170	 */
3171	res = (1LL << 32) - 1;
3172	res <<= blkbits;
3173
3174	/* Sanity check against vm- & vfs- imposed limits */
3175	if (res > upper_limit)
3176		res = upper_limit;
3177
3178	return res;
3179}
3180
3181/*
3182 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
3183 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
3184 * We need to be 1 filesystem block less than the 2^48 sector limit.
3185 */
3186static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
3187{
3188	loff_t res = EXT4_NDIR_BLOCKS;
3189	int meta_blocks;
3190	loff_t upper_limit;
3191	/* This is calculated to be the largest file size for a dense, block
3192	 * mapped file such that the file's total number of 512-byte sectors,
3193	 * including data and all indirect blocks, does not exceed (2^48 - 1).
3194	 *
3195	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
3196	 * number of 512-byte sectors of the file.
3197	 */
3198
3199	if (!has_huge_files) {
3200		/*
3201		 * !has_huge_files or implies that the inode i_block field
3202		 * represents total file blocks in 2^32 512-byte sectors ==
3203		 * size of vfs inode i_blocks * 8
3204		 */
3205		upper_limit = (1LL << 32) - 1;
3206
3207		/* total blocks in file system block size */
3208		upper_limit >>= (bits - 9);
3209
3210	} else {
3211		/*
3212		 * We use 48 bit ext4_inode i_blocks
3213		 * With EXT4_HUGE_FILE_FL set the i_blocks
3214		 * represent total number of blocks in
3215		 * file system block size
3216		 */
3217		upper_limit = (1LL << 48) - 1;
3218
3219	}
3220
3221	/* indirect blocks */
3222	meta_blocks = 1;
3223	/* double indirect blocks */
3224	meta_blocks += 1 + (1LL << (bits-2));
3225	/* tripple indirect blocks */
3226	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
3227
3228	upper_limit -= meta_blocks;
3229	upper_limit <<= bits;
3230
3231	res += 1LL << (bits-2);
3232	res += 1LL << (2*(bits-2));
3233	res += 1LL << (3*(bits-2));
3234	res <<= bits;
3235	if (res > upper_limit)
3236		res = upper_limit;
3237
3238	if (res > MAX_LFS_FILESIZE)
3239		res = MAX_LFS_FILESIZE;
3240
3241	return res;
3242}
3243
3244static ext4_fsblk_t descriptor_loc(struct super_block *sb,
3245				   ext4_fsblk_t logical_sb_block, int nr)
3246{
3247	struct ext4_sb_info *sbi = EXT4_SB(sb);
3248	ext4_group_t bg, first_meta_bg;
3249	int has_super = 0;
3250
3251	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
3252
3253	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
3254		return logical_sb_block + nr + 1;
3255	bg = sbi->s_desc_per_block * nr;
3256	if (ext4_bg_has_super(sb, bg))
3257		has_super = 1;
3258
3259	/*
3260	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
3261	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
3262	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
3263	 * compensate.
3264	 */
3265	if (sb->s_blocksize == 1024 && nr == 0 &&
3266	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
3267		has_super++;
3268
3269	return (has_super + ext4_group_first_block_no(sb, bg));
3270}
3271
3272/**
3273 * ext4_get_stripe_size: Get the stripe size.
3274 * @sbi: In memory super block info
3275 *
3276 * If we have specified it via mount option, then
3277 * use the mount option value. If the value specified at mount time is
3278 * greater than the blocks per group use the super block value.
3279 * If the super block value is greater than blocks per group return 0.
3280 * Allocator needs it be less than blocks per group.
3281 *
3282 */
3283static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
3284{
3285	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
3286	unsigned long stripe_width =
3287			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
3288	int ret;
3289
3290	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
3291		ret = sbi->s_stripe;
3292	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
3293		ret = stripe_width;
3294	else if (stride && stride <= sbi->s_blocks_per_group)
3295		ret = stride;
3296	else
3297		ret = 0;
3298
3299	/*
3300	 * If the stripe width is 1, this makes no sense and
3301	 * we set it to 0 to turn off stripe handling code.
3302	 */
3303	if (ret <= 1)
3304		ret = 0;
3305
3306	return ret;
3307}
3308
3309/*
3310 * Check whether this filesystem can be mounted based on
3311 * the features present and the RDONLY/RDWR mount requested.
3312 * Returns 1 if this filesystem can be mounted as requested,
3313 * 0 if it cannot be.
3314 */
3315static int ext4_feature_set_ok(struct super_block *sb, int readonly)
3316{
3317	if (ext4_has_unknown_ext4_incompat_features(sb)) {
3318		ext4_msg(sb, KERN_ERR,
3319			"Couldn't mount because of "
3320			"unsupported optional features (%x)",
3321			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
3322			~EXT4_FEATURE_INCOMPAT_SUPP));
3323		return 0;
3324	}
3325
3326#ifndef CONFIG_UNICODE
3327	if (ext4_has_feature_casefold(sb)) {
3328		ext4_msg(sb, KERN_ERR,
3329			 "Filesystem with casefold feature cannot be "
3330			 "mounted without CONFIG_UNICODE");
3331		return 0;
3332	}
3333#endif
3334
3335	if (readonly)
3336		return 1;
3337
3338	if (ext4_has_feature_readonly(sb)) {
3339		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
3340		sb->s_flags |= SB_RDONLY;
3341		return 1;
3342	}
3343
3344	/* Check that feature set is OK for a read-write mount */
3345	if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
3346		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
3347			 "unsupported optional features (%x)",
3348			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
3349				~EXT4_FEATURE_RO_COMPAT_SUPP));
3350		return 0;
3351	}
3352	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
3353		ext4_msg(sb, KERN_ERR,
3354			 "Can't support bigalloc feature without "
3355			 "extents feature\n");
3356		return 0;
3357	}
3358
3359#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
3360	if (!readonly && (ext4_has_feature_quota(sb) ||
3361			  ext4_has_feature_project(sb))) {
3362		ext4_msg(sb, KERN_ERR,
3363			 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
3364		return 0;
3365	}
3366#endif  /* CONFIG_QUOTA */
3367	return 1;
3368}
3369
3370/*
3371 * This function is called once a day if we have errors logged
3372 * on the file system
3373 */
3374static void print_daily_error_info(struct timer_list *t)
3375{
3376	struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
3377	struct super_block *sb = sbi->s_sb;
3378	struct ext4_super_block *es = sbi->s_es;
3379
3380	if (es->s_error_count)
3381		/* fsck newer than v1.41.13 is needed to clean this condition. */
3382		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
3383			 le32_to_cpu(es->s_error_count));
3384	if (es->s_first_error_time) {
3385		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
3386		       sb->s_id,
3387		       ext4_get_tstamp(es, s_first_error_time),
3388		       (int) sizeof(es->s_first_error_func),
3389		       es->s_first_error_func,
3390		       le32_to_cpu(es->s_first_error_line));
3391		if (es->s_first_error_ino)
3392			printk(KERN_CONT ": inode %u",
3393			       le32_to_cpu(es->s_first_error_ino));
3394		if (es->s_first_error_block)
3395			printk(KERN_CONT ": block %llu", (unsigned long long)
3396			       le64_to_cpu(es->s_first_error_block));
3397		printk(KERN_CONT "\n");
3398	}
3399	if (es->s_last_error_time) {
3400		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
3401		       sb->s_id,
3402		       ext4_get_tstamp(es, s_last_error_time),
3403		       (int) sizeof(es->s_last_error_func),
3404		       es->s_last_error_func,
3405		       le32_to_cpu(es->s_last_error_line));
3406		if (es->s_last_error_ino)
3407			printk(KERN_CONT ": inode %u",
3408			       le32_to_cpu(es->s_last_error_ino));
3409		if (es->s_last_error_block)
3410			printk(KERN_CONT ": block %llu", (unsigned long long)
3411			       le64_to_cpu(es->s_last_error_block));
3412		printk(KERN_CONT "\n");
3413	}
3414	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
3415}
3416
3417/* Find next suitable group and run ext4_init_inode_table */
3418static int ext4_run_li_request(struct ext4_li_request *elr)
3419{
3420	struct ext4_group_desc *gdp = NULL;
3421	struct super_block *sb = elr->lr_super;
3422	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3423	ext4_group_t group = elr->lr_next_group;
3424	unsigned long timeout = 0;
3425	unsigned int prefetch_ios = 0;
3426	int ret = 0;
3427
3428	if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
3429		elr->lr_next_group = ext4_mb_prefetch(sb, group,
3430				EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
3431		if (prefetch_ios)
3432			ext4_mb_prefetch_fini(sb, elr->lr_next_group,
3433					      prefetch_ios);
3434		trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
3435					    prefetch_ios);
3436		if (group >= elr->lr_next_group) {
3437			ret = 1;
3438			if (elr->lr_first_not_zeroed != ngroups &&
3439			    !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
3440				elr->lr_next_group = elr->lr_first_not_zeroed;
3441				elr->lr_mode = EXT4_LI_MODE_ITABLE;
3442				ret = 0;
3443			}
3444		}
3445		return ret;
3446	}
3447
3448	for (; group < ngroups; group++) {
3449		gdp = ext4_get_group_desc(sb, group, NULL);
3450		if (!gdp) {
3451			ret = 1;
3452			break;
3453		}
3454
3455		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3456			break;
3457	}
3458
3459	if (group >= ngroups)
3460		ret = 1;
3461
3462	if (!ret) {
3463		timeout = jiffies;
3464		ret = ext4_init_inode_table(sb, group,
3465					    elr->lr_timeout ? 0 : 1);
3466		trace_ext4_lazy_itable_init(sb, group);
3467		if (elr->lr_timeout == 0) {
3468			timeout = (jiffies - timeout) *
3469				EXT4_SB(elr->lr_super)->s_li_wait_mult;
3470			elr->lr_timeout = timeout;
3471		}
3472		elr->lr_next_sched = jiffies + elr->lr_timeout;
3473		elr->lr_next_group = group + 1;
3474	}
3475	return ret;
3476}
3477
3478/*
3479 * Remove lr_request from the list_request and free the
3480 * request structure. Should be called with li_list_mtx held
3481 */
3482static void ext4_remove_li_request(struct ext4_li_request *elr)
3483{
3484	if (!elr)
3485		return;
3486
3487	list_del(&elr->lr_request);
3488	EXT4_SB(elr->lr_super)->s_li_request = NULL;
3489	kfree(elr);
3490}
3491
3492static void ext4_unregister_li_request(struct super_block *sb)
3493{
3494	mutex_lock(&ext4_li_mtx);
3495	if (!ext4_li_info) {
3496		mutex_unlock(&ext4_li_mtx);
3497		return;
3498	}
3499
3500	mutex_lock(&ext4_li_info->li_list_mtx);
3501	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3502	mutex_unlock(&ext4_li_info->li_list_mtx);
3503	mutex_unlock(&ext4_li_mtx);
3504}
3505
3506static struct task_struct *ext4_lazyinit_task;
3507
3508/*
3509 * This is the function where ext4lazyinit thread lives. It walks
3510 * through the request list searching for next scheduled filesystem.
3511 * When such a fs is found, run the lazy initialization request
3512 * (ext4_rn_li_request) and keep track of the time spend in this
3513 * function. Based on that time we compute next schedule time of
3514 * the request. When walking through the list is complete, compute
3515 * next waking time and put itself into sleep.
3516 */
3517static int ext4_lazyinit_thread(void *arg)
3518{
3519	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
3520	struct list_head *pos, *n;
3521	struct ext4_li_request *elr;
3522	unsigned long next_wakeup, cur;
3523
3524	BUG_ON(NULL == eli);
3525
3526cont_thread:
3527	while (true) {
3528		next_wakeup = MAX_JIFFY_OFFSET;
3529
3530		mutex_lock(&eli->li_list_mtx);
3531		if (list_empty(&eli->li_request_list)) {
3532			mutex_unlock(&eli->li_list_mtx);
3533			goto exit_thread;
3534		}
3535		list_for_each_safe(pos, n, &eli->li_request_list) {
3536			int err = 0;
3537			int progress = 0;
3538			elr = list_entry(pos, struct ext4_li_request,
3539					 lr_request);
3540
3541			if (time_before(jiffies, elr->lr_next_sched)) {
3542				if (time_before(elr->lr_next_sched, next_wakeup))
3543					next_wakeup = elr->lr_next_sched;
3544				continue;
3545			}
3546			if (down_read_trylock(&elr->lr_super->s_umount)) {
3547				if (sb_start_write_trylock(elr->lr_super)) {
3548					progress = 1;
3549					/*
3550					 * We hold sb->s_umount, sb can not
3551					 * be removed from the list, it is
3552					 * now safe to drop li_list_mtx
3553					 */
3554					mutex_unlock(&eli->li_list_mtx);
3555					err = ext4_run_li_request(elr);
3556					sb_end_write(elr->lr_super);
3557					mutex_lock(&eli->li_list_mtx);
3558					n = pos->next;
3559				}
3560				up_read((&elr->lr_super->s_umount));
3561			}
3562			/* error, remove the lazy_init job */
3563			if (err) {
3564				ext4_remove_li_request(elr);
3565				continue;
3566			}
3567			if (!progress) {
3568				elr->lr_next_sched = jiffies +
3569					(prandom_u32()
3570					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
3571			}
3572			if (time_before(elr->lr_next_sched, next_wakeup))
3573				next_wakeup = elr->lr_next_sched;
3574		}
3575		mutex_unlock(&eli->li_list_mtx);
3576
3577		try_to_freeze();
3578
3579		cur = jiffies;
3580		if ((time_after_eq(cur, next_wakeup)) ||
3581		    (MAX_JIFFY_OFFSET == next_wakeup)) {
3582			cond_resched();
3583			continue;
3584		}
3585
3586		schedule_timeout_interruptible(next_wakeup - cur);
3587
3588		if (kthread_should_stop()) {
3589			ext4_clear_request_list();
3590			goto exit_thread;
3591		}
3592	}
3593
3594exit_thread:
3595	/*
3596	 * It looks like the request list is empty, but we need
3597	 * to check it under the li_list_mtx lock, to prevent any
3598	 * additions into it, and of course we should lock ext4_li_mtx
3599	 * to atomically free the list and ext4_li_info, because at
3600	 * this point another ext4 filesystem could be registering
3601	 * new one.
3602	 */
3603	mutex_lock(&ext4_li_mtx);
3604	mutex_lock(&eli->li_list_mtx);
3605	if (!list_empty(&eli->li_request_list)) {
3606		mutex_unlock(&eli->li_list_mtx);
3607		mutex_unlock(&ext4_li_mtx);
3608		goto cont_thread;
3609	}
3610	mutex_unlock(&eli->li_list_mtx);
3611	kfree(ext4_li_info);
3612	ext4_li_info = NULL;
3613	mutex_unlock(&ext4_li_mtx);
3614
3615	return 0;
3616}
3617
3618static void ext4_clear_request_list(void)
3619{
3620	struct list_head *pos, *n;
3621	struct ext4_li_request *elr;
3622
3623	mutex_lock(&ext4_li_info->li_list_mtx);
3624	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3625		elr = list_entry(pos, struct ext4_li_request,
3626				 lr_request);
3627		ext4_remove_li_request(elr);
3628	}
3629	mutex_unlock(&ext4_li_info->li_list_mtx);
3630}
3631
3632static int ext4_run_lazyinit_thread(void)
3633{
3634	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3635					 ext4_li_info, "ext4lazyinit");
3636	if (IS_ERR(ext4_lazyinit_task)) {
3637		int err = PTR_ERR(ext4_lazyinit_task);
3638		ext4_clear_request_list();
3639		kfree(ext4_li_info);
3640		ext4_li_info = NULL;
3641		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3642				 "initialization thread\n",
3643				 err);
3644		return err;
3645	}
3646	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3647	return 0;
3648}
3649
3650/*
3651 * Check whether it make sense to run itable init. thread or not.
3652 * If there is at least one uninitialized inode table, return
3653 * corresponding group number, else the loop goes through all
3654 * groups and return total number of groups.
3655 */
3656static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3657{
3658	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3659	struct ext4_group_desc *gdp = NULL;
3660
3661	if (!ext4_has_group_desc_csum(sb))
3662		return ngroups;
3663
3664	for (group = 0; group < ngroups; group++) {
3665		gdp = ext4_get_group_desc(sb, group, NULL);
3666		if (!gdp)
3667			continue;
3668
3669		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3670			break;
3671	}
3672
3673	return group;
3674}
3675
3676static int ext4_li_info_new(void)
3677{
3678	struct ext4_lazy_init *eli = NULL;
3679
3680	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3681	if (!eli)
3682		return -ENOMEM;
3683
3684	INIT_LIST_HEAD(&eli->li_request_list);
3685	mutex_init(&eli->li_list_mtx);
3686
3687	eli->li_state |= EXT4_LAZYINIT_QUIT;
3688
3689	ext4_li_info = eli;
3690
3691	return 0;
3692}
3693
3694static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3695					    ext4_group_t start)
3696{
3697	struct ext4_li_request *elr;
3698
3699	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3700	if (!elr)
3701		return NULL;
3702
3703	elr->lr_super = sb;
3704	elr->lr_first_not_zeroed = start;
3705	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
3706		elr->lr_mode = EXT4_LI_MODE_ITABLE;
3707		elr->lr_next_group = start;
3708	} else {
3709		elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
3710	}
3711
3712	/*
3713	 * Randomize first schedule time of the request to
3714	 * spread the inode table initialization requests
3715	 * better.
3716	 */
3717	elr->lr_next_sched = jiffies + (prandom_u32() %
3718				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
3719	return elr;
3720}
3721
3722int ext4_register_li_request(struct super_block *sb,
3723			     ext4_group_t first_not_zeroed)
3724{
3725	struct ext4_sb_info *sbi = EXT4_SB(sb);
3726	struct ext4_li_request *elr = NULL;
3727	ext4_group_t ngroups = sbi->s_groups_count;
3728	int ret = 0;
3729
3730	mutex_lock(&ext4_li_mtx);
3731	if (sbi->s_li_request != NULL) {
3732		/*
3733		 * Reset timeout so it can be computed again, because
3734		 * s_li_wait_mult might have changed.
3735		 */
3736		sbi->s_li_request->lr_timeout = 0;
3737		goto out;
3738	}
3739
3740	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
3741	    (first_not_zeroed == ngroups || sb_rdonly(sb) ||
3742	     !test_opt(sb, INIT_INODE_TABLE)))
3743		goto out;
3744
3745	elr = ext4_li_request_new(sb, first_not_zeroed);
3746	if (!elr) {
3747		ret = -ENOMEM;
3748		goto out;
3749	}
3750
3751	if (NULL == ext4_li_info) {
3752		ret = ext4_li_info_new();
3753		if (ret)
3754			goto out;
3755	}
3756
3757	mutex_lock(&ext4_li_info->li_list_mtx);
3758	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3759	mutex_unlock(&ext4_li_info->li_list_mtx);
3760
3761	sbi->s_li_request = elr;
3762	/*
3763	 * set elr to NULL here since it has been inserted to
3764	 * the request_list and the removal and free of it is
3765	 * handled by ext4_clear_request_list from now on.
3766	 */
3767	elr = NULL;
3768
3769	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3770		ret = ext4_run_lazyinit_thread();
3771		if (ret)
3772			goto out;
3773	}
3774out:
3775	mutex_unlock(&ext4_li_mtx);
3776	if (ret)
3777		kfree(elr);
3778	return ret;
3779}
3780
3781/*
3782 * We do not need to lock anything since this is called on
3783 * module unload.
3784 */
3785static void ext4_destroy_lazyinit_thread(void)
3786{
3787	/*
3788	 * If thread exited earlier
3789	 * there's nothing to be done.
3790	 */
3791	if (!ext4_li_info || !ext4_lazyinit_task)
3792		return;
3793
3794	kthread_stop(ext4_lazyinit_task);
3795}
3796
3797static int set_journal_csum_feature_set(struct super_block *sb)
3798{
3799	int ret = 1;
3800	int compat, incompat;
3801	struct ext4_sb_info *sbi = EXT4_SB(sb);
3802
3803	if (ext4_has_metadata_csum(sb)) {
3804		/* journal checksum v3 */
3805		compat = 0;
3806		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3807	} else {
3808		/* journal checksum v1 */
3809		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3810		incompat = 0;
3811	}
3812
3813	jbd2_journal_clear_features(sbi->s_journal,
3814			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3815			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3816			JBD2_FEATURE_INCOMPAT_CSUM_V2);
3817	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3818		ret = jbd2_journal_set_features(sbi->s_journal,
3819				compat, 0,
3820				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3821				incompat);
3822	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3823		ret = jbd2_journal_set_features(sbi->s_journal,
3824				compat, 0,
3825				incompat);
3826		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3827				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3828	} else {
3829		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3830				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3831	}
3832
3833	return ret;
3834}
3835
3836/*
3837 * Note: calculating the overhead so we can be compatible with
3838 * historical BSD practice is quite difficult in the face of
3839 * clusters/bigalloc.  This is because multiple metadata blocks from
3840 * different block group can end up in the same allocation cluster.
3841 * Calculating the exact overhead in the face of clustered allocation
3842 * requires either O(all block bitmaps) in memory or O(number of block
3843 * groups**2) in time.  We will still calculate the superblock for
3844 * older file systems --- and if we come across with a bigalloc file
3845 * system with zero in s_overhead_clusters the estimate will be close to
3846 * correct especially for very large cluster sizes --- but for newer
3847 * file systems, it's better to calculate this figure once at mkfs
3848 * time, and store it in the superblock.  If the superblock value is
3849 * present (even for non-bigalloc file systems), we will use it.
3850 */
3851static int count_overhead(struct super_block *sb, ext4_group_t grp,
3852			  char *buf)
3853{
3854	struct ext4_sb_info	*sbi = EXT4_SB(sb);
3855	struct ext4_group_desc	*gdp;
3856	ext4_fsblk_t		first_block, last_block, b;
3857	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
3858	int			s, j, count = 0;
3859
3860	if (!ext4_has_feature_bigalloc(sb))
3861		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3862			sbi->s_itb_per_group + 2);
3863
3864	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3865		(grp * EXT4_BLOCKS_PER_GROUP(sb));
3866	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
3867	for (i = 0; i < ngroups; i++) {
3868		gdp = ext4_get_group_desc(sb, i, NULL);
3869		b = ext4_block_bitmap(sb, gdp);
3870		if (b >= first_block && b <= last_block) {
3871			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3872			count++;
3873		}
3874		b = ext4_inode_bitmap(sb, gdp);
3875		if (b >= first_block && b <= last_block) {
3876			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3877			count++;
3878		}
3879		b = ext4_inode_table(sb, gdp);
3880		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
3881			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
3882				int c = EXT4_B2C(sbi, b - first_block);
3883				ext4_set_bit(c, buf);
3884				count++;
3885			}
3886		if (i != grp)
3887			continue;
3888		s = 0;
3889		if (ext4_bg_has_super(sb, grp)) {
3890			ext4_set_bit(s++, buf);
3891			count++;
3892		}
3893		j = ext4_bg_num_gdb(sb, grp);
3894		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
3895			ext4_error(sb, "Invalid number of block group "
3896				   "descriptor blocks: %d", j);
3897			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
3898		}
3899		count += j;
3900		for (; j > 0; j--)
3901			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
3902	}
3903	if (!count)
3904		return 0;
3905	return EXT4_CLUSTERS_PER_GROUP(sb) -
3906		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
3907}
3908
3909/*
3910 * Compute the overhead and stash it in sbi->s_overhead
3911 */
3912int ext4_calculate_overhead(struct super_block *sb)
3913{
3914	struct ext4_sb_info *sbi = EXT4_SB(sb);
3915	struct ext4_super_block *es = sbi->s_es;
3916	struct inode *j_inode;
3917	unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
3918	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3919	ext4_fsblk_t overhead = 0;
3920	char *buf = (char *) get_zeroed_page(GFP_NOFS);
3921
3922	if (!buf)
3923		return -ENOMEM;
3924
3925	/*
3926	 * Compute the overhead (FS structures).  This is constant
3927	 * for a given filesystem unless the number of block groups
3928	 * changes so we cache the previous value until it does.
3929	 */
3930
3931	/*
3932	 * All of the blocks before first_data_block are overhead
3933	 */
3934	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
3935
3936	/*
3937	 * Add the overhead found in each block group
3938	 */
3939	for (i = 0; i < ngroups; i++) {
3940		int blks;
3941
3942		blks = count_overhead(sb, i, buf);
3943		overhead += blks;
3944		if (blks)
3945			memset(buf, 0, PAGE_SIZE);
3946		cond_resched();
3947	}
3948
3949	/*
3950	 * Add the internal journal blocks whether the journal has been
3951	 * loaded or not
3952	 */
3953	if (sbi->s_journal && !sbi->s_journal_bdev)
3954		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
3955	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
3956		/* j_inum for internal journal is non-zero */
3957		j_inode = ext4_get_journal_inode(sb, j_inum);
3958		if (j_inode) {
3959			j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
3960			overhead += EXT4_NUM_B2C(sbi, j_blocks);
3961			iput(j_inode);
3962		} else {
3963			ext4_msg(sb, KERN_ERR, "can't get journal size");
3964		}
3965	}
3966	sbi->s_overhead = overhead;
3967	smp_wmb();
3968	free_page((unsigned long) buf);
3969	return 0;
3970}
3971
3972static void ext4_set_resv_clusters(struct super_block *sb)
3973{
3974	ext4_fsblk_t resv_clusters;
3975	struct ext4_sb_info *sbi = EXT4_SB(sb);
3976
3977	/*
3978	 * There's no need to reserve anything when we aren't using extents.
3979	 * The space estimates are exact, there are no unwritten extents,
3980	 * hole punching doesn't need new metadata... This is needed especially
3981	 * to keep ext2/3 backward compatibility.
3982	 */
3983	if (!ext4_has_feature_extents(sb))
3984		return;
3985	/*
3986	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
3987	 * This should cover the situations where we can not afford to run
3988	 * out of space like for example punch hole, or converting
3989	 * unwritten extents in delalloc path. In most cases such
3990	 * allocation would require 1, or 2 blocks, higher numbers are
3991	 * very rare.
3992	 */
3993	resv_clusters = (ext4_blocks_count(sbi->s_es) >>
3994			 sbi->s_cluster_bits);
3995
3996	do_div(resv_clusters, 50);
3997	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3998
3999	atomic64_set(&sbi->s_resv_clusters, resv_clusters);
4000}
4001
/*
 * Return a constant string naming the quota mode of @sb: "disabled"
 * when the kernel is built without CONFIG_QUOTA, otherwise "none",
 * "journalled" or "writeback".
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	const char *mode = "writeback";

	if (!ext4_quota_capable(sb))
		mode = "none";
	else if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb))
		mode = "journalled";

	return mode;
#else
	return "disabled";
#endif
}
4016
4017static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4018{
4019	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
4020	char *orig_data = kstrdup(data, GFP_KERNEL);
4021	struct buffer_head *bh, **group_desc;
4022	struct ext4_super_block *es = NULL;
4023	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
4024	struct flex_groups **flex_groups;
4025	ext4_fsblk_t block;
4026	ext4_fsblk_t sb_block = get_sb_block(&data);
4027	ext4_fsblk_t logical_sb_block;
4028	unsigned long offset = 0;
4029	unsigned long def_mount_opts;
4030	struct inode *root;
4031	const char *descr;
4032	int ret = -ENOMEM;
4033	int blocksize, clustersize;
4034	unsigned int db_count;
4035	unsigned int i;
4036	int needs_recovery, has_huge_files;
4037	__u64 blocks_count;
4038	int err = 0;
4039	ext4_group_t first_not_zeroed;
4040	struct ext4_parsed_options parsed_opts;
4041
4042	/* Set defaults for the variables that will be set during parsing */
4043	parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4044	parsed_opts.journal_devnum = 0;
4045	parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
4046
4047	if ((data && !orig_data) || !sbi)
4048		goto out_free_base;
4049
4050	sbi->s_daxdev = dax_dev;
4051	sbi->s_blockgroup_lock =
4052		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
4053	if (!sbi->s_blockgroup_lock)
4054		goto out_free_base;
4055
4056	sb->s_fs_info = sbi;
4057	sbi->s_sb = sb;
4058	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
4059	sbi->s_sb_block = sb_block;
4060	sbi->s_sectors_written_start =
4061		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
4062
4063	/* Cleanup superblock name */
4064	strreplace(sb->s_id, '/', '!');
4065
4066	/* -EINVAL is default */
4067	ret = -EINVAL;
4068	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
4069	if (!blocksize) {
4070		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
4071		goto out_fail;
4072	}
4073
4074	/*
4075	 * The ext4 superblock will not be buffer aligned for other than 1kB
4076	 * block sizes.  We need to calculate the offset from buffer start.
4077	 */
4078	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
4079		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
4080		offset = do_div(logical_sb_block, blocksize);
4081	} else {
4082		logical_sb_block = sb_block;
4083	}
4084
4085	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
4086	if (IS_ERR(bh)) {
4087		ext4_msg(sb, KERN_ERR, "unable to read superblock");
4088		ret = PTR_ERR(bh);
4089		goto out_fail;
4090	}
4091	/*
4092	 * Note: s_es must be initialized as soon as possible because
4093	 *       some ext4 macro-instructions depend on its value
4094	 */
4095	es = (struct ext4_super_block *) (bh->b_data + offset);
4096	sbi->s_es = es;
4097	sb->s_magic = le16_to_cpu(es->s_magic);
4098	if (sb->s_magic != EXT4_SUPER_MAGIC)
4099		goto cantfind_ext4;
4100	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
4101
4102	/* Warn if metadata_csum and gdt_csum are both set. */
4103	if (ext4_has_feature_metadata_csum(sb) &&
4104	    ext4_has_feature_gdt_csum(sb))
4105		ext4_warning(sb, "metadata_csum and uninit_bg are "
4106			     "redundant flags; please run fsck.");
4107
4108	/* Check for a known checksum algorithm */
4109	if (!ext4_verify_csum_type(sb, es)) {
4110		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4111			 "unknown checksum algorithm.");
4112		silent = 1;
4113		goto cantfind_ext4;
4114	}
4115
4116	/* Load the checksum driver */
4117	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
4118	if (IS_ERR(sbi->s_chksum_driver)) {
4119		ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
4120		ret = PTR_ERR(sbi->s_chksum_driver);
4121		sbi->s_chksum_driver = NULL;
4122		goto failed_mount;
4123	}
4124
4125	/* Check superblock checksum */
4126	if (!ext4_superblock_csum_verify(sb, es)) {
4127		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4128			 "invalid superblock checksum.  Run e2fsck?");
4129		silent = 1;
4130		ret = -EFSBADCRC;
4131		goto cantfind_ext4;
4132	}
4133
4134	/* Precompute checksum seed for all metadata */
4135	if (ext4_has_feature_csum_seed(sb))
4136		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
4137	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
4138		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
4139					       sizeof(es->s_uuid));
4140
4141	/* Set defaults before we parse the mount options */
4142	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
4143	set_opt(sb, INIT_INODE_TABLE);
4144	if (def_mount_opts & EXT4_DEFM_DEBUG)
4145		set_opt(sb, DEBUG);
4146	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
4147		set_opt(sb, GRPID);
4148	if (def_mount_opts & EXT4_DEFM_UID16)
4149		set_opt(sb, NO_UID32);
4150	/* xattr user namespace & acls are now defaulted on */
4151	set_opt(sb, XATTR_USER);
4152#ifdef CONFIG_EXT4_FS_POSIX_ACL
4153	set_opt(sb, POSIX_ACL);
4154#endif
4155	if (ext4_has_feature_fast_commit(sb))
4156		set_opt2(sb, JOURNAL_FAST_COMMIT);
4157	/* don't forget to enable journal_csum when metadata_csum is enabled. */
4158	if (ext4_has_metadata_csum(sb))
4159		set_opt(sb, JOURNAL_CHECKSUM);
4160
4161	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
4162		set_opt(sb, JOURNAL_DATA);
4163	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
4164		set_opt(sb, ORDERED_DATA);
4165	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
4166		set_opt(sb, WRITEBACK_DATA);
4167
4168	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
4169		set_opt(sb, ERRORS_PANIC);
4170	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
4171		set_opt(sb, ERRORS_CONT);
4172	else
4173		set_opt(sb, ERRORS_RO);
4174	/* block_validity enabled by default; disable with noblock_validity */
4175	set_opt(sb, BLOCK_VALIDITY);
4176	if (def_mount_opts & EXT4_DEFM_DISCARD)
4177		set_opt(sb, DISCARD);
4178
4179	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
4180	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
4181	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
4182	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
4183	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
4184
4185	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
4186		set_opt(sb, BARRIER);
4187
4188	/*
4189	 * enable delayed allocation by default
4190	 * Use -o nodelalloc to turn it off
4191	 */
4192	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
4193	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
4194		set_opt(sb, DELALLOC);
4195
4196	/*
4197	 * set default s_li_wait_mult for lazyinit, for the case there is
4198	 * no mount option specified.
4199	 */
4200	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
4201
4202	if (le32_to_cpu(es->s_log_block_size) >
4203	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
4204		ext4_msg(sb, KERN_ERR,
4205			 "Invalid log block size: %u",
4206			 le32_to_cpu(es->s_log_block_size));
4207		goto failed_mount;
4208	}
4209	if (le32_to_cpu(es->s_log_cluster_size) >
4210	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
4211		ext4_msg(sb, KERN_ERR,
4212			 "Invalid log cluster size: %u",
4213			 le32_to_cpu(es->s_log_cluster_size));
4214		goto failed_mount;
4215	}
4216
4217	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
4218
4219	if (blocksize == PAGE_SIZE)
4220		set_opt(sb, DIOREAD_NOLOCK);
4221
4222	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
4223		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
4224		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
4225	} else {
4226		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
4227		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
4228		if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
4229			ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
4230				 sbi->s_first_ino);
4231			goto failed_mount;
4232		}
4233		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
4234		    (!is_power_of_2(sbi->s_inode_size)) ||
4235		    (sbi->s_inode_size > blocksize)) {
4236			ext4_msg(sb, KERN_ERR,
4237			       "unsupported inode size: %d",
4238			       sbi->s_inode_size);
4239			ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
4240			goto failed_mount;
4241		}
4242		/*
4243		 * i_atime_extra is the last extra field available for
4244		 * [acm]times in struct ext4_inode. Checking for that
4245		 * field should suffice to ensure we have extra space
4246		 * for all three.
4247		 */
4248		if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
4249			sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
4250			sb->s_time_gran = 1;
4251			sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
4252		} else {
4253			sb->s_time_gran = NSEC_PER_SEC;
4254			sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
4255		}
4256		sb->s_time_min = EXT4_TIMESTAMP_MIN;
4257	}
4258	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4259		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4260			EXT4_GOOD_OLD_INODE_SIZE;
4261		if (ext4_has_feature_extra_isize(sb)) {
4262			unsigned v, max = (sbi->s_inode_size -
4263					   EXT4_GOOD_OLD_INODE_SIZE);
4264
4265			v = le16_to_cpu(es->s_want_extra_isize);
4266			if (v > max) {
4267				ext4_msg(sb, KERN_ERR,
4268					 "bad s_want_extra_isize: %d", v);
4269				goto failed_mount;
4270			}
4271			if (sbi->s_want_extra_isize < v)
4272				sbi->s_want_extra_isize = v;
4273
4274			v = le16_to_cpu(es->s_min_extra_isize);
4275			if (v > max) {
4276				ext4_msg(sb, KERN_ERR,
4277					 "bad s_min_extra_isize: %d", v);
4278				goto failed_mount;
4279			}
4280			if (sbi->s_want_extra_isize < v)
4281				sbi->s_want_extra_isize = v;
4282		}
4283	}
4284
4285	if (sbi->s_es->s_mount_opts[0]) {
4286		char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
4287					      sizeof(sbi->s_es->s_mount_opts),
4288					      GFP_KERNEL);
4289		if (!s_mount_opts)
4290			goto failed_mount;
4291		if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) {
4292			ext4_msg(sb, KERN_WARNING,
4293				 "failed to parse options in superblock: %s",
4294				 s_mount_opts);
4295		}
4296		kfree(s_mount_opts);
4297	}
4298	sbi->s_def_mount_opt = sbi->s_mount_opt;
4299	if (!parse_options((char *) data, sb, &parsed_opts, 0))
4300		goto failed_mount;
4301
4302#ifdef CONFIG_UNICODE
4303	if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
4304		const struct ext4_sb_encodings *encoding_info;
4305		struct unicode_map *encoding;
4306		__u16 encoding_flags;
4307
4308		if (ext4_sb_read_encoding(es, &encoding_info,
4309					  &encoding_flags)) {
4310			ext4_msg(sb, KERN_ERR,
4311				 "Encoding requested by superblock is unknown");
4312			goto failed_mount;
4313		}
4314
4315		encoding = utf8_load(encoding_info->version);
4316		if (IS_ERR(encoding)) {
4317			ext4_msg(sb, KERN_ERR,
4318				 "can't mount with superblock charset: %s-%s "
4319				 "not supported by the kernel. flags: 0x%x.",
4320				 encoding_info->name, encoding_info->version,
4321				 encoding_flags);
4322			goto failed_mount;
4323		}
4324		ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
4325			 "%s-%s with flags 0x%hx", encoding_info->name,
4326			 encoding_info->version?:"\b", encoding_flags);
4327
4328		sb->s_encoding = encoding;
4329		sb->s_encoding_flags = encoding_flags;
4330	}
4331#endif
4332
4333	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4334		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
4335		/* can't mount with both data=journal and dioread_nolock. */
4336		clear_opt(sb, DIOREAD_NOLOCK);
4337		clear_opt2(sb, JOURNAL_FAST_COMMIT);
4338		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4339			ext4_msg(sb, KERN_ERR, "can't mount with "
4340				 "both data=journal and delalloc");
4341			goto failed_mount;
4342		}
4343		if (test_opt(sb, DAX_ALWAYS)) {
4344			ext4_msg(sb, KERN_ERR, "can't mount with "
4345				 "both data=journal and dax");
4346			goto failed_mount;
4347		}
4348		if (ext4_has_feature_encrypt(sb)) {
4349			ext4_msg(sb, KERN_WARNING,
4350				 "encrypted files will use data=ordered "
4351				 "instead of data journaling mode");
4352		}
4353		if (test_opt(sb, DELALLOC))
4354			clear_opt(sb, DELALLOC);
4355	} else {
4356		sb->s_iflags |= SB_I_CGROUPWB;
4357	}
4358
4359	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
4360		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
4361
4362	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
4363	    (ext4_has_compat_features(sb) ||
4364	     ext4_has_ro_compat_features(sb) ||
4365	     ext4_has_incompat_features(sb)))
4366		ext4_msg(sb, KERN_WARNING,
4367		       "feature flags set on rev 0 fs, "
4368		       "running e2fsck is recommended");
4369
4370	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
4371		set_opt2(sb, HURD_COMPAT);
4372		if (ext4_has_feature_64bit(sb)) {
4373			ext4_msg(sb, KERN_ERR,
4374				 "The Hurd can't support 64-bit file systems");
4375			goto failed_mount;
4376		}
4377
4378		/*
4379		 * ea_inode feature uses l_i_version field which is not
4380		 * available in HURD_COMPAT mode.
4381		 */
4382		if (ext4_has_feature_ea_inode(sb)) {
4383			ext4_msg(sb, KERN_ERR,
4384				 "ea_inode feature is not supported for Hurd");
4385			goto failed_mount;
4386		}
4387	}
4388
4389	if (IS_EXT2_SB(sb)) {
4390		if (ext2_feature_set_ok(sb))
4391			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
4392				 "using the ext4 subsystem");
4393		else {
4394			/*
4395			 * If we're probing be silent, if this looks like
4396			 * it's actually an ext[34] filesystem.
4397			 */
4398			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4399				goto failed_mount;
4400			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
4401				 "to feature incompatibilities");
4402			goto failed_mount;
4403		}
4404	}
4405
4406	if (IS_EXT3_SB(sb)) {
4407		if (ext3_feature_set_ok(sb))
4408			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
4409				 "using the ext4 subsystem");
4410		else {
4411			/*
4412			 * If we're probing be silent, if this looks like
4413			 * it's actually an ext4 filesystem.
4414			 */
4415			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4416				goto failed_mount;
4417			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
4418				 "to feature incompatibilities");
4419			goto failed_mount;
4420		}
4421	}
4422
4423	/*
4424	 * Check feature flags regardless of the revision level, since we
4425	 * previously didn't change the revision level when setting the flags,
4426	 * so there is a chance incompat flags are set on a rev 0 filesystem.
4427	 */
4428	if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
4429		goto failed_mount;
4430
4431	if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
4432		ext4_msg(sb, KERN_ERR,
4433			 "Number of reserved GDT blocks insanely large: %d",
4434			 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
4435		goto failed_mount;
4436	}
4437
4438	if (bdev_dax_supported(sb->s_bdev, blocksize))
4439		set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
4440
4441	if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
4442		if (ext4_has_feature_inline_data(sb)) {
4443			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
4444					" that may contain inline data");
4445			goto failed_mount;
4446		}
4447		if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
4448			ext4_msg(sb, KERN_ERR,
4449				"DAX unsupported by block device.");
4450			goto failed_mount;
4451		}
4452	}
4453
4454	if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
4455		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
4456			 es->s_encryption_level);
4457		goto failed_mount;
4458	}
4459
4460	if (sb->s_blocksize != blocksize) {
4461		/*
4462		 * bh must be released before kill_bdev(), otherwise
4463		 * it won't be freed and its page also. kill_bdev()
4464		 * is called by sb_set_blocksize().
4465		 */
4466		brelse(bh);
4467		/* Validate the filesystem blocksize */
4468		if (!sb_set_blocksize(sb, blocksize)) {
4469			ext4_msg(sb, KERN_ERR, "bad block size %d",
4470					blocksize);
4471			bh = NULL;
4472			goto failed_mount;
4473		}
4474
4475		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
4476		offset = do_div(logical_sb_block, blocksize);
4477		bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
4478		if (IS_ERR(bh)) {
4479			ext4_msg(sb, KERN_ERR,
4480			       "Can't read superblock on 2nd try");
4481			ret = PTR_ERR(bh);
4482			bh = NULL;
4483			goto failed_mount;
4484		}
4485		es = (struct ext4_super_block *)(bh->b_data + offset);
4486		sbi->s_es = es;
4487		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
4488			ext4_msg(sb, KERN_ERR,
4489			       "Magic mismatch, very weird!");
4490			goto failed_mount;
4491		}
4492	}
4493
4494	has_huge_files = ext4_has_feature_huge_file(sb);
4495	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
4496						      has_huge_files);
4497	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
4498
4499	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
4500	if (ext4_has_feature_64bit(sb)) {
4501		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
4502		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
4503		    !is_power_of_2(sbi->s_desc_size)) {
4504			ext4_msg(sb, KERN_ERR,
4505			       "unsupported descriptor size %lu",
4506			       sbi->s_desc_size);
4507			goto failed_mount;
4508		}
4509	} else
4510		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
4511
4512	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
4513	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
4514
4515	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
4516	if (sbi->s_inodes_per_block == 0)
4517		goto cantfind_ext4;
4518	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
4519	    sbi->s_inodes_per_group > blocksize * 8) {
4520		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
4521			 sbi->s_inodes_per_group);
4522		goto failed_mount;
4523	}
4524	sbi->s_itb_per_group = sbi->s_inodes_per_group /
4525					sbi->s_inodes_per_block;
4526	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
4527	sbi->s_sbh = bh;
4528	sbi->s_mount_state = le16_to_cpu(es->s_state);
4529	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
4530	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
4531
4532	for (i = 0; i < 4; i++)
4533		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
4534	sbi->s_def_hash_version = es->s_def_hash_version;
4535	if (ext4_has_feature_dir_index(sb)) {
4536		i = le32_to_cpu(es->s_flags);
4537		if (i & EXT2_FLAGS_UNSIGNED_HASH)
4538			sbi->s_hash_unsigned = 3;
4539		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
4540#ifdef __CHAR_UNSIGNED__
4541			if (!sb_rdonly(sb))
4542				es->s_flags |=
4543					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
4544			sbi->s_hash_unsigned = 3;
4545#else
4546			if (!sb_rdonly(sb))
4547				es->s_flags |=
4548					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
4549#endif
4550		}
4551	}
4552
4553	/* Handle clustersize */
4554	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4555	if (ext4_has_feature_bigalloc(sb)) {
4556		if (clustersize < blocksize) {
4557			ext4_msg(sb, KERN_ERR,
4558				 "cluster size (%d) smaller than "
4559				 "block size (%d)", clustersize, blocksize);
4560			goto failed_mount;
4561		}
4562		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4563			le32_to_cpu(es->s_log_block_size);
4564		sbi->s_clusters_per_group =
4565			le32_to_cpu(es->s_clusters_per_group);
4566		if (sbi->s_clusters_per_group > blocksize * 8) {
4567			ext4_msg(sb, KERN_ERR,
4568				 "#clusters per group too big: %lu",
4569				 sbi->s_clusters_per_group);
4570			goto failed_mount;
4571		}
4572		if (sbi->s_blocks_per_group !=
4573		    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
4574			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
4575				 "clusters per group (%lu) inconsistent",
4576				 sbi->s_blocks_per_group,
4577				 sbi->s_clusters_per_group);
4578			goto failed_mount;
4579		}
4580	} else {
4581		if (clustersize != blocksize) {
4582			ext4_msg(sb, KERN_ERR,
4583				 "fragment/cluster size (%d) != "
4584				 "block size (%d)", clustersize, blocksize);
4585			goto failed_mount;
4586		}
4587		if (sbi->s_blocks_per_group > blocksize * 8) {
4588			ext4_msg(sb, KERN_ERR,
4589				 "#blocks per group too big: %lu",
4590				 sbi->s_blocks_per_group);
4591			goto failed_mount;
4592		}
4593		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
4594		sbi->s_cluster_bits = 0;
4595	}
4596	sbi->s_cluster_ratio = clustersize / blocksize;
4597
4598	/* Do we have standard group size of clustersize * 8 blocks ? */
4599	if (sbi->s_blocks_per_group == clustersize << 3)
4600		set_opt2(sb, STD_GROUP_SIZE);
4601
4602	/*
4603	 * Test whether we have more sectors than will fit in sector_t,
4604	 * and whether the max offset is addressable by the page cache.
4605	 */
4606	err = generic_check_addressable(sb->s_blocksize_bits,
4607					ext4_blocks_count(es));
4608	if (err) {
4609		ext4_msg(sb, KERN_ERR, "filesystem"
4610			 " too large to mount safely on this system");
4611		goto failed_mount;
4612	}
4613
4614	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
4615		goto cantfind_ext4;
4616
4617	/* check blocks count against device size */
4618	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
4619	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4620		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4621		       "exceeds size of device (%llu blocks)",
4622		       ext4_blocks_count(es), blocks_count);
4623		goto failed_mount;
4624	}
4625
4626	/*
4627	 * It makes no sense for the first data block to be beyond the end
4628	 * of the filesystem.
4629	 */
4630	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4631		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4632			 "block %u is beyond end of filesystem (%llu)",
4633			 le32_to_cpu(es->s_first_data_block),
4634			 ext4_blocks_count(es));
4635		goto failed_mount;
4636	}
4637	if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4638	    (sbi->s_cluster_ratio == 1)) {
4639		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4640			 "block is 0 with a 1k block and cluster size");
4641		goto failed_mount;
4642	}
4643
4644	blocks_count = (ext4_blocks_count(es) -
4645			le32_to_cpu(es->s_first_data_block) +
4646			EXT4_BLOCKS_PER_GROUP(sb) - 1);
4647	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4648	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4649		ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
4650		       "(block count %llu, first data block %u, "
4651		       "blocks per group %lu)", blocks_count,
4652		       ext4_blocks_count(es),
4653		       le32_to_cpu(es->s_first_data_block),
4654		       EXT4_BLOCKS_PER_GROUP(sb));
4655		goto failed_mount;
4656	}
4657	sbi->s_groups_count = blocks_count;
4658	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4659			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4660	if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4661	    le32_to_cpu(es->s_inodes_count)) {
4662		ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4663			 le32_to_cpu(es->s_inodes_count),
4664			 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4665		ret = -EINVAL;
4666		goto failed_mount;
4667	}
4668	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4669		   EXT4_DESC_PER_BLOCK(sb);
4670	if (ext4_has_feature_meta_bg(sb)) {
4671		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
4672			ext4_msg(sb, KERN_WARNING,
4673				 "first meta block group too large: %u "
4674				 "(group descriptor block count %u)",
4675				 le32_to_cpu(es->s_first_meta_bg), db_count);
4676			goto failed_mount;
4677		}
4678	}
4679	rcu_assign_pointer(sbi->s_group_desc,
4680			   kvmalloc_array(db_count,
4681					  sizeof(struct buffer_head *),
4682					  GFP_KERNEL));
4683	if (sbi->s_group_desc == NULL) {
4684		ext4_msg(sb, KERN_ERR, "not enough memory");
4685		ret = -ENOMEM;
4686		goto failed_mount;
4687	}
4688
4689	bgl_lock_init(sbi->s_blockgroup_lock);
4690
4691	/* Pre-read the descriptors into the buffer cache */
4692	for (i = 0; i < db_count; i++) {
4693		block = descriptor_loc(sb, logical_sb_block, i);
4694		ext4_sb_breadahead_unmovable(sb, block);
4695	}
4696
4697	for (i = 0; i < db_count; i++) {
4698		struct buffer_head *bh;
4699
4700		block = descriptor_loc(sb, logical_sb_block, i);
4701		bh = ext4_sb_bread_unmovable(sb, block);
4702		if (IS_ERR(bh)) {
4703			ext4_msg(sb, KERN_ERR,
4704			       "can't read group descriptor %d", i);
4705			db_count = i;
4706			ret = PTR_ERR(bh);
4707			goto failed_mount2;
4708		}
4709		rcu_read_lock();
4710		rcu_dereference(sbi->s_group_desc)[i] = bh;
4711		rcu_read_unlock();
4712	}
4713	sbi->s_gdb_count = db_count;
4714	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
4715		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
4716		ret = -EFSCORRUPTED;
4717		goto failed_mount2;
4718	}
4719
4720	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
4721	spin_lock_init(&sbi->s_error_lock);
4722	INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
4723
4724	/* Register extent status tree shrinker */
4725	if (ext4_es_register_shrinker(sbi))
4726		goto failed_mount3;
4727
4728	sbi->s_stripe = ext4_get_stripe_size(sbi);
4729	sbi->s_extent_max_zeroout_kb = 32;
4730
4731	/*
4732	 * set up enough so that it can read an inode
4733	 */
4734	sb->s_op = &ext4_sops;
4735	sb->s_export_op = &ext4_export_ops;
4736	sb->s_xattr = ext4_xattr_handlers;
4737#ifdef CONFIG_FS_ENCRYPTION
4738	sb->s_cop = &ext4_cryptops;
4739#endif
4740#ifdef CONFIG_FS_VERITY
4741	sb->s_vop = &ext4_verityops;
4742#endif
4743#ifdef CONFIG_QUOTA
4744	sb->dq_op = &ext4_quota_operations;
4745	if (ext4_has_feature_quota(sb))
4746		sb->s_qcop = &dquot_quotactl_sysfile_ops;
4747	else
4748		sb->s_qcop = &ext4_qctl_operations;
4749	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
4750#endif
4751	memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
4752
4753	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
4754	mutex_init(&sbi->s_orphan_lock);
4755
4756	/* Initialize fast commit stuff */
4757	atomic_set(&sbi->s_fc_subtid, 0);
4758	atomic_set(&sbi->s_fc_ineligible_updates, 0);
4759	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
4760	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
4761	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
4762	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
4763	sbi->s_fc_bytes = 0;
4764	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
4765	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
4766	spin_lock_init(&sbi->s_fc_lock);
4767	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
4768	sbi->s_fc_replay_state.fc_regions = NULL;
4769	sbi->s_fc_replay_state.fc_regions_size = 0;
4770	sbi->s_fc_replay_state.fc_regions_used = 0;
4771	sbi->s_fc_replay_state.fc_regions_valid = 0;
4772	sbi->s_fc_replay_state.fc_modified_inodes = NULL;
4773	sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
4774	sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
4775
4776	sb->s_root = NULL;
4777
4778	needs_recovery = (es->s_last_orphan != 0 ||
4779			  ext4_has_feature_journal_needs_recovery(sb));
4780
4781	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
4782		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
4783			goto failed_mount3a;
4784
4785	/*
4786	 * The first inode we look at is the journal inode.  Don't try
4787	 * root first: it may be modified in the journal!
4788	 */
4789	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
4790		err = ext4_load_journal(sb, es, parsed_opts.journal_devnum);
4791		if (err)
4792			goto failed_mount3a;
4793	} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
4794		   ext4_has_feature_journal_needs_recovery(sb)) {
4795		ext4_msg(sb, KERN_ERR, "required journal recovery "
4796		       "suppressed and not mounted read-only");
4797		goto failed_mount_wq;
4798	} else {
4799		/* Nojournal mode, all journal mount options are illegal */
4800		if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
4801			ext4_msg(sb, KERN_ERR, "can't mount with "
4802				 "journal_checksum, fs mounted w/o journal");
4803			goto failed_mount_wq;
4804		}
4805		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4806			ext4_msg(sb, KERN_ERR, "can't mount with "
4807				 "journal_async_commit, fs mounted w/o journal");
4808			goto failed_mount_wq;
4809		}
4810		if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
4811			ext4_msg(sb, KERN_ERR, "can't mount with "
4812				 "commit=%lu, fs mounted w/o journal",
4813				 sbi->s_commit_interval / HZ);
4814			goto failed_mount_wq;
4815		}
4816		if (EXT4_MOUNT_DATA_FLAGS &
4817		    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
4818			ext4_msg(sb, KERN_ERR, "can't mount with "
4819				 "data=, fs mounted w/o journal");
4820			goto failed_mount_wq;
4821		}
4822		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
4823		clear_opt(sb, JOURNAL_CHECKSUM);
4824		clear_opt(sb, DATA_FLAGS);
4825		clear_opt2(sb, JOURNAL_FAST_COMMIT);
4826		sbi->s_journal = NULL;
4827		needs_recovery = 0;
4828		goto no_journal;
4829	}
4830
4831	if (ext4_has_feature_64bit(sb) &&
4832	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4833				       JBD2_FEATURE_INCOMPAT_64BIT)) {
4834		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4835		goto failed_mount_wq;
4836	}
4837
4838	if (!set_journal_csum_feature_set(sb)) {
4839		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4840			 "feature set");
4841		goto failed_mount_wq;
4842	}
4843
4844	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
4845		!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4846					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
4847		ext4_msg(sb, KERN_ERR,
4848			"Failed to set fast commit journal feature");
4849		goto failed_mount_wq;
4850	}
4851
4852	/* We have now updated the journal if required, so we can
4853	 * validate the data journaling mode. */
4854	switch (test_opt(sb, DATA_FLAGS)) {
4855	case 0:
4856		/* No mode set, assume a default based on the journal
4857		 * capabilities: ORDERED_DATA if the journal can
4858		 * cope, else JOURNAL_DATA
4859		 */
4860		if (jbd2_journal_check_available_features
4861		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4862			set_opt(sb, ORDERED_DATA);
4863			sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4864		} else {
4865			set_opt(sb, JOURNAL_DATA);
4866			sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4867		}
4868		break;
4869
4870	case EXT4_MOUNT_ORDERED_DATA:
4871	case EXT4_MOUNT_WRITEBACK_DATA:
4872		if (!jbd2_journal_check_available_features
4873		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4874			ext4_msg(sb, KERN_ERR, "Journal does not support "
4875			       "requested data journaling mode");
4876			goto failed_mount_wq;
4877		}
4878		break;
4879	default:
4880		break;
4881	}
4882
4883	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
4884	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4885		ext4_msg(sb, KERN_ERR, "can't mount with "
4886			"journal_async_commit in data=ordered mode");
4887		goto failed_mount_wq;
4888	}
4889
4890	set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
4891
4892	sbi->s_journal->j_submit_inode_data_buffers =
4893		ext4_journal_submit_inode_data_buffers;
4894	sbi->s_journal->j_finish_inode_data_buffers =
4895		ext4_journal_finish_inode_data_buffers;
4896
4897no_journal:
4898	if (!test_opt(sb, NO_MBCACHE)) {
4899		sbi->s_ea_block_cache = ext4_xattr_create_cache();
4900		if (!sbi->s_ea_block_cache) {
4901			ext4_msg(sb, KERN_ERR,
4902				 "Failed to create ea_block_cache");
4903			goto failed_mount_wq;
4904		}
4905
4906		if (ext4_has_feature_ea_inode(sb)) {
4907			sbi->s_ea_inode_cache = ext4_xattr_create_cache();
4908			if (!sbi->s_ea_inode_cache) {
4909				ext4_msg(sb, KERN_ERR,
4910					 "Failed to create ea_inode_cache");
4911				goto failed_mount_wq;
4912			}
4913		}
4914	}
4915
4916	if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
4917		ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
4918		goto failed_mount_wq;
4919	}
4920
4921	if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
4922	    !ext4_has_feature_encrypt(sb)) {
4923		ext4_set_feature_encrypt(sb);
4924		ext4_commit_super(sb);
4925	}
4926
4927	/*
4928	 * Get the # of file system overhead blocks from the
4929	 * superblock if present.
4930	 */
4931	if (es->s_overhead_clusters)
4932		sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
4933	else {
4934		err = ext4_calculate_overhead(sb);
4935		if (err)
4936			goto failed_mount_wq;
4937	}
4938
4939	/*
4940	 * The maximum number of concurrent works can be high and
4941	 * concurrency isn't really necessary.  Limit it to 1.
4942	 */
4943	EXT4_SB(sb)->rsv_conversion_wq =
4944		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
4945	if (!EXT4_SB(sb)->rsv_conversion_wq) {
4946		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
4947		ret = -ENOMEM;
4948		goto failed_mount4;
4949	}
4950
4951	/*
4952	 * The jbd2_journal_load will have done any necessary log recovery,
4953	 * so we can safely mount the rest of the filesystem now.
4954	 */
4955
4956	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
4957	if (IS_ERR(root)) {
4958		ext4_msg(sb, KERN_ERR, "get root inode failed");
4959		ret = PTR_ERR(root);
4960		root = NULL;
4961		goto failed_mount4;
4962	}
4963	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
4964		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
4965		iput(root);
4966		goto failed_mount4;
4967	}
4968
4969	sb->s_root = d_make_root(root);
4970	if (!sb->s_root) {
4971		ext4_msg(sb, KERN_ERR, "get root dentry failed");
4972		ret = -ENOMEM;
4973		goto failed_mount4;
4974	}
4975
4976	ret = ext4_setup_super(sb, es, sb_rdonly(sb));
4977	if (ret == -EROFS) {
4978		sb->s_flags |= SB_RDONLY;
4979		ret = 0;
4980	} else if (ret)
4981		goto failed_mount4a;
4982
4983	ext4_set_resv_clusters(sb);
4984
4985	if (test_opt(sb, BLOCK_VALIDITY)) {
4986		err = ext4_setup_system_zone(sb);
4987		if (err) {
4988			ext4_msg(sb, KERN_ERR, "failed to initialize system "
4989				 "zone (%d)", err);
4990			goto failed_mount4a;
4991		}
4992	}
4993	ext4_fc_replay_cleanup(sb);
4994
4995	ext4_ext_init(sb);
4996
4997	/*
4998	 * Enable optimize_scan if number of groups is > threshold. This can be
4999	 * turned off by passing "mb_optimize_scan=0". This can also be
5000	 * turned on forcefully by passing "mb_optimize_scan=1".
5001	 */
5002	if (parsed_opts.mb_optimize_scan == 1)
5003		set_opt2(sb, MB_OPTIMIZE_SCAN);
5004	else if (parsed_opts.mb_optimize_scan == 0)
5005		clear_opt2(sb, MB_OPTIMIZE_SCAN);
5006	else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
5007		set_opt2(sb, MB_OPTIMIZE_SCAN);
5008
5009	err = ext4_mb_init(sb);
5010	if (err) {
5011		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
5012			 err);
5013		goto failed_mount5;
5014	}
5015
5016	/*
5017	 * We can only set up the journal commit callback once
5018	 * mballoc is initialized
5019	 */
5020	if (sbi->s_journal)
5021		sbi->s_journal->j_commit_callback =
5022			ext4_journal_commit_callback;
5023
5024	block = ext4_count_free_clusters(sb);
5025	ext4_free_blocks_count_set(sbi->s_es,
5026				   EXT4_C2B(sbi, block));
5027	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
5028				  GFP_KERNEL);
5029	if (!err) {
5030		unsigned long freei = ext4_count_free_inodes(sb);
5031		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
5032		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
5033					  GFP_KERNEL);
5034	}
5035	if (!err)
5036		err = percpu_counter_init(&sbi->s_dirs_counter,
5037					  ext4_count_dirs(sb), GFP_KERNEL);
5038	if (!err)
5039		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
5040					  GFP_KERNEL);
5041	if (!err)
5042		err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
5043					  GFP_KERNEL);
5044	if (!err)
5045		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
5046
5047	if (err) {
5048		ext4_msg(sb, KERN_ERR, "insufficient memory");
5049		goto failed_mount6;
5050	}
5051
5052	if (ext4_has_feature_flex_bg(sb))
5053		if (!ext4_fill_flex_info(sb)) {
5054			ext4_msg(sb, KERN_ERR,
5055			       "unable to initialize "
5056			       "flex_bg meta info!");
5057			ret = -ENOMEM;
5058			goto failed_mount6;
5059		}
5060
5061	err = ext4_register_li_request(sb, first_not_zeroed);
5062	if (err)
5063		goto failed_mount6;
5064
5065	err = ext4_register_sysfs(sb);
5066	if (err)
5067		goto failed_mount7;
5068
5069#ifdef CONFIG_QUOTA
5070	/* Enable quota usage during mount. */
5071	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
5072		err = ext4_enable_quotas(sb);
5073		if (err)
5074			goto failed_mount8;
5075	}
5076#endif  /* CONFIG_QUOTA */
5077
5078	/*
5079	 * Save the original bdev mapping's wb_err value which could be
5080	 * used to detect the metadata async write error.
5081	 */
5082	spin_lock_init(&sbi->s_bdev_wb_lock);
5083	errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
5084				 &sbi->s_bdev_wb_err);
5085	sb->s_bdev->bd_super = sb;
5086	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
5087	ext4_orphan_cleanup(sb, es);
5088	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
5089	if (needs_recovery) {
5090		ext4_msg(sb, KERN_INFO, "recovery complete");
5091		err = ext4_mark_recovery_complete(sb, es);
5092		if (err)
5093			goto failed_mount8;
5094	}
5095	if (EXT4_SB(sb)->s_journal) {
5096		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
5097			descr = " journalled data mode";
5098		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
5099			descr = " ordered data mode";
5100		else
5101			descr = " writeback data mode";
5102	} else
5103		descr = "out journal";
5104
5105	if (test_opt(sb, DISCARD)) {
5106		struct request_queue *q = bdev_get_queue(sb->s_bdev);
5107		if (!blk_queue_discard(q))
5108			ext4_msg(sb, KERN_WARNING,
5109				 "mounting with \"discard\" option, but "
5110				 "the device does not support discard");
5111	}
5112
5113	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
5114		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
5115			 "Opts: %.*s%s%s. Quota mode: %s.", descr,
5116			 (int) sizeof(sbi->s_es->s_mount_opts),
5117			 sbi->s_es->s_mount_opts,
5118			 *sbi->s_es->s_mount_opts ? "; " : "", orig_data,
5119			 ext4_quota_mode(sb));
5120
5121	if (es->s_error_count)
5122		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
5123
5124	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
5125	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
5126	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
5127	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
5128	atomic_set(&sbi->s_warning_count, 0);
5129	atomic_set(&sbi->s_msg_count, 0);
5130
5131	kfree(orig_data);
5132	return 0;
5133
5134cantfind_ext4:
5135	if (!silent)
5136		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5137	goto failed_mount;
5138
5139failed_mount8:
5140	ext4_unregister_sysfs(sb);
5141	kobject_put(&sbi->s_kobj);
5142failed_mount7:
5143	ext4_unregister_li_request(sb);
5144failed_mount6:
5145	ext4_mb_release(sb);
5146	rcu_read_lock();
5147	flex_groups = rcu_dereference(sbi->s_flex_groups);
5148	if (flex_groups) {
5149		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
5150			kvfree(flex_groups[i]);
5151		kvfree(flex_groups);
5152	}
5153	rcu_read_unlock();
5154	percpu_counter_destroy(&sbi->s_freeclusters_counter);
5155	percpu_counter_destroy(&sbi->s_freeinodes_counter);
5156	percpu_counter_destroy(&sbi->s_dirs_counter);
5157	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
5158	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
5159	percpu_free_rwsem(&sbi->s_writepages_rwsem);
5160failed_mount5:
5161	ext4_ext_release(sb);
5162	ext4_release_system_zone(sb);
5163failed_mount4a:
5164	dput(sb->s_root);
5165	sb->s_root = NULL;
5166failed_mount4:
5167	ext4_msg(sb, KERN_ERR, "mount failed");
5168	if (EXT4_SB(sb)->rsv_conversion_wq)
5169		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
5170failed_mount_wq:
5171	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
5172	sbi->s_ea_inode_cache = NULL;
5173
5174	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
5175	sbi->s_ea_block_cache = NULL;
5176
5177	if (sbi->s_journal) {
5178		jbd2_journal_destroy(sbi->s_journal);
5179		sbi->s_journal = NULL;
5180	}
5181failed_mount3a:
5182	ext4_es_unregister_shrinker(sbi);
5183failed_mount3:
5184	flush_work(&sbi->s_error_work);
5185	del_timer_sync(&sbi->s_err_report);
5186	ext4_stop_mmpd(sbi);
5187failed_mount2:
5188	rcu_read_lock();
5189	group_desc = rcu_dereference(sbi->s_group_desc);
5190	for (i = 0; i < db_count; i++)
5191		brelse(group_desc[i]);
5192	kvfree(group_desc);
5193	rcu_read_unlock();
5194failed_mount:
5195	if (sbi->s_chksum_driver)
5196		crypto_free_shash(sbi->s_chksum_driver);
5197
5198#ifdef CONFIG_UNICODE
5199	utf8_unload(sb->s_encoding);
5200#endif
5201
5202#ifdef CONFIG_QUOTA
5203	for (i = 0; i < EXT4_MAXQUOTAS; i++)
5204		kfree(get_qf_name(sb, sbi, i));
5205#endif
5206	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
5207	/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
5208	brelse(bh);
5209	ext4_blkdev_remove(sbi);
5210out_fail:
5211	sb->s_fs_info = NULL;
5212	kfree(sbi->s_blockgroup_lock);
5213out_free_base:
5214	kfree(sbi);
5215	kfree(orig_data);
5216	fs_put_dax(dax_dev);
5217	return err ? err : ret;
5218}
5219
5220/*
5221 * Setup any per-fs journal parameters now.  We'll do this both on
5222 * initial mount, once the journal has been initialised but before we've
5223 * done any recovery; and again on any subsequent remount.
5224 */
5225static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
5226{
5227	struct ext4_sb_info *sbi = EXT4_SB(sb);
5228
5229	journal->j_commit_interval = sbi->s_commit_interval;
5230	journal->j_min_batch_time = sbi->s_min_batch_time;
5231	journal->j_max_batch_time = sbi->s_max_batch_time;
5232	ext4_fc_init(sb, journal);
5233
5234	write_lock(&journal->j_state_lock);
5235	if (test_opt(sb, BARRIER))
5236		journal->j_flags |= JBD2_BARRIER;
5237	else
5238		journal->j_flags &= ~JBD2_BARRIER;
5239	if (test_opt(sb, DATA_ERR_ABORT))
5240		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
5241	else
5242		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
5243	write_unlock(&journal->j_state_lock);
5244}
5245
5246static struct inode *ext4_get_journal_inode(struct super_block *sb,
5247					     unsigned int journal_inum)
5248{
5249	struct inode *journal_inode;
5250
5251	/*
5252	 * Test for the existence of a valid inode on disk.  Bad things
5253	 * happen if we iget() an unused inode, as the subsequent iput()
5254	 * will try to delete it.
5255	 */
5256	journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
5257	if (IS_ERR(journal_inode)) {
5258		ext4_msg(sb, KERN_ERR, "no journal found");
5259		return NULL;
5260	}
5261	if (!journal_inode->i_nlink) {
5262		make_bad_inode(journal_inode);
5263		iput(journal_inode);
5264		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
5265		return NULL;
5266	}
5267
5268	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
5269		  journal_inode, journal_inode->i_size);
5270	if (!S_ISREG(journal_inode->i_mode)) {
5271		ext4_msg(sb, KERN_ERR, "invalid journal inode");
5272		iput(journal_inode);
5273		return NULL;
5274	}
5275	return journal_inode;
5276}
5277
5278static journal_t *ext4_get_journal(struct super_block *sb,
5279				   unsigned int journal_inum)
5280{
5281	struct inode *journal_inode;
5282	journal_t *journal;
5283
5284	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5285		return NULL;
5286
5287	journal_inode = ext4_get_journal_inode(sb, journal_inum);
5288	if (!journal_inode)
5289		return NULL;
5290
5291	journal = jbd2_journal_init_inode(journal_inode);
5292	if (!journal) {
5293		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
5294		iput(journal_inode);
5295		return NULL;
5296	}
5297	journal->j_private = sb;
5298	ext4_init_journal_params(sb, journal);
5299	return journal;
5300}
5301
5302static journal_t *ext4_get_dev_journal(struct super_block *sb,
5303				       dev_t j_dev)
5304{
5305	struct buffer_head *bh;
5306	journal_t *journal;
5307	ext4_fsblk_t start;
5308	ext4_fsblk_t len;
5309	int hblock, blocksize;
5310	ext4_fsblk_t sb_block;
5311	unsigned long offset;
5312	struct ext4_super_block *es;
5313	struct block_device *bdev;
5314
5315	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5316		return NULL;
5317
5318	bdev = ext4_blkdev_get(j_dev, sb);
5319	if (bdev == NULL)
5320		return NULL;
5321
5322	blocksize = sb->s_blocksize;
5323	hblock = bdev_logical_block_size(bdev);
5324	if (blocksize < hblock) {
5325		ext4_msg(sb, KERN_ERR,
5326			"blocksize too small for journal device");
5327		goto out_bdev;
5328	}
5329
5330	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
5331	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
5332	set_blocksize(bdev, blocksize);
5333	if (!(bh = __bread(bdev, sb_block, blocksize))) {
5334		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
5335		       "external journal");
5336		goto out_bdev;
5337	}
5338
5339	es = (struct ext4_super_block *) (bh->b_data + offset);
5340	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
5341	    !(le32_to_cpu(es->s_feature_incompat) &
5342	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
5343		ext4_msg(sb, KERN_ERR, "external journal has "
5344					"bad superblock");
5345		brelse(bh);
5346		goto out_bdev;
5347	}
5348
5349	if ((le32_to_cpu(es->s_feature_ro_compat) &
5350	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
5351	    es->s_checksum != ext4_superblock_csum(sb, es)) {
5352		ext4_msg(sb, KERN_ERR, "external journal has "
5353				       "corrupt superblock");
5354		brelse(bh);
5355		goto out_bdev;
5356	}
5357
5358	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
5359		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
5360		brelse(bh);
5361		goto out_bdev;
5362	}
5363
5364	len = ext4_blocks_count(es);
5365	start = sb_block + 1;
5366	brelse(bh);	/* we're done with the superblock */
5367
5368	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
5369					start, len, blocksize);
5370	if (!journal) {
5371		ext4_msg(sb, KERN_ERR, "failed to create device journal");
5372		goto out_bdev;
5373	}
5374	journal->j_private = sb;
5375	if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
5376		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
5377		goto out_journal;
5378	}
5379	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
5380		ext4_msg(sb, KERN_ERR, "External journal has more than one "
5381					"user (unsupported) - %d",
5382			be32_to_cpu(journal->j_superblock->s_nr_users));
5383		goto out_journal;
5384	}
5385	EXT4_SB(sb)->s_journal_bdev = bdev;
5386	ext4_init_journal_params(sb, journal);
5387	return journal;
5388
5389out_journal:
5390	jbd2_journal_destroy(journal);
5391out_bdev:
5392	ext4_blkdev_put(bdev);
5393	return NULL;
5394}
5395
5396static int ext4_load_journal(struct super_block *sb,
5397			     struct ext4_super_block *es,
5398			     unsigned long journal_devnum)
5399{
5400	journal_t *journal;
5401	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
5402	dev_t journal_dev;
5403	int err = 0;
5404	int really_read_only;
5405	int journal_dev_ro;
5406
5407	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5408		return -EFSCORRUPTED;
5409
5410	if (journal_devnum &&
5411	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5412		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
5413			"numbers have changed");
5414		journal_dev = new_decode_dev(journal_devnum);
5415	} else
5416		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
5417
5418	if (journal_inum && journal_dev) {
5419		ext4_msg(sb, KERN_ERR,
5420			 "filesystem has both journal inode and journal device!");
5421		return -EINVAL;
5422	}
5423
5424	if (journal_inum) {
5425		journal = ext4_get_journal(sb, journal_inum);
5426		if (!journal)
5427			return -EINVAL;
5428	} else {
5429		journal = ext4_get_dev_journal(sb, journal_dev);
5430		if (!journal)
5431			return -EINVAL;
5432	}
5433
5434	journal_dev_ro = bdev_read_only(journal->j_dev);
5435	really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
5436
5437	if (journal_dev_ro && !sb_rdonly(sb)) {
5438		ext4_msg(sb, KERN_ERR,
5439			 "journal device read-only, try mounting with '-o ro'");
5440		err = -EROFS;
5441		goto err_out;
5442	}
5443
5444	/*
5445	 * Are we loading a blank journal or performing recovery after a
5446	 * crash?  For recovery, we need to check in advance whether we
5447	 * can get read-write access to the device.
5448	 */
5449	if (ext4_has_feature_journal_needs_recovery(sb)) {
5450		if (sb_rdonly(sb)) {
5451			ext4_msg(sb, KERN_INFO, "INFO: recovery "
5452					"required on readonly filesystem");
5453			if (really_read_only) {
5454				ext4_msg(sb, KERN_ERR, "write access "
5455					"unavailable, cannot proceed "
5456					"(try mounting with noload)");
5457				err = -EROFS;
5458				goto err_out;
5459			}
5460			ext4_msg(sb, KERN_INFO, "write access will "
5461			       "be enabled during recovery");
5462		}
5463	}
5464
5465	if (!(journal->j_flags & JBD2_BARRIER))
5466		ext4_msg(sb, KERN_INFO, "barriers disabled");
5467
5468	if (!ext4_has_feature_journal_needs_recovery(sb))
5469		err = jbd2_journal_wipe(journal, !really_read_only);
5470	if (!err) {
5471		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
5472		if (save)
5473			memcpy(save, ((char *) es) +
5474			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
5475		err = jbd2_journal_load(journal);
5476		if (save)
5477			memcpy(((char *) es) + EXT4_S_ERR_START,
5478			       save, EXT4_S_ERR_LEN);
5479		kfree(save);
5480	}
5481
5482	if (err) {
5483		ext4_msg(sb, KERN_ERR, "error loading journal");
5484		goto err_out;
5485	}
5486
5487	EXT4_SB(sb)->s_journal = journal;
5488	err = ext4_clear_journal_err(sb, es);
5489	if (err) {
5490		EXT4_SB(sb)->s_journal = NULL;
5491		jbd2_journal_destroy(journal);
5492		return err;
5493	}
5494
5495	if (!really_read_only && journal_devnum &&
5496	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5497		es->s_journal_dev = cpu_to_le32(journal_devnum);
5498
5499		/* Make sure we flush the recovery flag to disk. */
5500		ext4_commit_super(sb);
5501	}
5502
5503	return 0;
5504
5505err_out:
5506	jbd2_journal_destroy(journal);
5507	return err;
5508}
5509
5510/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
5511static void ext4_update_super(struct super_block *sb)
5512{
5513	struct ext4_sb_info *sbi = EXT4_SB(sb);
5514	struct ext4_super_block *es = sbi->s_es;
5515	struct buffer_head *sbh = sbi->s_sbh;
5516
5517	lock_buffer(sbh);
5518	/*
5519	 * If the file system is mounted read-only, don't update the
5520	 * superblock write time.  This avoids updating the superblock
5521	 * write time when we are mounting the root file system
5522	 * read/only but we need to replay the journal; at that point,
5523	 * for people who are east of GMT and who make their clock
5524	 * tick in localtime for Windows bug-for-bug compatibility,
5525	 * the clock is set in the future, and this will cause e2fsck
5526	 * to complain and force a full file system check.
5527	 */
5528	if (!(sb->s_flags & SB_RDONLY))
5529		ext4_update_tstamp(es, s_wtime);
5530	es->s_kbytes_written =
5531		cpu_to_le64(sbi->s_kbytes_written +
5532		    ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
5533		      sbi->s_sectors_written_start) >> 1));
5534	if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
5535		ext4_free_blocks_count_set(es,
5536			EXT4_C2B(sbi, percpu_counter_sum_positive(
5537				&sbi->s_freeclusters_counter)));
5538	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
5539		es->s_free_inodes_count =
5540			cpu_to_le32(percpu_counter_sum_positive(
5541				&sbi->s_freeinodes_counter));
5542	/* Copy error information to the on-disk superblock */
5543	spin_lock(&sbi->s_error_lock);
5544	if (sbi->s_add_error_count > 0) {
5545		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
5546		if (!es->s_first_error_time && !es->s_first_error_time_hi) {
5547			__ext4_update_tstamp(&es->s_first_error_time,
5548					     &es->s_first_error_time_hi,
5549					     sbi->s_first_error_time);
5550			strncpy(es->s_first_error_func, sbi->s_first_error_func,
5551				sizeof(es->s_first_error_func));
5552			es->s_first_error_line =
5553				cpu_to_le32(sbi->s_first_error_line);
5554			es->s_first_error_ino =
5555				cpu_to_le32(sbi->s_first_error_ino);
5556			es->s_first_error_block =
5557				cpu_to_le64(sbi->s_first_error_block);
5558			es->s_first_error_errcode =
5559				ext4_errno_to_code(sbi->s_first_error_code);
5560		}
5561		__ext4_update_tstamp(&es->s_last_error_time,
5562				     &es->s_last_error_time_hi,
5563				     sbi->s_last_error_time);
5564		strncpy(es->s_last_error_func, sbi->s_last_error_func,
5565			sizeof(es->s_last_error_func));
5566		es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
5567		es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
5568		es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
5569		es->s_last_error_errcode =
5570				ext4_errno_to_code(sbi->s_last_error_code);
5571		/*
5572		 * Start the daily error reporting function if it hasn't been
5573		 * started already
5574		 */
5575		if (!es->s_error_count)
5576			mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
5577		le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
5578		sbi->s_add_error_count = 0;
5579	}
5580	spin_unlock(&sbi->s_error_lock);
5581
5582	ext4_superblock_csum_set(sb);
5583	unlock_buffer(sbh);
5584}
5585
5586static int ext4_commit_super(struct super_block *sb)
5587{
5588	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
5589	int error = 0;
5590
5591	if (!sbh)
5592		return -EINVAL;
5593	if (block_device_ejected(sb))
5594		return -ENODEV;
5595
5596	ext4_update_super(sb);
5597
5598	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
5599		/*
5600		 * Oh, dear.  A previous attempt to write the
5601		 * superblock failed.  This could happen because the
5602		 * USB device was yanked out.  Or it could happen to
5603		 * be a transient write error and maybe the block will
5604		 * be remapped.  Nothing we can do but to retry the
5605		 * write and hope for the best.
5606		 */
5607		ext4_msg(sb, KERN_ERR, "previous I/O error to "
5608		       "superblock detected");
5609		clear_buffer_write_io_error(sbh);
5610		set_buffer_uptodate(sbh);
5611	}
5612	BUFFER_TRACE(sbh, "marking dirty");
5613	mark_buffer_dirty(sbh);
5614	error = __sync_dirty_buffer(sbh,
5615		REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
5616	if (buffer_write_io_error(sbh)) {
5617		ext4_msg(sb, KERN_ERR, "I/O error while writing "
5618		       "superblock");
5619		clear_buffer_write_io_error(sbh);
5620		set_buffer_uptodate(sbh);
5621	}
5622	return error;
5623}
5624
5625/*
5626 * Have we just finished recovery?  If so, and if we are mounting (or
5627 * remounting) the filesystem readonly, then we will end up with a
5628 * consistent fs on disk.  Record that fact.
5629 */
5630static int ext4_mark_recovery_complete(struct super_block *sb,
5631				       struct ext4_super_block *es)
5632{
5633	int err;
5634	journal_t *journal = EXT4_SB(sb)->s_journal;
5635
5636	if (!ext4_has_feature_journal(sb)) {
5637		if (journal != NULL) {
5638			ext4_error(sb, "Journal got removed while the fs was "
5639				   "mounted!");
5640			return -EFSCORRUPTED;
5641		}
5642		return 0;
5643	}
5644	jbd2_journal_lock_updates(journal);
5645	err = jbd2_journal_flush(journal, 0);
5646	if (err < 0)
5647		goto out;
5648
5649	if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
5650		ext4_clear_feature_journal_needs_recovery(sb);
5651		ext4_commit_super(sb);
5652	}
5653out:
5654	jbd2_journal_unlock_updates(journal);
5655	return err;
5656}
5657
5658/*
5659 * If we are mounting (or read-write remounting) a filesystem whose journal
5660 * has recorded an error from a previous lifetime, move that error to the
5661 * main filesystem now.
5662 */
5663static int ext4_clear_journal_err(struct super_block *sb,
5664				   struct ext4_super_block *es)
5665{
5666	journal_t *journal;
5667	int j_errno;
5668	const char *errstr;
5669
5670	if (!ext4_has_feature_journal(sb)) {
5671		ext4_error(sb, "Journal got removed while the fs was mounted!");
5672		return -EFSCORRUPTED;
5673	}
5674
5675	journal = EXT4_SB(sb)->s_journal;
5676
5677	/*
5678	 * Now check for any error status which may have been recorded in the
5679	 * journal by a prior ext4_error() or ext4_abort()
5680	 */
5681
5682	j_errno = jbd2_journal_errno(journal);
5683	if (j_errno) {
5684		char nbuf[16];
5685
5686		errstr = ext4_decode_error(sb, j_errno, nbuf);
5687		ext4_warning(sb, "Filesystem error recorded "
5688			     "from previous mount: %s", errstr);
5689		ext4_warning(sb, "Marking fs in need of filesystem check.");
5690
5691		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
5692		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
5693		ext4_commit_super(sb);
5694
5695		jbd2_journal_clear_err(journal);
5696		jbd2_journal_update_sb_errno(journal);
5697	}
5698	return 0;
5699}
5700
5701/*
5702 * Force the running and committing transactions to commit,
5703 * and wait on the commit.
5704 */
5705int ext4_force_commit(struct super_block *sb)
5706{
5707	journal_t *journal;
5708
5709	if (sb_rdonly(sb))
5710		return 0;
5711
5712	journal = EXT4_SB(sb)->s_journal;
5713	return ext4_journal_force_commit(journal);
5714}
5715
5716static int ext4_sync_fs(struct super_block *sb, int wait)
5717{
5718	int ret = 0;
5719	tid_t target;
5720	bool needs_barrier = false;
5721	struct ext4_sb_info *sbi = EXT4_SB(sb);
5722
5723	if (unlikely(ext4_forced_shutdown(sbi)))
5724		return 0;
5725
5726	trace_ext4_sync_fs(sb, wait);
5727	flush_workqueue(sbi->rsv_conversion_wq);
5728	/*
5729	 * Writeback quota in non-journalled quota case - journalled quota has
5730	 * no dirty dquots
5731	 */
5732	dquot_writeback_dquots(sb, -1);
5733	/*
5734	 * Data writeback is possible w/o journal transaction, so barrier must
5735	 * being sent at the end of the function. But we can skip it if
5736	 * transaction_commit will do it for us.
5737	 */
5738	if (sbi->s_journal) {
5739		target = jbd2_get_latest_transaction(sbi->s_journal);
5740		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
5741		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
5742			needs_barrier = true;
5743
5744		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
5745			if (wait)
5746				ret = jbd2_log_wait_commit(sbi->s_journal,
5747							   target);
5748		}
5749	} else if (wait && test_opt(sb, BARRIER))
5750		needs_barrier = true;
5751	if (needs_barrier) {
5752		int err;
5753		err = blkdev_issue_flush(sb->s_bdev);
5754		if (!ret)
5755			ret = err;
5756	}
5757
5758	return ret;
5759}
5760
5761/*
5762 * LVM calls this function before a (read-only) snapshot is created.  This
5763 * gives us a chance to flush the journal completely and mark the fs clean.
5764 *
5765 * Note that only this function cannot bring a filesystem to be in a clean
5766 * state independently. It relies on upper layer to stop all data & metadata
5767 * modifications.
5768 */
5769static int ext4_freeze(struct super_block *sb)
5770{
5771	int error = 0;
5772	journal_t *journal;
5773
5774	if (sb_rdonly(sb))
5775		return 0;
5776
5777	journal = EXT4_SB(sb)->s_journal;
5778
5779	if (journal) {
5780		/* Now we set up the journal barrier. */
5781		jbd2_journal_lock_updates(journal);
5782
5783		/*
5784		 * Don't clear the needs_recovery flag if we failed to
5785		 * flush the journal.
5786		 */
5787		error = jbd2_journal_flush(journal, 0);
5788		if (error < 0)
5789			goto out;
5790
5791		/* Journal blocked and flushed, clear needs_recovery flag. */
5792		ext4_clear_feature_journal_needs_recovery(sb);
5793	}
5794
5795	error = ext4_commit_super(sb);
5796out:
5797	if (journal)
5798		/* we rely on upper layer to stop further updates */
5799		jbd2_journal_unlock_updates(journal);
5800	return error;
5801}
5802
5803/*
5804 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
5805 * flag here, even though the filesystem is not technically dirty yet.
5806 */
5807static int ext4_unfreeze(struct super_block *sb)
5808{
5809	if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
5810		return 0;
5811
5812	if (EXT4_SB(sb)->s_journal) {
5813		/* Reset the needs_recovery flag before the fs is unlocked. */
5814		ext4_set_feature_journal_needs_recovery(sb);
5815	}
5816
5817	ext4_commit_super(sb);
5818	return 0;
5819}
5820
5821/*
5822 * Structure to save mount options for ext4_remount's benefit
5823 */
5824struct ext4_mount_options {
5825	unsigned long s_mount_opt;
5826	unsigned long s_mount_opt2;
5827	kuid_t s_resuid;
5828	kgid_t s_resgid;
5829	unsigned long s_commit_interval;
5830	u32 s_min_batch_time, s_max_batch_time;
5831#ifdef CONFIG_QUOTA
5832	int s_jquota_fmt;
5833	char *s_qf_names[EXT4_MAXQUOTAS];
5834#endif
5835};
5836
5837static int ext4_remount(struct super_block *sb, int *flags, char *data)
5838{
5839	struct ext4_super_block *es;
5840	struct ext4_sb_info *sbi = EXT4_SB(sb);
5841	unsigned long old_sb_flags, vfs_flags;
5842	struct ext4_mount_options old_opts;
5843	int enable_quota = 0;
5844	ext4_group_t g;
5845	int err = 0;
5846#ifdef CONFIG_QUOTA
5847	int i, j;
5848	char *to_free[EXT4_MAXQUOTAS];
5849#endif
5850	char *orig_data = kstrdup(data, GFP_KERNEL);
5851	struct ext4_parsed_options parsed_opts;
5852
5853	parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
5854	parsed_opts.journal_devnum = 0;
5855
5856	if (data && !orig_data)
5857		return -ENOMEM;
5858
5859	/* Store the original options */
5860	old_sb_flags = sb->s_flags;
5861	old_opts.s_mount_opt = sbi->s_mount_opt;
5862	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
5863	old_opts.s_resuid = sbi->s_resuid;
5864	old_opts.s_resgid = sbi->s_resgid;
5865	old_opts.s_commit_interval = sbi->s_commit_interval;
5866	old_opts.s_min_batch_time = sbi->s_min_batch_time;
5867	old_opts.s_max_batch_time = sbi->s_max_batch_time;
5868#ifdef CONFIG_QUOTA
5869	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
5870	for (i = 0; i < EXT4_MAXQUOTAS; i++)
5871		if (sbi->s_qf_names[i]) {
5872			char *qf_name = get_qf_name(sb, sbi, i);
5873
5874			old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
5875			if (!old_opts.s_qf_names[i]) {
5876				for (j = 0; j < i; j++)
5877					kfree(old_opts.s_qf_names[j]);
5878				kfree(orig_data);
5879				return -ENOMEM;
5880			}
5881		} else
5882			old_opts.s_qf_names[i] = NULL;
5883#endif
5884	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
5885		parsed_opts.journal_ioprio =
5886			sbi->s_journal->j_task->io_context->ioprio;
5887
5888	/*
5889	 * Some options can be enabled by ext4 and/or by VFS mount flag
5890	 * either way we need to make sure it matches in both *flags and
5891	 * s_flags. Copy those selected flags from *flags to s_flags
5892	 */
5893	vfs_flags = SB_LAZYTIME | SB_I_VERSION;
5894	sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
5895
5896	if (!parse_options(data, sb, &parsed_opts, 1)) {
5897		err = -EINVAL;
5898		goto restore_opts;
5899	}
5900
5901	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
5902	    test_opt(sb, JOURNAL_CHECKSUM)) {
5903		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
5904			 "during remount not supported; ignoring");
5905		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
5906	}
5907
5908	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
5909		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
5910			ext4_msg(sb, KERN_ERR, "can't mount with "
5911				 "both data=journal and delalloc");
5912			err = -EINVAL;
5913			goto restore_opts;
5914		}
5915		if (test_opt(sb, DIOREAD_NOLOCK)) {
5916			ext4_msg(sb, KERN_ERR, "can't mount with "
5917				 "both data=journal and dioread_nolock");
5918			err = -EINVAL;
5919			goto restore_opts;
5920		}
5921	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
5922		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
5923			ext4_msg(sb, KERN_ERR, "can't mount with "
5924				"journal_async_commit in data=ordered mode");
5925			err = -EINVAL;
5926			goto restore_opts;
5927		}
5928	}
5929
5930	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
5931		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
5932		err = -EINVAL;
5933		goto restore_opts;
5934	}
5935
5936	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
5937		ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
5938
5939	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
5940		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
5941
5942	es = sbi->s_es;
5943
5944	if (sbi->s_journal) {
5945		ext4_init_journal_params(sb, sbi->s_journal);
5946		set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
5947	}
5948
5949	/* Flush outstanding errors before changing fs state */
5950	flush_work(&sbi->s_error_work);
5951
5952	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
5953		if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
5954			err = -EROFS;
5955			goto restore_opts;
5956		}
5957
5958		if (*flags & SB_RDONLY) {
5959			err = sync_filesystem(sb);
5960			if (err < 0)
5961				goto restore_opts;
5962			err = dquot_suspend(sb, -1);
5963			if (err < 0)
5964				goto restore_opts;
5965
5966			/*
5967			 * First of all, the unconditional stuff we have to do
5968			 * to disable replay of the journal when we next remount
5969			 */
5970			sb->s_flags |= SB_RDONLY;
5971
5972			/*
5973			 * OK, test if we are remounting a valid rw partition
5974			 * readonly, and if so set the rdonly flag and then
5975			 * mark the partition as valid again.
5976			 */
5977			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
5978			    (sbi->s_mount_state & EXT4_VALID_FS))
5979				es->s_state = cpu_to_le16(sbi->s_mount_state);
5980
5981			if (sbi->s_journal) {
5982				/*
5983				 * We let remount-ro finish even if marking fs
5984				 * as clean failed...
5985				 */
5986				ext4_mark_recovery_complete(sb, es);
5987			}
5988		} else {
5989			/* Make sure we can mount this feature set readwrite */
5990			if (ext4_has_feature_readonly(sb) ||
5991			    !ext4_feature_set_ok(sb, 0)) {
5992				err = -EROFS;
5993				goto restore_opts;
5994			}
5995			/*
5996			 * Make sure the group descriptor checksums
5997			 * are sane.  If they aren't, refuse to remount r/w.
5998			 */
5999			for (g = 0; g < sbi->s_groups_count; g++) {
6000				struct ext4_group_desc *gdp =
6001					ext4_get_group_desc(sb, g, NULL);
6002
6003				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
6004					ext4_msg(sb, KERN_ERR,
6005	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
6006		g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
6007					       le16_to_cpu(gdp->bg_checksum));
6008					err = -EFSBADCRC;
6009					goto restore_opts;
6010				}
6011			}
6012
6013			/*
6014			 * If we have an unprocessed orphan list hanging
6015			 * around from a previously readonly bdev mount,
6016			 * require a full umount/remount for now.
6017			 */
6018			if (es->s_last_orphan) {
6019				ext4_msg(sb, KERN_WARNING, "Couldn't "
6020				       "remount RDWR because of unprocessed "
6021				       "orphan inode list.  Please "
6022				       "umount/remount instead");
6023				err = -EINVAL;
6024				goto restore_opts;
6025			}
6026
6027			/*
6028			 * Mounting a RDONLY partition read-write, so reread
6029			 * and store the current valid flag.  (It may have
6030			 * been changed by e2fsck since we originally mounted
6031			 * the partition.)
6032			 */
6033			if (sbi->s_journal) {
6034				err = ext4_clear_journal_err(sb, es);
6035				if (err)
6036					goto restore_opts;
6037			}
6038			sbi->s_mount_state = le16_to_cpu(es->s_state);
6039
6040			err = ext4_setup_super(sb, es, 0);
6041			if (err)
6042				goto restore_opts;
6043
6044			sb->s_flags &= ~SB_RDONLY;
6045			if (ext4_has_feature_mmp(sb))
6046				if (ext4_multi_mount_protect(sb,
6047						le64_to_cpu(es->s_mmp_block))) {
6048					err = -EROFS;
6049					goto restore_opts;
6050				}
6051			enable_quota = 1;
6052		}
6053	}
6054
6055	/*
6056	 * Reinitialize lazy itable initialization thread based on
6057	 * current settings
6058	 */
6059	if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
6060		ext4_unregister_li_request(sb);
6061	else {
6062		ext4_group_t first_not_zeroed;
6063		first_not_zeroed = ext4_has_uninit_itable(sb);
6064		ext4_register_li_request(sb, first_not_zeroed);
6065	}
6066
6067	/*
6068	 * Handle creation of system zone data early because it can fail.
6069	 * Releasing of existing data is done when we are sure remount will
6070	 * succeed.
6071	 */
6072	if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
6073		err = ext4_setup_system_zone(sb);
6074		if (err)
6075			goto restore_opts;
6076	}
6077
6078	if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
6079		err = ext4_commit_super(sb);
6080		if (err)
6081			goto restore_opts;
6082	}
6083
6084#ifdef CONFIG_QUOTA
6085	/* Release old quota file names */
6086	for (i = 0; i < EXT4_MAXQUOTAS; i++)
6087		kfree(old_opts.s_qf_names[i]);
6088	if (enable_quota) {
6089		if (sb_any_quota_suspended(sb))
6090			dquot_resume(sb, -1);
6091		else if (ext4_has_feature_quota(sb)) {
6092			err = ext4_enable_quotas(sb);
6093			if (err)
6094				goto restore_opts;
6095		}
6096	}
6097#endif
6098	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6099		ext4_release_system_zone(sb);
6100
6101	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6102		ext4_stop_mmpd(sbi);
6103
6104	/*
6105	 * Some options can be enabled by ext4 and/or by VFS mount flag
6106	 * either way we need to make sure it matches in both *flags and
6107	 * s_flags. Copy those selected flags from s_flags to *flags
6108	 */
6109	*flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
6110
6111	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.",
6112		 orig_data, ext4_quota_mode(sb));
6113	kfree(orig_data);
6114	return 0;
6115
6116restore_opts:
6117	sb->s_flags = old_sb_flags;
6118	sbi->s_mount_opt = old_opts.s_mount_opt;
6119	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
6120	sbi->s_resuid = old_opts.s_resuid;
6121	sbi->s_resgid = old_opts.s_resgid;
6122	sbi->s_commit_interval = old_opts.s_commit_interval;
6123	sbi->s_min_batch_time = old_opts.s_min_batch_time;
6124	sbi->s_max_batch_time = old_opts.s_max_batch_time;
6125	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6126		ext4_release_system_zone(sb);
6127#ifdef CONFIG_QUOTA
6128	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
6129	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
6130		to_free[i] = get_qf_name(sb, sbi, i);
6131		rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
6132	}
6133	synchronize_rcu();
6134	for (i = 0; i < EXT4_MAXQUOTAS; i++)
6135		kfree(to_free[i]);
6136#endif
6137	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6138		ext4_stop_mmpd(sbi);
6139	kfree(orig_data);
6140	return err;
6141}
6142
6143#ifdef CONFIG_QUOTA
6144static int ext4_statfs_project(struct super_block *sb,
6145			       kprojid_t projid, struct kstatfs *buf)
6146{
6147	struct kqid qid;
6148	struct dquot *dquot;
6149	u64 limit;
6150	u64 curblock;
6151
6152	qid = make_kqid_projid(projid);
6153	dquot = dqget(sb, qid);
6154	if (IS_ERR(dquot))
6155		return PTR_ERR(dquot);
6156	spin_lock(&dquot->dq_dqb_lock);
6157
6158	limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
6159			     dquot->dq_dqb.dqb_bhardlimit);
6160	limit >>= sb->s_blocksize_bits;
6161
6162	if (limit && buf->f_blocks > limit) {
6163		curblock = (dquot->dq_dqb.dqb_curspace +
6164			    dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
6165		buf->f_blocks = limit;
6166		buf->f_bfree = buf->f_bavail =
6167			(buf->f_blocks > curblock) ?
6168			 (buf->f_blocks - curblock) : 0;
6169	}
6170
6171	limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
6172			     dquot->dq_dqb.dqb_ihardlimit);
6173	if (limit && buf->f_files > limit) {
6174		buf->f_files = limit;
6175		buf->f_ffree =
6176			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
6177			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
6178	}
6179
6180	spin_unlock(&dquot->dq_dqb_lock);
6181	dqput(dquot);
6182	return 0;
6183}
6184#endif
6185
6186static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
6187{
6188	struct super_block *sb = dentry->d_sb;
6189	struct ext4_sb_info *sbi = EXT4_SB(sb);
6190	struct ext4_super_block *es = sbi->s_es;
6191	ext4_fsblk_t overhead = 0, resv_blocks;
6192	s64 bfree;
6193	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
6194
6195	if (!test_opt(sb, MINIX_DF))
6196		overhead = sbi->s_overhead;
6197
6198	buf->f_type = EXT4_SUPER_MAGIC;
6199	buf->f_bsize = sb->s_blocksize;
6200	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
6201	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
6202		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
6203	/* prevent underflow in case that few free space is available */
6204	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
6205	buf->f_bavail = buf->f_bfree -
6206			(ext4_r_blocks_count(es) + resv_blocks);
6207	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
6208		buf->f_bavail = 0;
6209	buf->f_files = le32_to_cpu(es->s_inodes_count);
6210	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
6211	buf->f_namelen = EXT4_NAME_LEN;
6212	buf->f_fsid = uuid_to_fsid(es->s_uuid);
6213
6214#ifdef CONFIG_QUOTA
6215	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
6216	    sb_has_quota_limits_enabled(sb, PRJQUOTA))
6217		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
6218#endif
6219	return 0;
6220}
6221
6222
6223#ifdef CONFIG_QUOTA
6224
6225/*
6226 * Helper functions so that transaction is started before we acquire dqio_sem
6227 * to keep correct lock ordering of transaction > dqio_sem
6228 */
6229static inline struct inode *dquot_to_inode(struct dquot *dquot)
6230{
6231	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
6232}
6233
6234static int ext4_write_dquot(struct dquot *dquot)
6235{
6236	int ret, err;
6237	handle_t *handle;
6238	struct inode *inode;
6239
6240	inode = dquot_to_inode(dquot);
6241	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
6242				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
6243	if (IS_ERR(handle))
6244		return PTR_ERR(handle);
6245	ret = dquot_commit(dquot);
6246	err = ext4_journal_stop(handle);
6247	if (!ret)
6248		ret = err;
6249	return ret;
6250}
6251
6252static int ext4_acquire_dquot(struct dquot *dquot)
6253{
6254	int ret, err;
6255	handle_t *handle;
6256
6257	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6258				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
6259	if (IS_ERR(handle))
6260		return PTR_ERR(handle);
6261	ret = dquot_acquire(dquot);
6262	err = ext4_journal_stop(handle);
6263	if (!ret)
6264		ret = err;
6265	return ret;
6266}
6267
6268static int ext4_release_dquot(struct dquot *dquot)
6269{
6270	int ret, err;
6271	handle_t *handle;
6272
6273	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6274				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
6275	if (IS_ERR(handle)) {
6276		/* Release dquot anyway to avoid endless cycle in dqput() */
6277		dquot_release(dquot);
6278		return PTR_ERR(handle);
6279	}
6280	ret = dquot_release(dquot);
6281	err = ext4_journal_stop(handle);
6282	if (!ret)
6283		ret = err;
6284	return ret;
6285}
6286
6287static int ext4_mark_dquot_dirty(struct dquot *dquot)
6288{
6289	struct super_block *sb = dquot->dq_sb;
6290
6291	if (ext4_is_quota_journalled(sb)) {
6292		dquot_mark_dquot_dirty(dquot);
6293		return ext4_write_dquot(dquot);
6294	} else {
6295		return dquot_mark_dquot_dirty(dquot);
6296	}
6297}
6298
6299static int ext4_write_info(struct super_block *sb, int type)
6300{
6301	int ret, err;
6302	handle_t *handle;
6303
6304	/* Data block + inode block */
6305	handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
6306	if (IS_ERR(handle))
6307		return PTR_ERR(handle);
6308	ret = dquot_commit_info(sb, type);
6309	err = ext4_journal_stop(handle);
6310	if (!ret)
6311		ret = err;
6312	return ret;
6313}
6314
6315/*
6316 * Turn on quotas during mount time - we need to find
6317 * the quota file and such...
6318 */
6319static int ext4_quota_on_mount(struct super_block *sb, int type)
6320{
6321	return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
6322					EXT4_SB(sb)->s_jquota_fmt, type);
6323}
6324
/*
 * Set the lockdep subclass of @inode's i_data_sem to @subclass.  Callers in
 * this file pass I_DATA_SEM_QUOTA or I_DATA_SEM_NORMAL.
 */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* The first argument of lockdep_set_subclass has to be
	 * *exactly* the same as the argument to init_rwsem() --- in
	 * this case, in init_once() --- or lockdep gets unhappy
	 * because the name of the lock is set using the
	 * stringification of the argument to init_rwsem().
	 */
	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
	lockdep_set_subclass(&ei->i_data_sem, subclass);
}
6338
6339/*
6340 * Standard function to be called on quota_on
6341 */
6342static int ext4_quota_on(struct super_block *sb, int type, int format_id,
6343			 const struct path *path)
6344{
6345	int err;
6346
6347	if (!test_opt(sb, QUOTA))
6348		return -EINVAL;
6349
6350	/* Quotafile not on the same filesystem? */
6351	if (path->dentry->d_sb != sb)
6352		return -EXDEV;
6353
6354	/* Quota already enabled for this file? */
6355	if (IS_NOQUOTA(d_inode(path->dentry)))
6356		return -EBUSY;
6357
6358	/* Journaling quota? */
6359	if (EXT4_SB(sb)->s_qf_names[type]) {
6360		/* Quotafile not in fs root? */
6361		if (path->dentry->d_parent != sb->s_root)
6362			ext4_msg(sb, KERN_WARNING,
6363				"Quota file not on filesystem root. "
6364				"Journaled quota will not work");
6365		sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
6366	} else {
6367		/*
6368		 * Clear the flag just in case mount options changed since
6369		 * last time.
6370		 */
6371		sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
6372	}
6373
6374	/*
6375	 * When we journal data on quota file, we have to flush journal to see
6376	 * all updates to the file when we bypass pagecache...
6377	 */
6378	if (EXT4_SB(sb)->s_journal &&
6379	    ext4_should_journal_data(d_inode(path->dentry))) {
6380		/*
6381		 * We don't need to lock updates but journal_flush() could
6382		 * otherwise be livelocked...
6383		 */
6384		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
6385		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
6386		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
6387		if (err)
6388			return err;
6389	}
6390
6391	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
6392	err = dquot_quota_on(sb, type, format_id, path);
6393	if (err) {
6394		lockdep_set_quota_inode(path->dentry->d_inode,
6395					     I_DATA_SEM_NORMAL);
6396	} else {
6397		struct inode *inode = d_inode(path->dentry);
6398		handle_t *handle;
6399
6400		/*
6401		 * Set inode flags to prevent userspace from messing with quota
6402		 * files. If this fails, we return success anyway since quotas
6403		 * are already enabled and this is not a hard failure.
6404		 */
6405		inode_lock(inode);
6406		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6407		if (IS_ERR(handle))
6408			goto unlock_inode;
6409		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
6410		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
6411				S_NOATIME | S_IMMUTABLE);
6412		err = ext4_mark_inode_dirty(handle, inode);
6413		ext4_journal_stop(handle);
6414	unlock_inode:
6415		inode_unlock(inode);
6416	}
6417	return err;
6418}
6419
6420static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
6421			     unsigned int flags)
6422{
6423	int err;
6424	struct inode *qf_inode;
6425	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
6426		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
6427		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
6428		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
6429	};
6430
6431	BUG_ON(!ext4_has_feature_quota(sb));
6432
6433	if (!qf_inums[type])
6434		return -EPERM;
6435
6436	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
6437	if (IS_ERR(qf_inode)) {
6438		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
6439		return PTR_ERR(qf_inode);
6440	}
6441
6442	/* Don't account quota for quota files to avoid recursion */
6443	qf_inode->i_flags |= S_NOQUOTA;
6444	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
6445	err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
6446	if (err)
6447		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
6448	iput(qf_inode);
6449
6450	return err;
6451}
6452
6453/* Enable usage tracking for all quota types. */
6454static int ext4_enable_quotas(struct super_block *sb)
6455{
6456	int type, err = 0;
6457	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
6458		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
6459		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
6460		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
6461	};
6462	bool quota_mopt[EXT4_MAXQUOTAS] = {
6463		test_opt(sb, USRQUOTA),
6464		test_opt(sb, GRPQUOTA),
6465		test_opt(sb, PRJQUOTA),
6466	};
6467
6468	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
6469	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
6470		if (qf_inums[type]) {
6471			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
6472				DQUOT_USAGE_ENABLED |
6473				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
6474			if (err) {
6475				ext4_warning(sb,
6476					"Failed to enable quota tracking "
6477					"(type=%d, err=%d). Please run "
6478					"e2fsck to fix.", type, err);
6479				for (type--; type >= 0; type--)
6480					dquot_quota_off(sb, type);
6481
6482				return err;
6483			}
6484		}
6485	}
6486	return 0;
6487}
6488
6489static int ext4_quota_off(struct super_block *sb, int type)
6490{
6491	struct inode *inode = sb_dqopt(sb)->files[type];
6492	handle_t *handle;
6493	int err;
6494
6495	/* Force all delayed allocation blocks to be allocated.
6496	 * Caller already holds s_umount sem */
6497	if (test_opt(sb, DELALLOC))
6498		sync_filesystem(sb);
6499
6500	if (!inode || !igrab(inode))
6501		goto out;
6502
6503	err = dquot_quota_off(sb, type);
6504	if (err || ext4_has_feature_quota(sb))
6505		goto out_put;
6506
6507	inode_lock(inode);
6508	/*
6509	 * Update modification times of quota files when userspace can
6510	 * start looking at them. If we fail, we return success anyway since
6511	 * this is not a hard failure and quotas are already disabled.
6512	 */
6513	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6514	if (IS_ERR(handle)) {
6515		err = PTR_ERR(handle);
6516		goto out_unlock;
6517	}
6518	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
6519	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
6520	inode->i_mtime = inode->i_ctime = current_time(inode);
6521	err = ext4_mark_inode_dirty(handle, inode);
6522	ext4_journal_stop(handle);
6523out_unlock:
6524	inode_unlock(inode);
6525out_put:
6526	lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
6527	iput(inode);
6528	return err;
6529out:
6530	return dquot_quota_off(sb, type);
6531}
6532
6533/* Read data from quotafile - avoid pagecache and such because we cannot afford
6534 * acquiring the locks... As quota files are never truncated and quota code
6535 * itself serializes the operations (and no one else should touch the files)
6536 * we don't have to be afraid of races */
6537static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
6538			       size_t len, loff_t off)
6539{
6540	struct inode *inode = sb_dqopt(sb)->files[type];
6541	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
6542	int offset = off & (sb->s_blocksize - 1);
6543	int tocopy;
6544	size_t toread;
6545	struct buffer_head *bh;
6546	loff_t i_size = i_size_read(inode);
6547
6548	if (off > i_size)
6549		return 0;
6550	if (off+len > i_size)
6551		len = i_size-off;
6552	toread = len;
6553	while (toread > 0) {
6554		tocopy = sb->s_blocksize - offset < toread ?
6555				sb->s_blocksize - offset : toread;
6556		bh = ext4_bread(NULL, inode, blk, 0);
6557		if (IS_ERR(bh))
6558			return PTR_ERR(bh);
6559		if (!bh)	/* A hole? */
6560			memset(data, 0, tocopy);
6561		else
6562			memcpy(data, bh->b_data+offset, tocopy);
6563		brelse(bh);
6564		offset = 0;
6565		toread -= tocopy;
6566		data += tocopy;
6567		blk++;
6568	}
6569	return len;
6570}
6571
6572/* Write to quotafile (we know the transaction is already started and has
6573 * enough credits) */
6574static ssize_t ext4_quota_write(struct super_block *sb, int type,
6575				const char *data, size_t len, loff_t off)
6576{
6577	struct inode *inode = sb_dqopt(sb)->files[type];
6578	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
6579	int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
6580	int retries = 0;
6581	struct buffer_head *bh;
6582	handle_t *handle = journal_current_handle();
6583
6584	if (EXT4_SB(sb)->s_journal && !handle) {
6585		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
6586			" cancelled because transaction is not started",
6587			(unsigned long long)off, (unsigned long long)len);
6588		return -EIO;
6589	}
6590	/*
6591	 * Since we account only one data block in transaction credits,
6592	 * then it is impossible to cross a block boundary.
6593	 */
6594	if (sb->s_blocksize - offset < len) {
6595		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
6596			" cancelled because not block aligned",
6597			(unsigned long long)off, (unsigned long long)len);
6598		return -EIO;
6599	}
6600
6601	do {
6602		bh = ext4_bread(handle, inode, blk,
6603				EXT4_GET_BLOCKS_CREATE |
6604				EXT4_GET_BLOCKS_METADATA_NOFAIL);
6605	} while (PTR_ERR(bh) == -ENOSPC &&
6606		 ext4_should_retry_alloc(inode->i_sb, &retries));
6607	if (IS_ERR(bh))
6608		return PTR_ERR(bh);
6609	if (!bh)
6610		goto out;
6611	BUFFER_TRACE(bh, "get write access");
6612	err = ext4_journal_get_write_access(handle, bh);
6613	if (err) {
6614		brelse(bh);
6615		return err;
6616	}
6617	lock_buffer(bh);
6618	memcpy(bh->b_data+offset, data, len);
6619	flush_dcache_page(bh->b_page);
6620	unlock_buffer(bh);
6621	err = ext4_handle_dirty_metadata(handle, NULL, bh);
6622	brelse(bh);
6623out:
6624	if (inode->i_size < off + len) {
6625		i_size_write(inode, off + len);
6626		EXT4_I(inode)->i_disksize = inode->i_size;
6627		err2 = ext4_mark_inode_dirty(handle, inode);
6628		if (unlikely(err2 && !err))
6629			err = err2;
6630	}
6631	return err ? err : len;
6632}
6633#endif
6634
/*
 * Mount entry point referenced by ext4_fs_type: hands the request to
 * mount_bdev() with ext4_fill_super() as the fill callback.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}
6640
6641#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
6642static inline void register_as_ext2(void)
6643{
6644	int err = register_filesystem(&ext2_fs_type);
6645	if (err)
6646		printk(KERN_WARNING
6647		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
6648}
6649
/* Undo register_as_ext2(). */
static inline void unregister_as_ext2(void)
{
	unregister_filesystem(&ext2_fs_type);
}
6654
/*
 * Report whether @sb's feature set can be handled as ext2: returns 1 if so,
 * 0 otherwise.  Unknown incompat features always fail; unknown ro_compat
 * features only fail for a read-write superblock.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;

	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext2_ro_compat_features(sb);
}
6665#else
/*
 * Stand-ins for builds where the preprocessor condition above does not
 * hold: registration does nothing and the feature check always reports 0.
 */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
6669#endif
6670
6671static inline void register_as_ext3(void)
6672{
6673	int err = register_filesystem(&ext3_fs_type);
6674	if (err)
6675		printk(KERN_WARNING
6676		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
6677}
6678
/* Undo register_as_ext3(). */
static inline void unregister_as_ext3(void)
{
	unregister_filesystem(&ext3_fs_type);
}
6683
/*
 * Report whether @sb's feature set can be handled as ext3: returns 1 if so,
 * 0 otherwise.  Unknown incompat features or a missing journal feature
 * always fail; unknown ro_compat features only fail for a read-write
 * superblock.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb) ||
	    !ext4_has_feature_journal(sb))
		return 0;

	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext3_ro_compat_features(sb);
}
6696
/*
 * Filesystem type descriptor for "ext4".  It is registered in
 * ext4_init_fs() and unregistered in ext4_exit_fs(); mounts go through
 * ext4_mount().
 */
static struct file_system_type ext4_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext4",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
6705
/*
 * Shared across all ext4 file systems.  Each of the EXT4_WQ_HASH_SZ wait
 * queue heads is initialised once in ext4_init_fs().
 */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
6708
/*
 * Module init: set up the ext4-wide state one subsystem at a time, then
 * register the ext3/ext2 types and finally ext4 itself.
 *
 * On failure the out* labels call the exit routines of the steps that
 * already succeeded, in reverse order of initialisation, and the error is
 * returned.
 *
 * NOTE(review): the "out" path taken when register_filesystem() fails does
 * not visibly undo ext4_fc_init_dentry_cache() -- confirm whether a
 * matching teardown is needed.
 */
static int __init ext4_init_fs(void)
{
	int i, err;

	ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
	ext4_li_info = NULL;

	/* Build-time check for flags consistency */
	ext4_check_flag_values();

	for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
		init_waitqueue_head(&ext4__ioend_wq[i]);

	/* Nothing to unwind yet if the first step fails. */
	err = ext4_init_es();
	if (err)
		return err;

	err = ext4_init_pending();
	if (err)
		goto out7;

	err = ext4_init_post_read_processing();
	if (err)
		goto out6;

	err = ext4_init_pageio();
	if (err)
		goto out5;

	err = ext4_init_system_zone();
	if (err)
		goto out4;

	err = ext4_init_sysfs();
	if (err)
		goto out3;

	err = ext4_init_mballoc();
	if (err)
		goto out2;
	err = init_inodecache();
	if (err)
		goto out1;

	err = ext4_fc_init_dentry_cache();
	if (err)
		goto out05;

	/* The ext3/ext2 helpers only log failures, so they cannot fail here. */
	register_as_ext3();
	register_as_ext2();
	err = register_filesystem(&ext4_fs_type);
	if (err)
		goto out;

	return 0;
out:
	unregister_as_ext2();
	unregister_as_ext3();
out05:
	destroy_inodecache();
out1:
	ext4_exit_mballoc();
out2:
	ext4_exit_sysfs();
out3:
	ext4_exit_system_zone();
out4:
	ext4_exit_pageio();
out5:
	ext4_exit_post_read_processing();
out6:
	ext4_exit_pending();
out7:
	ext4_exit_es();

	return err;
}
6786
/*
 * Module exit: destroy the lazyinit thread, unregister the ext2/ext3/ext4
 * filesystem types and call the exit routine of each subsystem that
 * ext4_init_fs() set up.
 *
 * NOTE(review): ext4_exit_es() runs before ext4_exit_pending() here, while
 * the error unwinding in ext4_init_fs() calls them the other way round --
 * confirm the order is immaterial.
 */
static void __exit ext4_exit_fs(void)
{
	ext4_destroy_lazyinit_thread();
	unregister_as_ext2();
	unregister_as_ext3();
	unregister_filesystem(&ext4_fs_type);
	destroy_inodecache();
	ext4_exit_mballoc();
	ext4_exit_sysfs();
	ext4_exit_system_zone();
	ext4_exit_pageio();
	ext4_exit_post_read_processing();
	ext4_exit_es();
	ext4_exit_pending();
}
6802
/* Module metadata, soft dependency and init/exit entry points. */
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)